Source code for graphnetz.datasets

"""Dataset registry, organized by research category and task.

Each category exposes thin loader functions that return a ``torch_geometric``
dataset (PyG built-in or :class:`~graphnetz.datasets.Netz`).  The
:data:`LOADER_REGISTRY` table maps every loader to its category and the task it can serve, mirroring the structure used by
:data:`graphnetz.benchmark.BENCHMARK_TASKS` (``[category][task_type] -> [...]``).

Categories
----------
- ``combinatorial``: synthetic TSP, VRP, max-cut, max-flow, matching, coloring.
- ``biology``: MUTAG, PROTEINS, ENZYMES, PPI, Peptides (LRGB), C. elegans,
  connectomes, contact networks.
- ``social``: Cora, CiteSeer, PubMed, WikiCS, Heterophilous (Roman-empire,
  Amazon-ratings, Minesweeper, Tolokers, Questions), Karate, ego networks.
- ``knowledge``: FB15k-237, WordNet18-RR, Netz WordNet.
- ``infrastructure``: power grids, road and air networks.
- ``finance``: product space, board interlocks, patents, Elliptic Bitcoin.
- ``computing``: AS topology, Skitter, BGP route views.
- ``vision``: MNIST/CIFAR10 superpixel graphs, ModelNet, ShapeNet.
- ``physics``: QM9, ZINC, Ising lattice.
- ``security``: terrorist association networks, MalNet-Tiny.

Tasks
----------
``node_cls``, ``graph_cls``, ``graph_reg``, ``link_pred``. A loader may
serve more than one task type (e.g. ``cora`` is used for both ``node_cls`` and
``link_pred``). Deep Graph Infomax is *not* a task: it is a
self-supervised training objective whose metric is its own loss, so the
benchmark routes unlabelled graphs through ``link_pred`` (a real held-out
edge split with an AUC metric) instead. ``train_dgi`` and the
``DGIWrapper`` adapter remain available as utilities for users who want
unsupervised pre-training on top of any encoder.
"""

import importlib.util

from graphnetz.datasets import (
    biology,
    combinatorial,
    computing,
    finance,
    infrastructure,
    knowledge,
    physics,
    security,
    social,
    vision,
)
from graphnetz.datasets._netz import Netz, download_all_networks_netz

# OGB loaders live directly inside the domain modules (``social.ogbn_arxiv``,
# ``biology.ogbg_molhiv``, etc.). They raise a clear ``ImportError`` at
# call time when the optional ``ogb`` package is missing; the registries
# below only reference them when ``ogb`` is importable so the curated
# benchmark stays runnable without the extra installed.
_HAS_OGB = importlib.util.find_spec("ogb") is not None

CATEGORIES = {
    "combinatorial": combinatorial,
    "biology": biology,
    "social": social,
    "knowledge": knowledge,
    "infrastructure": infrastructure,
    "finance": finance,
    "computing": computing,
    "vision": vision,
    "physics": physics,
    "security": security,
}


# Source-of-truth taxonomy: category -> task -> [(loader_name, callable)].
# The benchmark dispatcher curates a subset of these; users can also load any
# loader directly via the category module.
LOADER_REGISTRY: dict[str, dict[str, list[tuple[str, object]]]] = {
    "combinatorial": {
        # Synthetic combinatorial generators ship no graph-level ``y``,
        # so they cannot serve graph_cls / graph_reg without a labelling
        # convention. They enter the benchmark exclusively through
        # link_pred (a real held-out edge split with an AUC metric).
        "link_pred": [
            ("random_bipartite_matching", combinatorial.random_bipartite_matching),
            ("random_tsp", combinatorial.random_tsp),
            ("random_vrp", combinatorial.random_vrp),
            ("random_coloring", combinatorial.random_coloring),
            ("random_maxcut", combinatorial.random_maxcut),
            ("random_maxflow", combinatorial.random_maxflow),
        ],
    },
    "biology": {
        "graph_cls": [
            ("mutag", biology.mutag),
            ("proteins", biology.proteins),
            ("enzymes", biology.enzymes),
            ("peptides_func", biology.peptides_func),
        ],
        "graph_reg": [("peptides_struct", biology.peptides_struct)],
        # PPI is a multi-graph inductive dataset and does not fit the
        # single-``Data`` + ``train_mask`` shape that
        # ``train_node_classification`` expects, so it enters the
        # benchmark through ``link_pred`` -- ``RandomLinkSplit`` on the
        # first graph yields a real held-out-edge AUC, matching the
        # protocol the framework uses for the other unlabelled biology
        # graphs (celegans, connectome, contact networks).
        "link_pred": [
            ("celegans", biology.celegans),
            ("budapest_connectome", biology.budapest_connectome),
            ("hospital_contacts", biology.hospital_contacts),
            ("high_school_contacts", biology.high_school_contacts),
            ("ppi", biology.ppi),
        ],
    },
    "social": {
        "node_cls": [
            ("cora", social.cora),
            ("citeseer", social.citeseer),
            ("pubmed", social.pubmed),
            ("wikics", social.wikics),
            ("roman_empire", social.roman_empire),
            ("amazon_ratings", social.amazon_ratings),
            ("minesweeper", social.minesweeper),
            ("tolokers", social.tolokers),
            ("questions", social.questions),
        ],
        "link_pred": [
            ("cora", social.cora),
            ("citeseer", social.citeseer),
            ("pubmed", social.pubmed),
            ("movielens100k", social.movielens100k),
            ("karate", social.karate),
            ("facebook_friends", social.facebook_friends),
            ("dblp_coauthor", social.dblp_coauthor),
            ("dnc_emails", social.dnc_emails),
        ],
    },
    "knowledge": {
        "link_pred": [
            ("fb15k_237", knowledge.fb15k_237),
            ("wordnet18rr", knowledge.wordnet18rr),
            ("wordnet_netz", knowledge.wordnet_netz),
        ],
    },
    "infrastructure": {
        "link_pred": [
            ("power_grid", infrastructure.power_grid),
            ("euroroad", infrastructure.euroroad),
            ("us_roads", infrastructure.us_roads),
            ("eu_airlines", infrastructure.eu_airlines),
            ("london_transport", infrastructure.london_transport),
            ("urban_streets", infrastructure.urban_streets),
        ],
    },
    "finance": {
        "node_cls": [("elliptic_bitcoin", finance.elliptic_bitcoin)],
        "link_pred": [
            ("product_space", finance.product_space),
            ("board_directors", finance.board_directors),
            ("us_patents", finance.us_patents),
        ],
    },
    "computing": {
        "link_pred": [
            ("internet_as", computing.internet_as),
            ("topology", computing.topology),
            ("as_skitter", computing.as_skitter),
            ("route_views", computing.route_views),
        ],
    },
    "vision": {
        "graph_cls": [
            ("mnist_superpixels", vision.mnist_superpixels),
            ("cifar10_superpixels", vision.cifar10_superpixels),
            ("modelnet10", vision.modelnet10),
            ("modelnet40", vision.modelnet40),
        ],
        "node_cls": [("shapenet", vision.shapenet)],
    },
    "physics": {
        "graph_reg": [
            ("qm9", physics.qm9),
            ("zinc", physics.zinc),
        ],
        "link_pred": [("ising_lattice", physics.ising_lattice)],
    },
    "security": {
        "graph_cls": [("malnet_tiny", security.malnet_tiny)],
        "link_pred": [
            ("terrorists_911", security.terrorists_911),
            ("train_terrorists", security.train_terrorists),
        ],
    },
}

if _HAS_OGB:
    # OGB loaders live in the domain modules; we only fold them into the
    # registry when the ``ogb`` extra is importable so default benchmark
    # runs stay green without it. Mapping rationale:
    #   - ogbn_arxiv     -> social   (citation network)
    #   - ogbn_products  -> finance  (e-commerce co-purchase, alongside
    #                                  product_space)
    #   - ogbg_molhiv    -> biology  (molecular bioactivity)
    #   - ogbg_molpcba   -> biology
    #   - ogbl_collab    -> social   (collaboration / coauthorship)
    LOADER_REGISTRY["social"]["node_cls"].append(("ogbn_arxiv", social.ogbn_arxiv))
    LOADER_REGISTRY["social"]["link_pred"].append(("ogbl_collab", social.ogbl_collab))
    LOADER_REGISTRY["finance"]["node_cls"].append(("ogbn_products", finance.ogbn_products))
    LOADER_REGISTRY["biology"]["graph_cls"].append(("ogbg_molhiv", biology.ogbg_molhiv))
    LOADER_REGISTRY["biology"]["graph_cls"].append(("ogbg_molpcba", biology.ogbg_molpcba))



[docs]
def list_datasets(
    category: str | None = None,
    task: str | None = None,
) -> dict[str, dict[str, list[str]]]:
    """Return loader names organized by category and task.

    Output shape: ``{category: {task_type: [loader_name, ...]}}``. Pass
    ``category`` and/or ``task`` to restrict the view.
    """
    cats = [category] if category is not None else list(LOADER_REGISTRY)
    out: dict[str, dict[str, list[str]]] = {}
    for c in cats:
        per_cat = LOADER_REGISTRY.get(c, {})
        tasks = [task] if task is not None else list(per_cat)
        out[c] = {k: [name for name, _ in per_cat.get(k, [])] for k in tasks if k in per_cat}
    return out



__all__ = [
    "CATEGORIES",
    "LOADER_REGISTRY",
    "Netz",
    "biology",
    "combinatorial",
    "computing",
    "download_all_networks_netz",
    "finance",
    "infrastructure",
    "knowledge",
    "list_datasets",
    "physics",
    "security",
    "social",
    "vision",
]