File size: 4,228 Bytes
bf620c6
f74dd01
 
 
bf620c6
 
 
f74dd01
bf620c6
 
 
f74dd01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf620c6
 
 
 
 
 
f74dd01
bf620c6
f74dd01
bf620c6
 
f74dd01
bf620c6
 
 
 
 
 
 
 
 
 
 
f74dd01
bf620c6
f74dd01
bf620c6
 
 
 
f74dd01
 
bf620c6
 
 
f74dd01
 
bf620c6
f74dd01
 
bf620c6
f74dd01
bf620c6
 
f74dd01
bf620c6
 
f74dd01
 
bf620c6
 
 
 
 
 
 
 
 
 
 
 
 
f74dd01
 
 
 
 
 
 
 
 
 
 
bf620c6
 
 
 
 
f74dd01
 
bf620c6
 
 
f74dd01
 
 
bf620c6
 
f74dd01
 
bf620c6
f74dd01
 
 
 
bf620c6
f74dd01
 
 
 
bf620c6
 
 
f74dd01
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
make_edgelists.py

Create a canonical edgelist (or a directory of edgelists).

Usage
-----
    python make_edgelists.py [--data_root <root>] <dataset_name> <edges_out>

Arguments
---------
dataset_name
    The name of the PyG dataset (e.g. "Cora", "TUDatasetName", etc.).
edges_out
    * If the dataset contains a single graph (e.g. Planetoid Cora) – this is a
      file path (`graph.txt`, `edges.txt`, …).
    * If the dataset contains many graphs (e.g. TUDataset) – this is a
      directory path where each graph is written as
      `graph_000000.txt`, `graph_000001.txt`, …

Examples
--------
# One‑graph dataset (Planetoid Cora)
python make_edgelists.py Cora ./cora_edges.txt

# Many‑graph dataset (TUDataset Facebook)
python make_edgelists.py Facebook ./facebook_edgelists
"""

from __future__ import annotations

import argparse
from pathlib import Path
from typing import Iterable, Tuple, Set

# -------------------------------------------------------------

def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
    """Collapse an edge index into a set of unique undirected edges.

    ``edge_index`` is read through ``edge_index.t().tolist()``, so every item
    is one ``(u, v)`` endpoint pair. Self-loops (``u == v``) are discarded and
    each remaining pair is stored smaller-endpoint-first, which makes
    ``(a, b)`` and ``(b, a)`` collapse to a single entry.
    """
    endpoint_pairs = edge_index.t().tolist()
    return {(min(a, b), max(a, b)) for a, b in endpoint_pairs if a != b}


def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
    """Dump an edgelist to ``out_file`` as sorted ``u v`` lines.

    Missing parent directories of ``out_file`` are created first. The file is
    opened in write mode, so any existing content is replaced. Edges are
    emitted in ascending order, one space-separated pair per line.
    """
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with out_file.open("w") as handle:
        # Sort after opening the file, then stream one formatted line per edge.
        handle.writelines(f"{a} {b}\n" for a, b in sorted(edges))


def process_planetoid_dataset(root: Path, name: str, out_dir: Path | str) -> None:
    """Write the canonical edgelist of a single-graph Planetoid dataset.

    Parameters
    ----------
    root
        Directory handed to ``Planetoid`` as its ``root`` argument.
    name
        Planetoid dataset name (e.g. ``"Cora"``).
    out_dir
        Output location, given as a ``Path`` or a string. If it is an existing
        directory the edgelist is written to ``<out_dir>/graph_000000.txt``;
        otherwise it is treated as the output file path itself.

    Nothing is printed; the edgelist is only written to disk.
    """
    from torch_geometric.datasets import Planetoid

    # Accept plain strings as well: `write_edges` needs a real Path (`.parent`).
    out_path = Path(out_dir)

    ds = Planetoid(root=str(root), name=name)
    data = ds[0]  # only the first graph of the dataset is exported
    edges = canonical_edges(data.edge_index)

    out_file = out_path / "graph_000000.txt" if out_path.is_dir() else out_path
    write_edges(out_file, edges)


def process_tudataset(root: Path, name: str, out_dir: Path):
    """Export every graph of a TUDataset as its own canonical edgelist file.

    The dataset is loaded with ``TUDataset(root=str(root), name=name)``.
    ``out_dir`` is created if needed, and the i-th graph is written to
    ``<out_dir>/graph_XXXXXX.txt`` where ``XXXXXX`` is the zero-padded index.
    """
    from torch_geometric.datasets import TUDataset

    dataset = TUDataset(root=str(root), name=name)
    # Create the directory up front so it exists even if no graph is written.
    out_dir.mkdir(parents=True, exist_ok=True)

    for index, graph in enumerate(dataset):
        undirected = canonical_edges(graph.edge_index)
        write_edges(out_dir / f"graph_{index:06d}.txt", undirected)


def main() -> None:
    """Command-line entry point: parse arguments and write the edgelist(s).

    The dataset family is chosen from the dataset name. The Planetoid names
    (``Cora``, ``CiteSeer``, ``PubMed``, matched case-insensitively) are
    exported as a single graph; every other name is handled as a TUDataset
    and exported as one file per graph.

    Raises
    ------
    ValueError
        If a multi-graph (TUDataset) export is requested but ``edges_out``
        points at an existing regular file instead of a directory.
    """
    parser = argparse.ArgumentParser(
        # `__doc__` is None when docstrings are stripped (python -OO).
        description=(__doc__ or "").strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_root", default="./data", help="Root directory for PyG datasets"
    )
    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
    parser.add_argument(
        "edges_out",
        help=(
            "File path (for single‑graph datasets) or directory "
            "(for multi‑graph datasets) to write the canonical edgelist(s)"
        ),
    )
    args = parser.parse_args()

    root = Path(args.data_root)
    out_path = Path(args.edges_out)

    # Decide the dataset family from its name instead of probing by trial
    # construction. Probing under a blanket `except Exception` hid genuine
    # failures (missing dependency, network errors) and loaded the dataset
    # twice.
    planetoid_names = frozenset({"cora", "citeseer", "pubmed"})
    if args.dataset_name.casefold() in planetoid_names:
        process_planetoid_dataset(root, args.dataset_name, out_path)
        return

    # TUDataset: validate the target before any dataset is loaded.
    if out_path.is_file():
        raise ValueError(
            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
        )
    process_tudataset(root, args.dataset_name, out_path)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()