"""
make_edgelists.py

Create a canonical edgelist (or a directory of edgelists).

Usage
-----
    python make_edgelists.py [--data_root <root>] <dataset_name> <edges_out>

Arguments
---------
dataset_name
    The name of the PyG dataset (e.g. "Cora", or the name of a TUDataset).
edges_out
    * If the dataset contains a single graph (e.g. Planetoid Cora) – this is a
      file path (`graph.txt`, `edges.txt`, …).
    * If the dataset contains many graphs (e.g. TUDataset) – this is a
      directory path where each graph is written as
      `graph_000000.txt`, `graph_000001.txt`, …

Examples
--------
    # One‑graph dataset (Planetoid Cora)
    python make_edgelists.py Cora ./cora_edges.txt

    # Many‑graph dataset (TUDataset Facebook)
    python make_edgelists.py Facebook ./facebook_edgelists
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Iterable, Tuple, Set
# -------------------------------------------------------------
def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
    """Collect the undirected edges of a graph in canonical form.

    `edge_index` is transposed and converted to nested lists, so after
    `.t().tolist()` it must yield one `[u, v]` pair per edge. Callers in
    this file pass the `edge_index` attribute of a PyG data object
    (presumably a tensor of shape (2, num_edges) -- confirm).

    Self-loops (u == v) are dropped. Every remaining pair is stored with the
    smaller endpoint first, so (u, v) and (v, u) collapse into one entry.
    """
    pairs = edge_index.t().tolist()
    return {(min(a, b), max(a, b)) for a, b in pairs if a != b}
def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
    """Dump `edges` to `out_file` in ascending order, one `u v` pair per line.

    Missing parent directories are created first. The file is opened in
    write mode, so any existing content is replaced.
    """
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with out_file.open("w") as handle:
        # Sorting happens after the file is opened; lines are streamed out.
        handle.writelines(f"{a} {b}\n" for a, b in sorted(edges))
def process_planetoid_dataset(root: Path, name: str, out_dir: str | Path) -> None:
    """Write the canonical edgelist of a single-graph Planetoid dataset.

    Parameters
    ----------
    root
        Directory handed to PyG as the dataset root.
    name
        Planetoid dataset name (e.g. "Cora").
    out_dir
        Destination path, given as `str` or `Path`. If it is an existing
        directory the edgelist goes to `<out_dir>/graph_000000.txt`;
        otherwise the path itself is used as the output file.

    Nothing is printed to stdout; the edgelist is only written to disk.
    """
    # Imported lazily so the module can be imported without torch_geometric.
    from torch_geometric.datasets import Planetoid

    ds = Planetoid(root=str(root), name=name)
    edges = canonical_edges(ds[0].edge_index)  # index 0 is the only graph

    # Normalise once so plain strings work too; `write_edges` needs a Path.
    out_path = Path(out_dir)
    if out_path.is_dir():
        out_path = out_path / "graph_000000.txt"
    write_edges(out_path, edges)
def process_tudataset(root: Path, name: str, out_dir: Path):
    """Export every graph of a TUDataset as its own canonical edgelist.

    The dataset is loaded from `root`, `out_dir` is created if needed, and
    the i-th graph is written to `<out_dir>/graph_<i, zero-padded to 6>.txt`.
    """
    # Imported lazily so the module can be imported without torch_geometric.
    from torch_geometric.datasets import TUDataset

    dataset = TUDataset(root=str(root), name=name)
    out_dir.mkdir(parents=True, exist_ok=True)
    for index, graph in enumerate(dataset):
        target = out_dir / f"graph_{index:06d}.txt"
        write_edges(target, canonical_edges(graph.edge_index))
# Dataset names documented for PyG's `Planetoid` class, lower-cased for matching.
_PLANETOID_NAMES = frozenset({"cora", "citeseer", "pubmed"})


def main(argv: list[str] | None = None) -> None:
    """Parse the command line and write the requested edgelist(s).

    Parameters
    ----------
    argv
        Argument list to parse. `None` (the default) makes argparse read
        `sys.argv[1:]`, which is the behaviour of a plain `main()` call.

    Raises
    ------
    ValueError
        If the dataset is not a Planetoid dataset (and is therefore treated
        as a multi-graph TUDataset) but `edges_out` is an existing file.
    """
    parser = argparse.ArgumentParser(
        # `__doc__` is None when docstrings are stripped (python -OO).
        description=(__doc__ or "").strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_root", default="./data", help="Root directory for PyG datasets"
    )
    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
    parser.add_argument(
        "edges_out",
        help=(
            "File path (for single‑graph datasets) or directory "
            "(for multi‑graph datasets) to write the canonical edgelist(s)"
        ),
    )
    args = parser.parse_args(argv)

    root = Path(args.data_root)
    out_path = Path(args.edges_out)

    # Dispatch on the dataset name instead of attempting a Planetoid load and
    # catching every exception: that approach hid real Planetoid failures
    # behind a TUDataset error and loaded each dataset twice.
    if args.dataset_name.lower() in _PLANETOID_NAMES:
        process_planetoid_dataset(root, args.dataset_name, out_path)
        return

    # Everything else is treated as a TUDataset. Validate the output location
    # before any dataset is downloaded or loaded.
    if out_path.is_file():
        raise ValueError(
            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
        )
    process_tudataset(root, args.dataset_name, out_path)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()