Upload folder using huggingface_hub
scripts/build_index.py  CHANGED  (+30, -108)
@@ -1,20 +1,7 @@
 #!/usr/bin/env python3
 
-"""
-Build local similarity indexes from a Transformers checkout.
-
-This script reuses core components from `utils/modular_model_detector.py` in a local
-Transformers clone, including the embedding pipeline and sanitization/tokenization.
-
-Outputs are written to the chosen output directory (default: repo root):
-- embeddings*.safetensors
-- code_index_map*.json
-- code_index_tokens*.json
-"""
-
 from __future__ import annotations
 
-import argparse
 import ast
 import importlib.util
 import json
@@ -22,29 +9,26 @@ import os
 from pathlib import Path
 
 import numpy as np
+import torch
 from safetensors.numpy import save_file as safetensors_save
-try:
-    from tqdm import tqdm
-except ImportError:  # pragma: no cover - optional dependency
-    def tqdm(iterable, **_kwargs):
-        return iterable
+
 
 ROOT = Path(__file__).resolve().parent.parent
 
 
-def _load_detector_module(transformers_dir: Path):
+def load_detector(transformers_dir: Path):
     module_path = transformers_dir / "utils" / "modular_model_detector.py"
     if not module_path.exists():
-        raise SystemExit(f"…
+        raise SystemExit(f"Missing modular_model_detector.py at {module_path}")
     spec = importlib.util.spec_from_file_location("modular_model_detector", module_path)
     if spec is None or spec.loader is None:
-        raise SystemExit(f"Could not load …
+        raise SystemExit(f"Could not load detector from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
 
 
-def _extract_segment(source: str, node: ast.AST, lines: list[str]) -> str | None:
+def extract_segment(source: str, node: ast.AST, lines: list[str]) -> str | None:
     segment = ast.get_source_segment(source, node)
     if segment is None and hasattr(node, "lineno") and hasattr(node, "end_lineno"):
         start = max(0, node.lineno - 1)
@@ -53,35 +37,13 @@ def _extract_segment(source: str, node: ast.AST, lines: list[str]) -> str | None
     return segment
 
 
-def _collect_definitions(analyzer, models_root: Path) -> tuple[list[str], list[str], dict[str, list[str]]]:
+def build_method_index(detector, analyzer, models_root: Path, output_dir: Path) -> None:
     identifiers: list[str] = []
     sanitized_sources: list[str] = []
     tokens_map: dict[str, list[str]] = {}
 
     modeling_files = sorted(models_root.rglob("modeling_*.py"))
-
-    for file_path in tqdm(modeling_files, desc="parse definitions", unit="file"):
-        try:
-            definitions_raw, definitions_sanitized, definitions_tokens, _ = analyzer._extract_definitions(
-                file_path, models_root, analyzer._infer_model_from_relative_path(file_path)
-            )
-        except (OSError, SyntaxError):
-            continue
-        for identifier, sanitized in definitions_sanitized.items():
-            identifiers.append(identifier)
-            sanitized_sources.append(sanitized)
-            tokens_map[identifier] = definitions_tokens[identifier]
-    return identifiers, sanitized_sources, tokens_map
-
-
-def _collect_methods(detector, analyzer, models_root: Path) -> tuple[list[str], list[str], dict[str, list[str]]]:
-    identifiers: list[str] = []
-    sanitized_sources: list[str] = []
-    tokens_map: dict[str, list[str]] = {}
-
-    modeling_files = sorted(models_root.rglob("modeling_*.py"))
-    print(f"Parsing {len(modeling_files)} modeling files (method granularity)...")
-    for file_path in tqdm(modeling_files, desc="parse methods", unit="file"):
+    for file_path in modeling_files:
         try:
             source = file_path.read_text(encoding="utf-8")
         except OSError:
@@ -97,7 +59,7 @@ def _collect_methods(detector, analyzer, models_root: Path) -> tuple[list[str],
 
         for node in ast.iter_child_nodes(tree):
             if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                segment = _extract_segment(source, node, lines)
+                segment = extract_segment(source, node, lines)
                 if not segment:
                     continue
                 identifier = f"{relative_path}:{node.name}"
@@ -110,7 +72,7 @@ def _collect_methods(detector, analyzer, models_root: Path) -> tuple[list[str],
             if not isinstance(node, ast.ClassDef):
                 continue
 
-            class_segment = _extract_segment(source, node, lines)
+            class_segment = extract_segment(source, node, lines)
             class_header = class_segment.splitlines()[0].strip() if class_segment else ""
             class_docstring = ast.get_docstring(node)
             class_context = class_header
@@ -121,7 +83,7 @@ def _collect_methods(detector, analyzer, models_root: Path) -> tuple[list[str],
             for child in node.body:
                 if not isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                     continue
-                segment = _extract_segment(source, child, lines)
+                segment = extract_segment(source, child, lines)
                 if not segment:
                     continue
                 identifier = f"{relative_path}:{node.name}.{child.name}"
@@ -130,74 +92,34 @@ def _collect_methods(detector, analyzer, models_root: Path) -> tuple[list[str],
                 identifiers.append(identifier)
                 sanitized_sources.append(sanitized)
                 tokens_map[identifier] = sorted(detector._tokenize(sanitized))
-    return identifiers, sanitized_sources, tokens_map
-
-
-def _write_index(output_dir: Path, granularity: str, identifiers: list[str], embeddings: np.ndarray, tokens: dict) -> None:
-    output_dir.mkdir(parents=True, exist_ok=True)
-    if granularity == "method":
-        emb_name = "embeddings_methods.safetensors"
-        map_name = "code_index_map_methods.json"
-        tok_name = "code_index_tokens_methods.json"
-    else:
-        emb_name = "embeddings.safetensors"
-        map_name = "code_index_map.json"
-        tok_name = "code_index_tokens.json"
-
-    safetensors_save({"embeddings": embeddings.astype("float32")}, output_dir / emb_name)
-    with open(output_dir / map_name, "w", encoding="utf-8") as file:
-        json.dump({int(i): identifiers[i] for i in range(len(identifiers))}, file)
-    with open(output_dir / tok_name, "w", encoding="utf-8") as file:
-        json.dump(tokens, file)
-
 
-def build_index(detector, analyzer, transformers_dir: Path, output_dir: Path, granularity: str) -> None:
-    models_root = transformers_dir / "src" / "transformers" / "models"
-    if not models_root.exists():
-        raise SystemExit(f"Expected models directory at {models_root}")
-
-    if granularity == "method":
-        identifiers, sanitized_sources, tokens_map = _collect_methods(detector, analyzer, models_root)
-    else:
-        identifiers, sanitized_sources, tokens_map = _collect_definitions(analyzer, models_root)
     if not identifiers:
-        raise SystemExit("No modeling …
+        raise SystemExit("No modeling methods found.")
 
-    print(f"Encoding {len(identifiers)} definitions (…
+    print(f"Encoding {len(identifiers)} definitions (method) with {detector.EMBEDDING_MODEL}")
     embeddings = analyzer.encode(sanitized_sources)
-
-    …
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    safetensors_save({"embeddings": embeddings.astype("float32")}, output_dir / "embeddings_methods.safetensors")
+    with open(output_dir / "code_index_map_methods.json", "w", encoding="utf-8") as file:
+        json.dump({int(i): identifiers[i] for i in range(len(identifiers))}, file)
+    with open(output_dir / "code_index_tokens_methods.json", "w", encoding="utf-8") as file:
+        json.dump(tokens_map, file)
 
 
 def main() -> None:
-    …
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=Path,
-        default=ROOT,
-        help="Where to place the generated index files (default: repo root).",
-    )
-    parser.add_argument(
-        "--granularity",
-        choices=["definition", "method", "both"],
-        default="both",
-        help="Which index to build. 'both' runs definition + method sequentially.",
-    )
-    args = parser.parse_args()
-
-    detector = _load_detector_module(args.transformers_dir.resolve())
+    transformers_dir = ROOT / "transformers"
+    if not transformers_dir.exists():
+        transformers_dir = ROOT / "transformers_repo"
+    if not transformers_dir.exists():
+        raise SystemExit("Expected a transformers clone at ./transformers or ./transformers_repo")
+
+    detector = load_detector(transformers_dir)
     hub_dataset = os.getenv("HUB_DATASET", detector.HUB_DATASET_DEFAULT)
     analyzer = detector.CodeSimilarityAnalyzer(hub_dataset=hub_dataset)
-    analyzer.models_root = (
-    …
-    build_index(detector, analyzer, args.transformers_dir.resolve(), args.output_dir.resolve(), target)
+    analyzer.models_root = (transformers_dir / "src" / "transformers" / "models").resolve()
+    analyzer.dtype = torch.float16 if analyzer.device.type == "cuda" else torch.float32
+    build_method_index(detector, analyzer, analyzer.models_root, ROOT)
 
 
 if __name__ == "__main__":
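For reference, a minimal sketch of how the index written by build_method_index could be consumed downstream. It assumes the three output files sit where the new script writes them (the repo root, passed as ROOT) and that the row order of the embeddings tensor matches the integer keys in the map file, which is how the diff above writes them; the query-by-row cosine-similarity lookup is illustrative and not part of this Space's code.

import json

import numpy as np
from safetensors.numpy import load_file

# Artifacts produced by build_method_index (paths assume the repo root).
embeddings = load_file("embeddings_methods.safetensors")["embeddings"]  # shape (N, D), float32
with open("code_index_map_methods.json", encoding="utf-8") as file:
    index_map = json.load(file)  # row index (string key after JSON round-trip) -> method identifier

# Cosine similarity of one indexed method against every other indexed method.
normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
query_row = 0  # any row index present in the map
scores = normed @ normed[query_row]
top5 = np.argsort(-scores)[1:6]  # skip the query itself

print("query:", index_map[str(query_row)])
for row in top5:
    print(f"{scores[row]:.3f}  {index_map[str(row)]}")

The tokens file (code_index_tokens_methods.json) can be loaded the same way if a token-overlap prefilter is wanted before the embedding comparison.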