Spaces:
Sleeping
Sleeping
Add FastAPI app + static UI
Browse files- Dockerfile +18 -0
- app/__pycache__/detector.cpython-312.pyc +0 -0
- app/__pycache__/graph.cpython-312.pyc +0 -0
- app/__pycache__/main.cpython-312.pyc +0 -0
- app/detector.py +538 -0
- app/graph.py +90 -0
- app/main.py +70 -0
- requirements.txt +7 -0
- static/app.js +215 -0
- static/index.html +81 -0
- static/styles.css +241 -0
Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runtime image for the FastAPI app + static UI, served by uvicorn.
FROM python:3.11-slim

WORKDIR /app

# No .pyc files, unbuffered stdout/stderr for container logs; the HF /
# transformers variables silence progress bars and non-error logging.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    HF_HUB_DISABLE_PROGRESS_BARS=1 \
    TRANSFORMERS_VERBOSITY=error

# Install dependencies before copying sources so Docker layer caching
# survives code-only changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY app /app/app
COPY static /app/static

# 7860 is the conventional Hugging Face Spaces port.
EXPOSE 7860

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
app/__pycache__/detector.cpython-312.pyc
ADDED
|
Binary file (24.1 kB). View file
|
|
|
app/__pycache__/graph.cpython-312.pyc
ADDED
|
Binary file (5.11 kB). View file
|
|
|
app/__pycache__/main.cpython-312.pyc
ADDED
|
Binary file (3.77 kB). View file
|
|
|
app/detector.py
ADDED
|
@@ -0,0 +1,538 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
import json
|
| 3 |
+
import math
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from functools import cache
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
from huggingface_hub import snapshot_download
|
| 12 |
+
from safetensors.numpy import load_file as safetensors_load
|
| 13 |
+
import torch
|
| 14 |
+
from transformers import AutoModel, AutoTokenizer
|
| 15 |
+
|
| 16 |
+
import transformers
|
| 17 |
+
|
| 18 |
+
# Root of the installed transformers package's models/ directory; index
# identifiers store file paths relative to this root.
MODELS_ROOT = Path(transformers.__file__).resolve().parent / "models"

# Model used to embed sanitized code snippets.
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-4B"
# Number of snippets encoded per forward pass.
BATCH_SIZE = 16
# Tokenizer truncation length per snippet.
MAX_LENGTH = 4096
# Weight of the embedding score vs. the IDF-weighted Jaccard score in the
# hybrid ranking (see CodeSimilarityAnalyzer._combine_hybrid).
HYBRID_ALPHA = 0.7
# Dataset repo holding the precomputed index; override with the HUB_DATASET
# environment variable (see get_default_hub_dataset).
HUB_DATASET_DEFAULT = "hf-internal-testing/transformers_code_embeddings"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
class Match:
    """A single similarity hit against the indexed transformers model code.

    NOTE(review): not constructed anywhere in this module's visible code;
    the fields mirror the match dicts built in analyze_code — confirm
    whether this type is still used before relying on it.
    """

    # Index key of the form "relative_path:match_name".
    identifier: str
    # Matched file path relative to MODELS_ROOT.
    relative_path: str
    # Name of the matched definition (function, class, or "Class.method").
    match_name: str
    # Similarity score for the match.
    score: float
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _sanitize_for_embedding(code: str, model_hint: str | None, symbol_hint: str | None) -> str:
|
| 36 |
+
base = "\n".join(
|
| 37 |
+
line
|
| 38 |
+
for line in re.sub(r"#.*", "", re.sub(r'(\"\"\"|\'\'\')(?:.|\n)*?\1', "", code)).splitlines()
|
| 39 |
+
if not re.match(r"\s*(from|import)\s+", line)
|
| 40 |
+
)
|
| 41 |
+
variants = set()
|
| 42 |
+
if model_hint:
|
| 43 |
+
variants.add(model_hint)
|
| 44 |
+
variants.add(model_hint.replace("_", ""))
|
| 45 |
+
variants.add(re.sub(r"\d+", "", model_hint))
|
| 46 |
+
if symbol_hint:
|
| 47 |
+
match = re.match(r"^([A-Z][a-z0-9]+)", symbol_hint) or re.match(r"^([A-Za-z0-9]+)", symbol_hint)
|
| 48 |
+
prefix = match.group(1) if match else ""
|
| 49 |
+
if prefix:
|
| 50 |
+
variants.add(prefix)
|
| 51 |
+
variants.add(prefix.replace("_", ""))
|
| 52 |
+
variants.add(re.sub(r"\d+", "", prefix))
|
| 53 |
+
variants |= {variant.lower() for variant in list(variants)}
|
| 54 |
+
sanitized = base
|
| 55 |
+
for variant in sorted({x for x in variants if len(x) >= 3}, key=len, reverse=True):
|
| 56 |
+
sanitized = re.sub(re.escape(variant), "Model", sanitized, flags=re.IGNORECASE)
|
| 57 |
+
return sanitized
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _compute_idf(tokens_map: dict[str, list[str]]) -> tuple[dict[str, float], float]:
|
| 61 |
+
doc_count = len(tokens_map)
|
| 62 |
+
if doc_count == 0:
|
| 63 |
+
return {}, 1.0
|
| 64 |
+
df: dict[str, int] = {}
|
| 65 |
+
for tokens in tokens_map.values():
|
| 66 |
+
for token in set(tokens):
|
| 67 |
+
df[token] = df.get(token, 0) + 1
|
| 68 |
+
idf = {token: math.log((doc_count + 1) / (count + 1)) + 1.0 for token, count in df.items()}
|
| 69 |
+
default_idf = math.log((doc_count + 1) / 1) + 1.0
|
| 70 |
+
return idf, default_idf
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _weighted_jaccard(
|
| 74 |
+
query_tokens: set[str], candidate_tokens: set[str], idf_map: dict[str, float], default_idf: float
|
| 75 |
+
) -> float:
|
| 76 |
+
if not query_tokens or not candidate_tokens:
|
| 77 |
+
return 0.0
|
| 78 |
+
intersection = query_tokens & candidate_tokens
|
| 79 |
+
if not intersection:
|
| 80 |
+
return 0.0
|
| 81 |
+
union = query_tokens | candidate_tokens
|
| 82 |
+
union_weight = sum(idf_map.get(token, default_idf) for token in union)
|
| 83 |
+
if union_weight <= 0:
|
| 84 |
+
return 0.0
|
| 85 |
+
intersection_weight = sum(idf_map.get(token, default_idf) for token in intersection)
|
| 86 |
+
return intersection_weight / union_weight
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@cache
def _load_definition_line_map(relative_path: str) -> dict[str, int]:
    """Map top-level definition names in a models file to 1-based line numbers.

    Keys are function/class names plus "Class.method" for direct class members.
    Returns an empty mapping when the file is unreadable or fails to parse.
    Memoized per relative path for the process lifetime.
    """
    try:
        source = (MODELS_ROOT / relative_path).read_text(encoding="utf-8")
        tree = ast.parse(source)
    except (OSError, SyntaxError):
        # FileNotFoundError is a subclass of OSError, so both read failures
        # and parse failures degrade to "no line info".
        return {}

    definition_lines: dict[str, int] = {}
    for top_node in ast.iter_child_nodes(tree):
        if not isinstance(top_node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            continue
        definition_lines[top_node.name] = getattr(top_node, "lineno", None) or 1
        if not isinstance(top_node, ast.ClassDef):
            continue
        for member in top_node.body:
            if isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef)):
                definition_lines[f"{top_node.name}.{member.name}"] = getattr(member, "lineno", None) or 1
    return definition_lines
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _resolve_definition_location(relative_path: str, definition: str) -> tuple[str, int | None]:
    """Resolve a definition inside a models file to (absolute path, line or None)."""
    absolute = (MODELS_ROOT / relative_path).resolve()
    line_number = _load_definition_line_map(relative_path).get(definition)
    return str(absolute), line_number
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class CodeSimilarityAnalyzer:
    """Finds transformers model code similar to user-supplied Python snippets.

    Sanitized definitions are embedded with EMBEDDING_MODEL and ranked against
    a precomputed index — located locally (env/cwd/repo) or downloaded from
    the Hub — using a hybrid of embedding similarity and IDF-weighted Jaccard
    token overlap.
    """

    def __init__(self, hub_dataset: str, precision: str = "float32", granularity: str = "method"):
        """Load the embedding model and record index settings (no index I/O yet)."""
        self.hub_dataset = hub_dataset
        self.precision = precision
        # What the caller asked for; index_granularity may be downgraded to
        # "definition" by ensure_local_index when method-level files are missing.
        self.requested_granularity = granularity
        self.index_granularity = granularity
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dtype = torch.float16 if self.device.type == "cuda" else torch.float32
        self.tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(
            EMBEDDING_MODEL, trust_remote_code=True, torch_dtype=self.dtype, device_map=None
        ).to(self.device)
        self.model.eval()
        # Resolved lazily by ensure_local_index() / _load_index().
        self.index_dir: Path | None = None
        self.index_origin: str | None = None
        self.missing_files: tuple[str, ...] = ()
        self._index_cache: dict[str, object] | None = None

    def _embedding_filename(self, granularity: str | None = None) -> str:
        """Index embeddings filename for the given (or current) granularity/precision."""
        granularity = granularity or self.index_granularity
        suffix = ""
        if granularity == "method":
            suffix += "_methods"
        if self.precision == "int8":
            suffix += "_int8"
        if not suffix:
            return "embeddings.safetensors"
        return f"embeddings{suffix}.safetensors"

    def _index_map_filename(self, granularity: str | None = None) -> str:
        """Filename of the row-index -> identifier JSON map."""
        granularity = granularity or self.index_granularity
        if granularity == "method":
            return "code_index_map_methods.json"
        return "code_index_map.json"

    def _tokens_filename(self, granularity: str | None = None) -> str:
        """Filename of the identifier -> token-list JSON map."""
        granularity = granularity or self.index_granularity
        if granularity == "method":
            return "code_index_tokens_methods.json"
        return "code_index_tokens.json"

    def _resolve_index_path(self, filename: str) -> Path:
        """Join *filename* onto the resolved index directory (cwd-relative if unset)."""
        if self.index_dir is None:
            return Path(filename)
        return self.index_dir / filename

    def _required_index_files(self, granularity: str | None = None) -> tuple[str, ...]:
        """The three files an index directory must contain to be usable."""
        return (
            self._embedding_filename(granularity),
            self._index_map_filename(granularity),
            self._tokens_filename(granularity),
        )

    def ensure_local_index(self) -> None:
        """Locate a usable index directory, downloading from the Hub if needed.

        Search order: already-resolved dir, then INDEX_DIR env var, cwd, and
        the repo root, each at the requested granularity; then the Hub
        snapshot; then Hub/local fallbacks at "definition" granularity when
        "method" was requested. Raises FileNotFoundError when nothing usable
        is found.
        """
        required_files = self._required_index_files(self.requested_granularity)
        if self.index_dir is not None and all((self.index_dir / fname).exists() for fname in required_files):
            return

        # Local helper; deliberately distinct from the self.missing_files
        # attribute despite sharing its name.
        def missing_files(directory: Path, granularity: str) -> list[str]:
            return [fname for fname in self._required_index_files(granularity) if not (directory / fname).exists()]

        candidates: list[tuple[str, Path]] = []
        env_dir = os.getenv("INDEX_DIR")
        if env_dir:
            candidates.append(("env", Path(env_dir)))
        candidates.append(("cwd", Path.cwd()))
        candidates.append(("repo", Path(__file__).resolve().parent.parent))

        # First pass: any local candidate at the requested granularity.
        missing_preferred: list[str] = []
        for origin, candidate in candidates:
            missing_preferred = missing_files(candidate, self.requested_granularity)
            if not missing_preferred:
                self.index_dir = candidate
                self.index_origin = origin
                self.index_granularity = self.requested_granularity
                self.missing_files = ()
                self._index_cache = None
                return

        # Remember a local definition-granularity fallback, used only if the
        # Hub cannot serve either granularity.
        fallback_dir: Path | None = None
        fallback_origin: str | None = None
        fallback_missing: list[str] = []
        if self.requested_granularity == "method":
            for origin, candidate in candidates:
                fallback_missing = missing_files(candidate, "definition")
                if not fallback_missing:
                    fallback_dir = candidate
                    fallback_origin = origin
                    break

        snapshot_dir = Path(snapshot_download(repo_id=self.hub_dataset, repo_type="dataset"))
        hub_missing_preferred = missing_files(snapshot_dir, self.requested_granularity)
        hub_missing_fallback: list[str] = []
        if self.requested_granularity == "method":
            hub_missing_fallback = missing_files(snapshot_dir, "definition")

        if not hub_missing_preferred:
            self.index_dir = snapshot_dir
            self.index_origin = "hub"
            self.index_granularity = self.requested_granularity
            self.missing_files = ()
            self._index_cache = None
            return

        # NOTE(review): a Hub definition-granularity index is preferred over a
        # complete LOCAL definition-granularity fallback — confirm that order
        # is intentional.
        if self.requested_granularity == "method" and not hub_missing_fallback:
            self.index_dir = snapshot_dir
            self.index_origin = "hub"
            self.index_granularity = "definition"
            self.missing_files = tuple(hub_missing_preferred)
            self._index_cache = None
            return

        if fallback_dir is not None:
            self.index_dir = fallback_dir
            self.index_origin = fallback_origin
            self.index_granularity = "definition"
            self.missing_files = tuple(missing_preferred)
            self._index_cache = None
            return

        missing_detail = ", ".join(hub_missing_preferred or missing_preferred)
        raise FileNotFoundError(
            "Missing expected files for requested granularity; unable to fall back to definition index. "
            f"Missing: {missing_detail}"
        )

    def _load_index(self) -> dict[str, object]:
        """Load (and memoize) the index: embeddings, id map, tokens, and IDF stats."""
        if self._index_cache is not None:
            return self._index_cache
        self.ensure_local_index()
        embedding_path = self._resolve_index_path(self._embedding_filename())
        base = safetensors_load(str(embedding_path))
        base_embeddings = base["embeddings"]
        # int8 indexes store a per-dimension dequantization scale.
        scales = base.get("scale") if self.precision == "int8" else None
        with open(self._resolve_index_path(self._index_map_filename()), "r", encoding="utf-8") as file:
            # JSON keys are strings; the map is keyed by integer row index.
            identifier_map = {int(key): value for key, value in json.load(file).items()}
        with open(self._resolve_index_path(self._tokens_filename()), "r", encoding="utf-8") as file:
            tokens_map = json.load(file)
        idf_map, default_idf = _compute_idf(tokens_map)
        self._index_cache = {
            "embeddings": base_embeddings,
            "scales": scales,
            "identifier_map": identifier_map,
            "tokens_map": tokens_map,
            "idf_map": idf_map,
            "default_idf": default_idf,
        }
        return self._index_cache

    def _encode_batch(self, texts: list[str]) -> np.ndarray:
        """Embed one batch: mean-pool over the attention mask, then L2-normalize."""
        encoded = self.tokenizer(texts, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
        encoded = {key: value.to(self.device) for key, value in encoded.items()}
        # NOTE(review): on CUDA this enters autocast but NOT no_grad, so the
        # forward pass tracks gradients — consider wrapping in torch.no_grad()
        # as well; confirm and fix.
        with (
            torch.autocast(device_type=self.device.type, dtype=self.dtype)
            if self.device.type == "cuda"
            else torch.no_grad()
        ):
            output = self.model(**encoded)
        if hasattr(output, "last_hidden_state"):
            embeddings = output.last_hidden_state
            mask = encoded["attention_mask"].unsqueeze(-1)
            # Masked mean pooling; clamp avoids division by zero on empty masks.
            embeddings = (embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp_min(1e-9)
        elif hasattr(output, "pooler_output"):
            embeddings = output.pooler_output
        else:
            embeddings = output[0].mean(dim=1)
        embeddings = torch.nn.functional.normalize(embeddings.float(), p=2, dim=1)
        return embeddings.cpu().numpy().astype("float32")

    def encode(self, texts: list[str]) -> np.ndarray:
        """Embed *texts* in BATCH_SIZE chunks; returns an empty array for no input."""
        if not texts:
            return np.zeros((0, 0), dtype="float32")
        output = []
        for i in range(0, len(texts), BATCH_SIZE):
            output.append(self._encode_batch(texts[i : i + BATCH_SIZE]))
            if self.device.type == "cuda":
                torch.cuda.empty_cache()
        return np.vstack(output) if output else np.zeros((0, 0), dtype="float32")

    def _topk(
        self,
        query_embedding_row: np.ndarray,
        base_embeddings: np.ndarray,
        scales: np.ndarray | None,
        identifier_map: dict[int, str],
        k: int,
        pool_size: int | None = None,
    ) -> list[tuple[str, float]]:
        """Return up to *k* (identifier, similarity) pairs, best first.

        For int8 indexes the query is rescaled by the stored per-dimension
        scales before the dot product.

        NOTE(review): np.argpartition raises when the partition index is >=
        the number of indexed rows, so this assumes the index holds more than
        k + 32 entries — confirm against the smallest shipped index.
        """
        if self.precision == "int8":
            if scales is None:
                raise ValueError("Missing int8 scales for int8 search.")
            weighted_query = (query_embedding_row * scales).astype("float32")
            similarities = weighted_query @ base_embeddings.T.astype("float32")
        else:
            similarities = query_embedding_row @ base_embeddings.T
        # Partial sort of a candidate pool, then exact sort within the pool.
        pool = k + 32 if pool_size is None else max(k, pool_size)
        indices = np.argpartition(-similarities, pool)[:pool]
        indices = indices[np.argsort(-similarities[indices])]
        output = []
        for match_id in indices:
            identifier = identifier_map[int(match_id)]
            output.append((identifier, float(similarities[match_id])))
            if len(output) >= k:
                break
        return output

    def _combine_hybrid(
        self,
        candidates: list[tuple[str, float]],
        query_tokens: set[str],
        tokens_map: dict[str, list[str]],
        idf_map: dict[str, float],
        default_idf: float,
        k: int,
    ) -> tuple[list[tuple[str, float]], dict[str, float], dict[str, float]]:
        """Rerank embedding candidates with an IDF-weighted Jaccard blend.

        Returns the top-k (identifier, hybrid score) list plus the raw
        embedding and Jaccard score maps for every candidate considered.
        """
        embedding_scores: dict[str, float] = {}
        jaccard_scores: dict[str, float] = {}
        hybrid_scores = []
        for identifier, embedding_score in candidates:
            tokens = set(tokens_map.get(identifier, []))
            jaccard_score = _weighted_jaccard(query_tokens, tokens, idf_map, default_idf)
            embedding_scores[identifier] = embedding_score
            jaccard_scores[identifier] = jaccard_score
            # Negative cosine similarities are floored at 0 before blending.
            hybrid = HYBRID_ALPHA * max(0.0, embedding_score) + (1.0 - HYBRID_ALPHA) * jaccard_score
            hybrid_scores.append((identifier, hybrid))
        hybrid_scores.sort(key=lambda item: item[1], reverse=True)
        return hybrid_scores[:k], embedding_scores, jaccard_scores

    def _extract_definitions_from_code(
        self,
        code: str,
        model_hint: str | None,
        granularity: str,
    ) -> tuple[dict[str, str], dict[str, str], dict[str, list[str]], dict[str, str]]:
        """Split *code* into query units matching the index granularity.

        Returns parallel dicts keyed by identifier ("name" or "Class.method"):
        raw source, sanitized source, sorted token list, and kind
        ("function" | "class" | "method"). At "method" granularity each method
        is prefixed with its class header (and first docstring line) for
        context. Raises SyntaxError via ast.parse on invalid input.
        """
        definitions_raw: dict[str, str] = {}
        definitions_sanitized: dict[str, str] = {}
        definitions_tokens: dict[str, list[str]] = {}
        definitions_kind: dict[str, str] = {}
        lines = code.splitlines()
        tree = ast.parse(code)
        for node in ast.iter_child_nodes(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and granularity in ("definition", "method"):
                segment = ast.get_source_segment(code, node)
                # Fallback slice by line numbers when get_source_segment fails.
                if segment is None and hasattr(node, "lineno") and hasattr(node, "end_lineno"):
                    start = max(0, node.lineno - 1)
                    end = node.end_lineno
                    segment = "\n".join(lines[start:end])
                if not segment:
                    continue
                identifier = node.name
                definitions_raw[identifier] = segment
                sanitized = _sanitize_for_embedding(segment, model_hint, node.name)
                definitions_sanitized[identifier] = sanitized
                definitions_tokens[identifier] = sorted(
                    set(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", sanitized))
                )
                definitions_kind[identifier] = "function"
                continue

            if isinstance(node, ast.ClassDef):
                class_segment = ast.get_source_segment(code, node)
                if class_segment is None and hasattr(node, "lineno") and hasattr(node, "end_lineno"):
                    start = max(0, node.lineno - 1)
                    end = node.end_lineno
                    class_segment = "\n".join(lines[start:end])
                # Build a short class context (header + first docstring line)
                # to prepend to each method at "method" granularity.
                class_header = ""
                if class_segment:
                    class_header = class_segment.splitlines()[0].strip()
                class_docstring = ast.get_docstring(node)
                class_context = class_header
                if class_docstring:
                    first_line = class_docstring.strip().splitlines()[0]
                    class_context = f'{class_header}\n"""{first_line}"""' if class_header else first_line

                if granularity == "definition":
                    if not class_segment:
                        continue
                    identifier = node.name
                    definitions_raw[identifier] = class_segment
                    sanitized = _sanitize_for_embedding(class_segment, model_hint, node.name)
                    definitions_sanitized[identifier] = sanitized
                    definitions_tokens[identifier] = sorted(
                        set(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", sanitized))
                    )
                    definitions_kind[identifier] = "class"
                    continue

                for child in node.body:
                    if not isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        continue
                    segment = ast.get_source_segment(code, child)
                    if segment is None and hasattr(child, "lineno") and hasattr(child, "end_lineno"):
                        start = max(0, child.lineno - 1)
                        end = child.end_lineno
                        segment = "\n".join(lines[start:end])
                    if not segment:
                        continue
                    method_name = child.name
                    combined = f"{class_context}\n{segment}" if class_context else segment
                    identifier = f"{node.name}.{method_name}"
                    definitions_raw[identifier] = segment
                    sanitized = _sanitize_for_embedding(combined, model_hint, node.name)
                    definitions_sanitized[identifier] = sanitized
                    definitions_tokens[identifier] = sorted(
                        set(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", sanitized))
                    )
                    definitions_kind[identifier] = "method"
        return definitions_raw, definitions_sanitized, definitions_tokens, definitions_kind

    def analyze_code(
        self,
        code: str,
        top_k_per_item: int = 5,
        use_jaccard: bool = False,
        model_hint: str | None = None,
    ) -> dict[str, object]:
        """Find the closest indexed code for each definition in *code*.

        Returns {"results": per-definition match dicts, "overall": files ranked
        by summed match scores}. Each result entry carries its kind plus an
        "embedding" list of match dicts (identifier, paths, hybrid/raw scores,
        resolved line). When use_jaccard is set, an (empty) "jaccard" list is
        also attached.
        """
        index_data = self._load_index()
        base_embeddings = index_data["embeddings"]
        scales = index_data["scales"]
        identifier_map = index_data["identifier_map"]
        tokens_map = index_data["tokens_map"]
        idf_map = index_data["idf_map"]
        default_idf = index_data["default_idf"]
        # NOTE(review): computed but unused — presumably leftover; confirm.
        identifiers = [identifier_map[i] for i in range(len(identifier_map))]

        definitions_raw, definitions_sanitized, _, definitions_kind = self._extract_definitions_from_code(
            code, model_hint, self.index_granularity
        )
        query_identifiers = list(definitions_raw.keys())
        query_sources_sanitized = [definitions_sanitized[key] for key in query_identifiers]
        query_tokens_list = [
            set(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", source)) for source in query_sources_sanitized
        ]

        query_embeddings = self.encode(query_sources_sanitized)

        output = {}
        for i, query_identifier in enumerate(query_identifiers):
            query_name = query_identifier
            # _topk is asked for the full candidate pool (k == pool_size);
            # _combine_hybrid trims to top_k_per_item after reranking.
            pool_size = max(top_k_per_item * 5, top_k_per_item + 32)
            candidates = self._topk(
                query_embeddings[i],
                base_embeddings,
                scales,
                identifier_map,
                pool_size,
                pool_size=pool_size,
            )
            embedding_top, embedding_scores, jaccard_scores = self._combine_hybrid(
                candidates,
                query_tokens_list[i],
                tokens_map,
                idf_map,
                default_idf,
                top_k_per_item,
            )
            entry: dict[str, object] = {
                "kind": definitions_kind.get(query_identifier, "function"),
                "embedding": [],
            }
            for identifier, score in embedding_top:
                # Index identifiers are "relative_path:match_name"; skip
                # anything that does not follow that shape.
                if ":" not in identifier:
                    continue
                relative_path, match_name = identifier.split(":", 1)
                full_path, line = _resolve_definition_location(relative_path, match_name)
                entry["embedding"].append(
                    {
                        "identifier": identifier,
                        "relative_path": relative_path,
                        "match_name": match_name,
                        "score": score,
                        "embedding_score": embedding_scores.get(identifier),
                        "jaccard_score": jaccard_scores.get(identifier),
                        "full_path": full_path,
                        "line": line,
                    }
                )
            if use_jaccard:
                entry["jaccard"] = []
            output[query_name] = entry

        # Rank whole files by the sum of their hybrid match scores.
        aggregate_scores: dict[str, float] = {}
        for data in output.values():
            for match in data.get("embedding", []):
                relative_path = match.get("relative_path")
                score = match.get("score")
                if relative_path is None or score is None:
                    continue
                aggregate_scores[relative_path] = aggregate_scores.get(relative_path, 0.0) + float(score)

        overall = sorted(
            (
                {"relative_path": relative_path, "score": score}
                for relative_path, score in aggregate_scores.items()
            ),
            key=lambda item: item["score"],
            reverse=True,
        )

        return {
            "results": output,
            "overall": overall,
        }

    def index_status(self) -> dict[str, object]:
        """Summarize how (and from where) the index was resolved, for diagnostics."""
        return {
            "requested_granularity": self.requested_granularity,
            "resolved_granularity": self.index_granularity,
            "precision": self.precision,
            "hub_dataset": self.hub_dataset,
            "index_dir": str(self.index_dir) if self.index_dir else None,
            "index_origin": self.index_origin,
            "missing_files": list(self.missing_files),
            "embedding_model": EMBEDDING_MODEL,
        }
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
def get_default_hub_dataset() -> str:
    """Return the index dataset repo id, honoring the HUB_DATASET env override."""
    return os.environ.get("HUB_DATASET", HUB_DATASET_DEFAULT)
|
app/graph.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class Graph:
    """Node/edge graph of the classes, methods, and functions in a snippet."""

    # Each node: {"id": ..., "label": ..., "type": "class" | "method" | "function"}.
    nodes: list[dict[str, str]]
    # Each edge: {"source": ..., "target": ..., "type": "contains" | "calls"}.
    edges: list[dict[str, str]]
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _call_target_name(node: ast.AST) -> str | None:
|
| 12 |
+
if isinstance(node, ast.Name):
|
| 13 |
+
return node.id
|
| 14 |
+
if isinstance(node, ast.Attribute) and isinstance(node.attr, str):
|
| 15 |
+
return node.attr
|
| 16 |
+
return None
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_graph(code: str) -> Graph:
    """Build a containment/call graph over the top-level defs and classes in *code*.

    Nodes cover top-level classes, their direct methods, and top-level
    functions; "contains" edges link classes to methods and "calls" edges link
    definitions whose bodies invoke other indexed definitions (including
    self.method and Class.method calls). Raises SyntaxError on invalid input.
    """
    module = ast.parse(code)

    top_functions: dict[str, ast.AST] = {}
    class_methods: dict[str, list[str]] = {}
    for stmt in module.body:
        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            top_functions[stmt.name] = stmt
        elif isinstance(stmt, ast.ClassDef):
            class_methods[stmt.name] = [
                member.name
                for member in stmt.body
                if isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef))
            ]

    graph_nodes: list[dict[str, str]] = []
    graph_edges: list[dict[str, str]] = []

    # Classes first (with their methods and containment edges), then functions.
    for cls_name, names in class_methods.items():
        graph_nodes.append({"id": cls_name, "label": cls_name, "type": "class"})
        for name in names:
            qualified = f"{cls_name}.{name}"
            graph_nodes.append({"id": qualified, "label": name, "type": "method"})
            graph_edges.append({"source": cls_name, "target": qualified, "type": "contains"})

    graph_nodes.extend(
        {"id": name, "label": name, "type": "function"} for name in top_functions
    )

    node_ids = {entry["id"] for entry in graph_nodes}
    discovered: set[tuple[str, str]] = set()

    def record(source: str, target: str) -> None:
        # Skip self-loops and targets that are not indexed nodes.
        if source != target and target in node_ids:
            discovered.add((source, target))

    # Calls made from top-level functions.
    for name, fn_node in top_functions.items():
        for inner in ast.walk(fn_node):
            if not isinstance(inner, ast.Call):
                continue
            callee = _call_target_name(inner.func)
            if callee is not None and callee in top_functions:
                record(name, callee)

    # Calls made from methods: plain names, self.method, and Class.method.
    for cls_name in class_methods:
        for stmt in module.body:
            if not (isinstance(stmt, ast.ClassDef) and stmt.name == cls_name):
                continue
            for member in stmt.body:
                if not isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef)):
                    continue
                caller_id = f"{cls_name}.{member.name}"
                for inner in ast.walk(member):
                    if not isinstance(inner, ast.Call):
                        continue
                    callee = inner.func
                    if isinstance(callee, ast.Name):
                        if callee.id in top_functions:
                            record(caller_id, callee.id)
                    elif isinstance(callee, ast.Attribute) and isinstance(callee.value, ast.Name):
                        owner = callee.value.id
                        if owner == "self":
                            record(caller_id, f"{cls_name}.{callee.attr}")
                        elif owner in class_methods:
                            record(caller_id, f"{owner}.{callee.attr}")

    # Deterministic edge order for stable API responses.
    for source, target in sorted(discovered):
        graph_edges.append({"source": source, "target": target, "type": "calls"})

    return Graph(nodes=graph_nodes, edges=graph_edges)
|
app/main.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from fastapi import FastAPI, HTTPException
|
| 4 |
+
from fastapi.responses import FileResponse
|
| 5 |
+
from fastapi.staticfiles import StaticFiles
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
|
| 8 |
+
from app.detector import CodeSimilarityAnalyzer, get_default_hub_dataset
|
| 9 |
+
from app.graph import build_graph
|
| 10 |
+
|
| 11 |
+
# Project root (one level above app/) and the directory holding the static UI.
BASE_DIR = Path(__file__).resolve().parent.parent
STATIC_DIR = BASE_DIR / "static"

app = FastAPI(title="Modular Model Graph")
# Serve the front-end assets; index.html loads /static/app.js and /static/styles.css.
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AnalyzeRequest(BaseModel):
    """Request payload for POST /api/analyze."""

    # Source code of the modeling file to analyze (must be non-empty).
    code: str = Field(..., min_length=1)
    # Number of nearest matches returned per symbol, bounded to 1..25.
    top_k: int = Field(default=5, ge=1, le=25)
    # Forwarded to the analyzer; presumably enables Jaccard scoring — see detector.py.
    use_jaccard: bool = False
    # Index granularity; the endpoint rejects anything but 'method' or 'definition'.
    granularity: str = "method"
    # Embedding precision; the endpoint rejects anything but 'float32' or 'int8'.
    precision: str = "float32"
    # Optional Hub dataset id; None falls back to get_default_hub_dataset().
    hub_dataset: str | None = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Cache of analyzers keyed by (precision, granularity, hub_dataset) so repeated
# requests with the same settings reuse an already-loaded index.
_ANALYZERS: dict[tuple[str, str, str], CodeSimilarityAnalyzer] = {}


def _get_analyzer(precision: str, granularity: str, hub_dataset: str) -> CodeSimilarityAnalyzer:
    """Return a memoized CodeSimilarityAnalyzer for the given configuration.

    Constructing an analyzer is expensive (it loads an index), so instances
    are cached per (precision, granularity, hub_dataset) combination.
    """
    cache_key = (precision, granularity, hub_dataset)
    cached = _ANALYZERS.get(cache_key)
    if cached is None:
        cached = CodeSimilarityAnalyzer(
            hub_dataset=hub_dataset,
            precision=precision,
            granularity=granularity,
        )
        _ANALYZERS[cache_key] = cached
    return cached
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@app.get("/")
async def index() -> FileResponse:
    """Serve the single-page UI shell."""
    return FileResponse(path=STATIC_DIR / "index.html")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@app.post("/api/analyze")
def analyze(request: AnalyzeRequest) -> dict:
    """Analyze pasted code: build its structure graph and find similar models.

    Returns a dict with the structure graph, per-symbol matches, aggregate
    file-level matches, and information about the index that served them.
    Bad granularity/precision values and syntax errors in the pasted code are
    reported as HTTP 400.

    Declared as a plain ``def`` (not ``async def``) on purpose: FastAPI then
    runs it in a worker thread, so the CPU-heavy embedding/index lookup in
    ``analyzer.analyze_code`` does not block the event loop for every client.
    """
    # Fail fast on invalid enum-like fields before touching the analyzer cache.
    if request.granularity not in ("method", "definition"):
        raise HTTPException(status_code=400, detail="granularity must be 'method' or 'definition'")
    if request.precision not in ("float32", "int8"):
        raise HTTPException(status_code=400, detail="precision must be 'float32' or 'int8'")
    hub_dataset = request.hub_dataset or get_default_hub_dataset()
    analyzer = _get_analyzer(request.precision, request.granularity, hub_dataset)
    try:
        graph = build_graph(request.code)
    except SyntaxError as exc:
        raise HTTPException(status_code=400, detail=f"Syntax error: {exc.msg} at line {exc.lineno}") from exc
    results = analyzer.analyze_code(
        request.code,
        top_k_per_item=request.top_k,
        use_jaccard=request.use_jaccard,
    )
    return {
        "graph": {"nodes": graph.nodes, "edges": graph.edges},
        "results": results["results"],
        "overall": results["overall"],
        "index_info": analyzer.index_status(),
    }
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn==0.30.6
|
| 3 |
+
sentence-transformers
|
| 4 |
+
transformers
|
| 5 |
+
huggingface_hub==0.24.6
|
| 6 |
+
safetensors==0.4.5
|
| 7 |
+
numpy==1.26.4
|
static/app.js
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Cached references to the static UI elements this script drives.
const analyzeBtn = document.getElementById("analyzeBtn");   // "Analyze" button
const codeInput = document.getElementById("codeInput");     // code textarea
const statusEl = document.getElementById("status");         // status line
const indexInfoEl = document.getElementById("indexInfo");   // index-provenance line
const graphEl = document.getElementById("graph");           // d3 graph container
const matchesEl = document.getElementById("matches");       // per-symbol match cards
const overallEl = document.getElementById("overall");       // aggregate match list
|
| 8 |
+
|
| 9 |
+
// Show a one-line status message (progress / errors) under the input panel.
function setStatus(message) {
  statusEl.textContent = message;
}
|
| 12 |
+
|
| 13 |
+
// Describe which similarity index served the request: resolved granularity,
// whether it was a fallback, its origin, and its directory. Clears the line
// when `info` is null/undefined.
function renderIndexInfo(info) {
  if (!indexInfoEl) return;
  if (!info) {
    indexInfoEl.textContent = "";
    return;
  }
  const requested = info.requested_granularity || "method";
  const resolved = info.resolved_granularity || requested;
  const parts = [`Using ${resolved} index`];
  if (requested !== resolved) {
    parts.push(`(fallback from ${requested})`);
  }
  const origin = info.index_origin;
  if (origin) {
    parts.push(origin === "hub" ? "from Hub" : `from ${origin}`);
  }
  if (info.index_dir) {
    parts.push(`@ ${info.index_dir}`);
  }
  indexInfoEl.textContent = parts.join(" ");
}
|
| 37 |
+
|
| 38 |
+
// Render the top-10 aggregate matches into the "Closest Models" panel.
function renderOverall(overall) {
  overallEl.innerHTML = "";
  if (!overall || overall.length === 0) {
    overallEl.textContent = "No aggregate matches yet.";
    return;
  }
  overall.slice(0, 10).forEach((entry) => {
    const item = document.createElement("div");
    item.className = "overall-item";
    item.textContent = `${entry.relative_path} (${entry.score.toFixed(4)})`;
    overallEl.appendChild(item);
  });
}
|
| 52 |
+
|
| 53 |
+
// Render per-symbol embedding matches as cards in the "Matches by Symbol"
// panel. `results` maps symbol name -> { embedding: [match, ...] }.
function renderMatches(results) {
  matchesEl.innerHTML = "";
  const symbols = Object.keys(results);
  if (symbols.length === 0) {
    matchesEl.textContent = "No matches returned.";
    return;
  }
  symbols.forEach((symbol) => {
    const card = document.createElement("div");
    card.className = "match-card";

    const heading = document.createElement("h3");
    heading.textContent = symbol;
    card.appendChild(heading);

    const list = document.createElement("div");
    list.className = "match-list";
    (results[symbol].embedding || []).forEach((match) => {
      const row = document.createElement("div");
      row.className = "match-row";
      const nameSpan = document.createElement("span");
      nameSpan.textContent = `${match.match_name} (${match.score.toFixed(4)})`;
      const pathSpan = document.createElement("span");
      pathSpan.textContent = match.relative_path;
      row.appendChild(nameSpan);
      row.appendChild(pathSpan);
      list.appendChild(row);
    });
    card.appendChild(list);
    matchesEl.appendChild(card);
  });
}
|
| 84 |
+
|
| 85 |
+
// Draw a force-directed d3 graph of the code structure (classes, methods,
// functions) with draggable nodes and text labels.
function renderGraph(graph) {
  graphEl.innerHTML = "";
  const width = graphEl.clientWidth;
  const height = graphEl.clientHeight;
  const svg = d3
    .select(graphEl)
    .append("svg")
    .attr("width", width)
    .attr("height", height);

  // Shallow-copy nodes/edges: the simulation mutates them in place (x, y,
  // vx, vy, and link source/target get rewritten to node objects), and we
  // don't want that leaking into the response payload.
  const nodes = graph.nodes.map((node) => ({ ...node }));
  const links = graph.edges.map((edge) => ({ ...edge }));

  // Node fill by type; values match the legend colors declared in styles.css.
  const color = (type) => {
    if (type === "class") return "#d6572b";
    if (type === "method") return "#2b6fd6";
    if (type === "function") return "#1b8d57";
    return "#666";
  };

  const simulation = d3
    .forceSimulation(nodes)
    .force("link", d3.forceLink(links).id((d) => d.id).distance(80))
    .force("charge", d3.forceManyBody().strength(-220))
    .force("center", d3.forceCenter(width / 2, height / 2));

  // "contains" edges are drawn slightly thicker than other edge types.
  const link = svg
    .append("g")
    .attr("stroke", "#333")
    .attr("stroke-opacity", 0.4)
    .selectAll("line")
    .data(links)
    .join("line")
    .attr("stroke-width", (d) => (d.type === "contains" ? 1.5 : 1));

  const node = svg
    .append("g")
    .attr("stroke", "#fff")
    .attr("stroke-width", 1.5)
    .selectAll("circle")
    .data(nodes)
    .join("circle")
    .attr("r", (d) => (d.type === "class" ? 9 : 6))
    .attr("fill", (d) => color(d.type))
    .call(drag(simulation)); // `drag` is hoisted from the declaration below

  const labels = svg
    .append("g")
    .selectAll("text")
    .data(nodes)
    .join("text")
    .text((d) => d.label)
    .attr("font-size", 11)
    .attr("fill", "#2b1e13")
    .attr("dx", 12)
    .attr("dy", 3);

  // Native browser tooltip showing the full node id (e.g. "Class.method").
  node.append("title").text((d) => d.id);

  // Sync SVG element positions with the simulation on every tick.
  simulation.on("tick", () => {
    link
      .attr("x1", (d) => d.source.x)
      .attr("y1", (d) => d.source.y)
      .attr("x2", (d) => d.target.x)
      .attr("y2", (d) => d.target.y);

    node.attr("cx", (d) => d.x).attr("cy", (d) => d.y);

    labels.attr("x", (d) => d.x).attr("y", (d) => d.y);
  });

  // Standard d3 drag behavior: heat the simulation and pin the node (fx/fy)
  // while dragging, then release it when the drag ends.
  function drag(sim) {
    function dragstarted(event, d) {
      if (!event.active) sim.alphaTarget(0.3).restart();
      d.fx = d.x;
      d.fy = d.y;
    }

    function dragged(event, d) {
      d.fx = event.x;
      d.fy = event.y;
    }

    function dragended(event, d) {
      if (!event.active) sim.alphaTarget(0);
      d.fx = null;
      d.fy = null;
    }

    return d3.drag().on("start", dragstarted).on("drag", dragged).on("end", dragended);
  }
}
|
| 177 |
+
|
| 178 |
+
// Wire up the Analyze button: POST the pasted code to /api/analyze and
// render the graph, aggregate matches, per-symbol matches, and index info.
analyzeBtn.addEventListener("click", async () => {
  const code = codeInput.value.trim();
  if (!code) {
    setStatus("Paste some code first.");
    return;
  }
  renderIndexInfo(null);
  setStatus("Analyzing... this can take a bit on first run.");
  analyzeBtn.disabled = true; // prevent duplicate concurrent requests
  try {
    const payload = {
      code,
      top_k: Number(document.getElementById("topK").value || 5),
      granularity: document.getElementById("granularity").value,
      use_jaccard: document.getElementById("useJaccard").checked,
      precision: "float32",
    };
    const response = await fetch("/api/analyze", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      // FastAPI reports errors as JSON {"detail": ...}; surface just the
      // detail message instead of the raw JSON body, falling back to the
      // raw text when the body isn't JSON.
      const raw = await response.text();
      let detail = raw;
      try {
        const parsed = JSON.parse(raw);
        if (parsed && parsed.detail) {
          detail = typeof parsed.detail === "string" ? parsed.detail : JSON.stringify(parsed.detail);
        }
      } catch (_) {
        // body was not JSON; keep the raw text
      }
      throw new Error(detail || "Request failed");
    }
    const data = await response.json();
    renderGraph(data.graph);
    renderOverall(data.overall);
    renderMatches(data.results);
    renderIndexInfo(data.index_info);
    setStatus("Done.");
  } catch (error) {
    setStatus(`Error: ${error.message || error}`);
  } finally {
    analyzeBtn.disabled = false;
  }
});
|
static/index.html
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 6 |
+
<title>Modular Model Graph</title>
|
| 7 |
+
<link rel="stylesheet" href="/static/styles.css" />
|
| 8 |
+
<script src="https://d3js.org/d3.v7.min.js"></script>
|
| 9 |
+
</head>
|
| 10 |
+
<body>
|
| 11 |
+
<div class="page">
|
| 12 |
+
<header class="hero">
|
| 13 |
+
<div>
|
| 14 |
+
<p class="eyebrow">Transformers similarity explorer</p>
|
| 15 |
+
<h1>Modular Model Graph</h1>
|
| 16 |
+
<p class="subhead">
|
| 17 |
+
Paste a modeling file, visualize its structure, and compare against Transformers models.
|
| 18 |
+
</p>
|
| 19 |
+
</div>
|
| 20 |
+
</header>
|
| 21 |
+
|
| 22 |
+
<section class="panel">
|
| 23 |
+
<div class="panel-header">
|
| 24 |
+
<h2>Input</h2>
|
| 25 |
+
<div class="controls">
|
| 26 |
+
<label>
|
| 27 |
+
Top K
|
| 28 |
+
<input id="topK" type="number" value="5" min="1" max="25" />
|
| 29 |
+
</label>
|
| 30 |
+
<label>
|
| 31 |
+
Granularity
|
| 32 |
+
<select id="granularity">
|
| 33 |
+
<option value="method" selected>method</option>
|
| 34 |
+
<option value="definition">definition</option>
|
| 35 |
+
</select>
|
| 36 |
+
</label>
|
| 37 |
+
<label class="checkbox">
|
| 38 |
+
<input id="useJaccard" type="checkbox" />
|
| 39 |
+
Use Jaccard
|
| 40 |
+
</label>
|
| 41 |
+
<button id="analyzeBtn">Analyze</button>
|
| 42 |
+
</div>
|
| 43 |
+
</div>
|
| 44 |
+
<textarea id="codeInput" placeholder="Paste modeling file code here..."></textarea>
|
| 45 |
+
<p id="status" class="status"></p>
|
| 46 |
+
<p id="indexInfo" class="status"></p>
|
| 47 |
+
</section>
|
| 48 |
+
|
| 49 |
+
<section class="grid">
|
| 50 |
+
<div class="panel">
|
| 51 |
+
<div class="panel-header">
|
| 52 |
+
<h2>Graph</h2>
|
| 53 |
+
<div class="legend">
|
| 54 |
+
<span class="dot class">Class</span>
|
| 55 |
+
<span class="dot method">Method</span>
|
| 56 |
+
<span class="dot function">Function</span>
|
| 57 |
+
<span class="dot call">Call edge</span>
|
| 58 |
+
</div>
|
| 59 |
+
</div>
|
| 60 |
+
<div id="graph" class="graph"></div>
|
| 61 |
+
</div>
|
| 62 |
+
|
| 63 |
+
<div class="panel">
|
| 64 |
+
<div class="panel-header">
|
| 65 |
+
<h2>Closest Models</h2>
|
| 66 |
+
</div>
|
| 67 |
+
<div id="overall" class="overall"></div>
|
| 68 |
+
</div>
|
| 69 |
+
</section>
|
| 70 |
+
|
| 71 |
+
<section class="panel">
|
| 72 |
+
<div class="panel-header">
|
| 73 |
+
<h2>Matches by Symbol</h2>
|
| 74 |
+
</div>
|
| 75 |
+
<div id="matches" class="matches"></div>
|
| 76 |
+
</section>
|
| 77 |
+
</div>
|
| 78 |
+
|
| 79 |
+
<script src="/static/app.js"></script>
|
| 80 |
+
</body>
|
| 81 |
+
</html>
|
static/styles.css
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap");
|
| 2 |
+
|
| 3 |
+
:root {
|
| 4 |
+
--bg: #f6f0e6;
|
| 5 |
+
--panel: #fff7ee;
|
| 6 |
+
--ink: #1b1b1b;
|
| 7 |
+
--muted: #6b5f55;
|
| 8 |
+
--accent: #d6572b;
|
| 9 |
+
--accent-2: #2b6fd6;
|
| 10 |
+
--accent-3: #1b8d57;
|
| 11 |
+
--shadow: rgba(27, 27, 27, 0.1);
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
* {
|
| 15 |
+
box-sizing: border-box;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
body {
|
| 19 |
+
margin: 0;
|
| 20 |
+
font-family: "Space Grotesk", system-ui, sans-serif;
|
| 21 |
+
color: var(--ink);
|
| 22 |
+
background: radial-gradient(circle at 20% 20%, #ffe4c7 0%, transparent 55%),
|
| 23 |
+
radial-gradient(circle at 85% 15%, #f5d2e8 0%, transparent 40%),
|
| 24 |
+
radial-gradient(circle at 70% 80%, #d8f0e2 0%, transparent 45%),
|
| 25 |
+
var(--bg);
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
.page {
|
| 29 |
+
max-width: 1200px;
|
| 30 |
+
margin: 0 auto;
|
| 31 |
+
padding: 32px 24px 64px;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
.hero {
|
| 35 |
+
display: flex;
|
| 36 |
+
justify-content: space-between;
|
| 37 |
+
align-items: flex-end;
|
| 38 |
+
gap: 24px;
|
| 39 |
+
margin-bottom: 24px;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.eyebrow {
|
| 43 |
+
text-transform: uppercase;
|
| 44 |
+
letter-spacing: 0.12em;
|
| 45 |
+
font-size: 12px;
|
| 46 |
+
margin: 0 0 8px;
|
| 47 |
+
color: var(--muted);
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
h1 {
|
| 51 |
+
font-size: 40px;
|
| 52 |
+
margin: 0 0 12px;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.subhead {
|
| 56 |
+
font-size: 16px;
|
| 57 |
+
max-width: 640px;
|
| 58 |
+
margin: 0;
|
| 59 |
+
color: var(--muted);
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.panel {
|
| 63 |
+
background: var(--panel);
|
| 64 |
+
border-radius: 20px;
|
| 65 |
+
padding: 20px;
|
| 66 |
+
box-shadow: 0 14px 30px var(--shadow);
|
| 67 |
+
margin-bottom: 24px;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.panel-header {
|
| 71 |
+
display: flex;
|
| 72 |
+
justify-content: space-between;
|
| 73 |
+
align-items: center;
|
| 74 |
+
flex-wrap: wrap;
|
| 75 |
+
gap: 12px;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
h2 {
|
| 79 |
+
margin: 0;
|
| 80 |
+
font-size: 20px;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.controls {
|
| 84 |
+
display: flex;
|
| 85 |
+
align-items: center;
|
| 86 |
+
gap: 12px;
|
| 87 |
+
flex-wrap: wrap;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.controls label {
|
| 91 |
+
display: flex;
|
| 92 |
+
gap: 8px;
|
| 93 |
+
align-items: center;
|
| 94 |
+
font-size: 14px;
|
| 95 |
+
color: var(--muted);
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
.controls input,
|
| 99 |
+
.controls select {
|
| 100 |
+
border: 1px solid #d9cbbd;
|
| 101 |
+
border-radius: 10px;
|
| 102 |
+
padding: 6px 8px;
|
| 103 |
+
background: #fff;
|
| 104 |
+
font-size: 14px;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
.controls button {
|
| 108 |
+
background: var(--accent);
|
| 109 |
+
color: #fff;
|
| 110 |
+
border: none;
|
| 111 |
+
border-radius: 12px;
|
| 112 |
+
padding: 8px 16px;
|
| 113 |
+
font-weight: 600;
|
| 114 |
+
cursor: pointer;
|
| 115 |
+
transition: transform 0.2s ease;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
.controls button:hover {
|
| 119 |
+
transform: translateY(-1px);
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
textarea {
|
| 123 |
+
width: 100%;
|
| 124 |
+
min-height: 240px;
|
| 125 |
+
margin-top: 16px;
|
| 126 |
+
border-radius: 16px;
|
| 127 |
+
border: 1px solid #d9cbbd;
|
| 128 |
+
padding: 16px;
|
| 129 |
+
font-family: "Space Grotesk", monospace;
|
| 130 |
+
background: #fff;
|
| 131 |
+
resize: vertical;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
.status {
|
| 135 |
+
margin-top: 8px;
|
| 136 |
+
color: var(--muted);
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
.grid {
|
| 140 |
+
display: grid;
|
| 141 |
+
grid-template-columns: minmax(0, 2fr) minmax(0, 1fr);
|
| 142 |
+
gap: 24px;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
.graph {
|
| 146 |
+
width: 100%;
|
| 147 |
+
height: 480px;
|
| 148 |
+
border-radius: 18px;
|
| 149 |
+
background: #fff;
|
| 150 |
+
border: 1px solid #e3d6c8;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
.legend {
|
| 154 |
+
display: flex;
|
| 155 |
+
gap: 10px;
|
| 156 |
+
align-items: center;
|
| 157 |
+
flex-wrap: wrap;
|
| 158 |
+
font-size: 12px;
|
| 159 |
+
color: var(--muted);
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.dot {
|
| 163 |
+
display: inline-flex;
|
| 164 |
+
align-items: center;
|
| 165 |
+
gap: 6px;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
.dot::before {
|
| 169 |
+
content: "";
|
| 170 |
+
width: 10px;
|
| 171 |
+
height: 10px;
|
| 172 |
+
border-radius: 50%;
|
| 173 |
+
background: var(--muted);
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.dot.class::before {
|
| 177 |
+
background: var(--accent);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
.dot.method::before {
|
| 181 |
+
background: var(--accent-2);
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
.dot.function::before {
|
| 185 |
+
background: var(--accent-3);
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
.dot.call::before {
|
| 189 |
+
background: #333;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.overall {
|
| 193 |
+
display: flex;
|
| 194 |
+
flex-direction: column;
|
| 195 |
+
gap: 12px;
|
| 196 |
+
margin-top: 12px;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.overall-item {
|
| 200 |
+
padding: 10px 12px;
|
| 201 |
+
background: #fff;
|
| 202 |
+
border-radius: 12px;
|
| 203 |
+
border: 1px solid #eadccd;
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
.matches {
|
| 207 |
+
display: grid;
|
| 208 |
+
gap: 16px;
|
| 209 |
+
margin-top: 16px;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.match-card {
|
| 213 |
+
background: #fff;
|
| 214 |
+
border-radius: 16px;
|
| 215 |
+
border: 1px solid #eadccd;
|
| 216 |
+
padding: 16px;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.match-card h3 {
|
| 220 |
+
margin: 0 0 8px;
|
| 221 |
+
font-size: 16px;
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
.match-list {
|
| 225 |
+
display: grid;
|
| 226 |
+
gap: 6px;
|
| 227 |
+
font-size: 14px;
|
| 228 |
+
color: var(--muted);
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
.match-row {
|
| 232 |
+
display: flex;
|
| 233 |
+
justify-content: space-between;
|
| 234 |
+
gap: 12px;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
@media (max-width: 960px) {
|
| 238 |
+
.grid {
|
| 239 |
+
grid-template-columns: 1fr;
|
| 240 |
+
}
|
| 241 |
+
}
|