Spaces:
Sleeping
Sleeping
Update app with better diff, new style
Browse files
- app/__pycache__/detector.cpython-312.pyc +0 -0
- app/__pycache__/main.cpython-312.pyc +0 -0
- app/detector.py +172 -38
- app/main.py +17 -3
- static/app.js +78 -1
- static/styles.css +43 -12
app/__pycache__/detector.cpython-312.pyc
CHANGED
|
Binary files a/app/__pycache__/detector.cpython-312.pyc and b/app/__pycache__/detector.cpython-312.pyc differ
|
|
|
app/__pycache__/main.cpython-312.pyc
CHANGED
|
Binary files a/app/__pycache__/main.cpython-312.pyc and b/app/__pycache__/main.cpython-312.pyc differ
|
|
|
app/detector.py
CHANGED
|
@@ -3,7 +3,7 @@ import json
|
|
| 3 |
import math
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
-
from typing import Callable
|
| 7 |
from functools import cache
|
| 8 |
from pathlib import Path
|
| 9 |
|
|
@@ -15,7 +15,18 @@ from transformers import AutoModel, AutoTokenizer
|
|
| 15 |
|
| 16 |
import transformers
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"
|
| 21 |
BATCH_SIZE = 16
|
|
@@ -24,28 +35,46 @@ HUB_DATASET_DEFAULT = "Molbap/modular-detector-embeddings"
|
|
| 24 |
BOILERPLATE_NAMES = {
|
| 25 |
"__init__",
|
| 26 |
"_init_weights",
|
|
|
|
|
|
|
| 27 |
"get_input_embeddings",
|
| 28 |
"set_input_embeddings",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
}
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def _sanitize_for_embedding(code: str, model_hint: str | None, symbol_hint: str | None) -> str:
|
| 33 |
code = _strip_docstrings(code)
|
|
|
|
| 34 |
base = "\n".join(
|
| 35 |
-
line for line in
|
| 36 |
)
|
| 37 |
variants = set()
|
| 38 |
if model_hint:
|
| 39 |
variants.add(model_hint)
|
| 40 |
variants.add(model_hint.replace("_", ""))
|
| 41 |
-
variants.add(
|
| 42 |
if symbol_hint:
|
| 43 |
-
match =
|
| 44 |
prefix = match.group(1) if match else ""
|
| 45 |
if prefix:
|
| 46 |
variants.add(prefix)
|
| 47 |
variants.add(prefix.replace("_", ""))
|
| 48 |
-
variants.add(
|
| 49 |
variants |= {variant.lower() for variant in list(variants)}
|
| 50 |
sanitized = base
|
| 51 |
for variant in sorted({x for x in variants if len(x) >= 3}, key=len, reverse=True):
|
|
@@ -54,11 +83,11 @@ def _sanitize_for_embedding(code: str, model_hint: str | None, symbol_hint: str
|
|
| 54 |
|
| 55 |
|
| 56 |
def _normalize(value: str | None) -> str:
|
| 57 |
-
return
|
| 58 |
|
| 59 |
|
| 60 |
def _leading_prefix(name: str) -> str:
|
| 61 |
-
match =
|
| 62 |
return match.group(1) if match else ""
|
| 63 |
|
| 64 |
|
|
@@ -95,11 +124,22 @@ def _infer_model_prefixes(definitions_kind: dict[str, str]) -> set[str]:
|
|
| 95 |
return prefixes
|
| 96 |
|
| 97 |
|
| 98 |
-
def _calculate_reconstruction_score(
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
def _normalize_source_path(path: str | None) -> str | None:
|
|
@@ -144,6 +184,43 @@ def _strip_docstrings(source: str) -> str:
|
|
| 144 |
return source
|
| 145 |
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
def _normalize_code_for_compare(source: str) -> str:
|
| 148 |
stripped = _strip_docstrings(source)
|
| 149 |
return "".join(line.strip() for line in stripped.splitlines() if line.strip())
|
|
@@ -449,6 +526,8 @@ class CodeSimilarityAnalyzer:
|
|
| 449 |
definitions_kind: dict[str, str] = {}
|
| 450 |
lines = code.splitlines()
|
| 451 |
tree = ast.parse(code)
|
|
|
|
|
|
|
| 452 |
for node in ast.iter_child_nodes(tree):
|
| 453 |
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and granularity in ("definition", "method"):
|
| 454 |
segment = ast.get_source_segment(code, node)
|
|
@@ -460,9 +539,8 @@ class CodeSimilarityAnalyzer:
|
|
| 460 |
continue
|
| 461 |
identifier = node.name
|
| 462 |
definitions_raw[identifier] = segment
|
| 463 |
-
sanitized = _sanitize_for_embedding(segment, model_hint, node.name)
|
| 464 |
-
definitions_sanitized[identifier] = sanitized
|
| 465 |
definitions_kind[identifier] = "function"
|
|
|
|
| 466 |
continue
|
| 467 |
|
| 468 |
if isinstance(node, ast.ClassDef):
|
|
@@ -471,19 +549,14 @@ class CodeSimilarityAnalyzer:
|
|
| 471 |
start = max(0, node.lineno - 1)
|
| 472 |
end = node.end_lineno
|
| 473 |
class_segment = "\n".join(lines[start:end])
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
class_header = class_segment.splitlines()[0].strip()
|
| 477 |
-
class_context = class_header
|
| 478 |
|
| 479 |
if granularity == "definition":
|
| 480 |
-
if not class_segment:
|
| 481 |
-
continue
|
| 482 |
identifier = node.name
|
| 483 |
definitions_raw[identifier] = class_segment
|
| 484 |
-
sanitized = _sanitize_for_embedding(class_segment, model_hint, node.name)
|
| 485 |
-
definitions_sanitized[identifier] = sanitized
|
| 486 |
definitions_kind[identifier] = "class"
|
|
|
|
| 487 |
continue
|
| 488 |
|
| 489 |
for child in node.body:
|
|
@@ -497,12 +570,24 @@ class CodeSimilarityAnalyzer:
|
|
| 497 |
if not segment:
|
| 498 |
continue
|
| 499 |
method_name = child.name
|
| 500 |
-
combined = f"{class_context}\n{segment}" if class_context else segment
|
| 501 |
identifier = f"{node.name}.{method_name}"
|
| 502 |
definitions_raw[identifier] = segment
|
| 503 |
-
sanitized = _sanitize_for_embedding(combined, model_hint, node.name)
|
| 504 |
-
definitions_sanitized[identifier] = sanitized
|
| 505 |
definitions_kind[identifier] = "method"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
return definitions_raw, definitions_sanitized, definitions_kind
|
| 507 |
|
| 508 |
def analyze_code(
|
|
@@ -578,6 +663,15 @@ class CodeSimilarityAnalyzer:
|
|
| 578 |
}
|
| 579 |
for identifier, score in candidates:
|
| 580 |
relative_path, match_name = identifier.split(":", 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
if len(entry_all["embedding"]) < top_k_per_item:
|
| 582 |
full_path, line = _resolve_definition_location(relative_path, match_name)
|
| 583 |
entry_all["embedding"].append(
|
|
@@ -588,16 +682,12 @@ class CodeSimilarityAnalyzer:
|
|
| 588 |
"score": score,
|
| 589 |
"full_path": full_path,
|
| 590 |
"line": line,
|
|
|
|
| 591 |
}
|
| 592 |
)
|
| 593 |
-
if exclude_identical:
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
match_norm = _normalize_code_for_compare(match_segment)
|
| 597 |
-
query_norm = query_compare.get(query_identifier)
|
| 598 |
-
if query_norm and match_norm == query_norm:
|
| 599 |
-
identical_filtered += 1
|
| 600 |
-
continue
|
| 601 |
full_path, line = _resolve_definition_location(relative_path, match_name)
|
| 602 |
entry["embedding"].append(
|
| 603 |
{
|
|
@@ -607,6 +697,7 @@ class CodeSimilarityAnalyzer:
|
|
| 607 |
"score": score,
|
| 608 |
"full_path": full_path,
|
| 609 |
"line": line,
|
|
|
|
| 610 |
}
|
| 611 |
)
|
| 612 |
if len(entry["embedding"]) >= top_k_per_item and len(entry_all["embedding"]) >= top_k_per_item:
|
|
@@ -625,6 +716,38 @@ class CodeSimilarityAnalyzer:
|
|
| 625 |
by_class: dict[str, dict[tuple[str, str], dict[str, object]]] = {}
|
| 626 |
for query_identifier, entry in result_map.items():
|
| 627 |
kind = entry.get("kind", "function")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
qcls = query_class_key(query_identifier, kind)
|
| 629 |
matches = entry.get("embedding", [])
|
| 630 |
if not matches:
|
|
@@ -660,15 +783,26 @@ class CodeSimilarityAnalyzer:
|
|
| 660 |
by_class_out: dict[str, list[dict[str, object]]] = {}
|
| 661 |
for qcls, cand_map in by_class.items():
|
| 662 |
q_method_count = len(
|
| 663 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
)
|
| 665 |
q_method_count = max(1, q_method_count)
|
| 666 |
rows = []
|
| 667 |
for _, slot in cand_map.items():
|
| 668 |
-
|
| 669 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 670 |
coverage_ratio = coverage_count / float(q_method_count)
|
| 671 |
-
base_score = _calculate_reconstruction_score(scores, coverage_ratio)
|
| 672 |
contributors = sorted(slot["contributors"], key=lambda x: float(x["score"]), reverse=True)[:5]
|
| 673 |
rows.append(
|
| 674 |
{
|
|
|
|
| 3 |
import math
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
+
from typing import Callable, Iterable
|
| 7 |
from functools import cache
|
| 8 |
from pathlib import Path
|
| 9 |
|
|
|
|
| 15 |
|
| 16 |
import transformers
|
| 17 |
|
| 18 |
+
_LIB_PATH = Path(transformers.__file__).resolve().parent
|
| 19 |
+
_ENV_REPO = os.getenv("TRANSFORMERS_REPO")
|
| 20 |
+
if _ENV_REPO:
|
| 21 |
+
_env_path = Path(_ENV_REPO)
|
| 22 |
+
_candidate = _env_path / "src" / "transformers" / "models"
|
| 23 |
+
if _candidate.exists():
|
| 24 |
+
MODELS_ROOT = _candidate
|
| 25 |
+
else:
|
| 26 |
+
_fallback = _env_path / "models"
|
| 27 |
+
MODELS_ROOT = _fallback if _fallback.exists() else _LIB_PATH / "models"
|
| 28 |
+
else:
|
| 29 |
+
MODELS_ROOT = _LIB_PATH / "models"
|
| 30 |
|
| 31 |
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"
|
| 32 |
BATCH_SIZE = 16
|
|
|
|
| 35 |
BOILERPLATE_NAMES = {
|
| 36 |
"__init__",
|
| 37 |
"_init_weights",
|
| 38 |
+
"__repr__",
|
| 39 |
+
"extra_repr",
|
| 40 |
"get_input_embeddings",
|
| 41 |
"set_input_embeddings",
|
| 42 |
+
"get_output_embeddings",
|
| 43 |
+
"set_output_embeddings",
|
| 44 |
+
"tie_weights",
|
| 45 |
+
"post_init",
|
| 46 |
+
"forward",
|
| 47 |
+
"init_weights",
|
| 48 |
+
"reset_parameters",
|
| 49 |
+
"training",
|
| 50 |
}
|
| 51 |
|
| 52 |
+
_RE_COMMENT = re.compile(r"#.*")
|
| 53 |
+
_RE_IMPORT = re.compile(r"\s*(from|import)\s+")
|
| 54 |
+
_RE_MODEL_HINT = re.compile(r"\d+")
|
| 55 |
+
_RE_LEADING_PREFIX = re.compile(r"^([A-Z][a-z0-9]+)")
|
| 56 |
+
_RE_ALPHANUM = re.compile(r"^([A-Za-z0-9]+)")
|
| 57 |
+
_RE_NORMALIZE = re.compile(r"[^a-z0-9]+")
|
| 58 |
+
|
| 59 |
|
| 60 |
def _sanitize_for_embedding(code: str, model_hint: str | None, symbol_hint: str | None) -> str:
|
| 61 |
code = _strip_docstrings(code)
|
| 62 |
+
cleaned = _RE_COMMENT.sub("", code)
|
| 63 |
base = "\n".join(
|
| 64 |
+
line for line in cleaned.splitlines() if line.strip() and not _RE_IMPORT.match(line)
|
| 65 |
)
|
| 66 |
variants = set()
|
| 67 |
if model_hint:
|
| 68 |
variants.add(model_hint)
|
| 69 |
variants.add(model_hint.replace("_", ""))
|
| 70 |
+
variants.add(_RE_MODEL_HINT.sub("", model_hint))
|
| 71 |
if symbol_hint:
|
| 72 |
+
match = _RE_LEADING_PREFIX.match(symbol_hint) or _RE_ALPHANUM.match(symbol_hint)
|
| 73 |
prefix = match.group(1) if match else ""
|
| 74 |
if prefix:
|
| 75 |
variants.add(prefix)
|
| 76 |
variants.add(prefix.replace("_", ""))
|
| 77 |
+
variants.add(_RE_MODEL_HINT.sub("", prefix))
|
| 78 |
variants |= {variant.lower() for variant in list(variants)}
|
| 79 |
sanitized = base
|
| 80 |
for variant in sorted({x for x in variants if len(x) >= 3}, key=len, reverse=True):
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
def _normalize(value: str | None) -> str:
    """Lower-case *value* and remove every run matched by ``_RE_NORMALIZE``.

    Returns an empty string when *value* is ``None`` or empty.
    """
    if not value:
        return ""
    lowered = value.lower()
    return _RE_NORMALIZE.sub("", lowered)
|
| 87 |
|
| 88 |
|
| 89 |
def _leading_prefix(name: str) -> str:
    """Return the leading prefix captured from the start of *name*.

    ``_RE_LEADING_PREFIX`` is tried first; ``_RE_ALPHANUM`` is the fallback.
    An empty string is returned when neither pattern matches.
    """
    found = _RE_LEADING_PREFIX.match(name)
    if found is None:
        found = _RE_ALPHANUM.match(name)
    if found is None:
        return ""
    return found.group(1)
|
| 92 |
|
| 93 |
|
|
|
|
| 124 |
return prefixes
|
| 125 |
|
| 126 |
|
| 127 |
+
def _calculate_reconstruction_score(
    contributors: Iterable[dict[str, object]],
    query_method_count: int,
) -> tuple[float, int]:
    """Score how well a candidate covers a query's methods.

    For every distinct ``"query"`` name among *contributors*, only the highest
    ``"score"`` is kept. Entries with a missing/empty ``"query"`` are ignored,
    and a name is only recorded once it has a score strictly above ``0.0``.

    Returns:
        ``(sum of kept scores / query_method_count, number of kept names)``,
        or ``(0.0, 0)`` when *query_method_count* is not positive.
    """
    if query_method_count <= 0:
        return 0.0, 0

    top_by_query: dict[str, float] = {}
    for item in contributors:
        raw_name = item.get("query")
        if not raw_name:
            continue
        value = float(item.get("score", 0.0))
        key = str(raw_name)
        # Default of 0.0 means non-positive scores never create an entry.
        if value > top_by_query.get(key, 0.0):
            top_by_query[key] = value

    covered = len(top_by_query)
    averaged = sum(top_by_query.values()) / float(query_method_count)
    return averaged, covered
|
| 143 |
|
| 144 |
|
| 145 |
def _normalize_source_path(path: str | None) -> str | None:
|
|
|
|
| 184 |
return source
|
| 185 |
|
| 186 |
|
| 187 |
+
def _strip_docstrings_in_tree(tree: ast.AST) -> None:
    """Remove leading docstring statements from *tree*, mutating it in place.

    The top-level ``tree.body`` is handled first, then the body of every
    function, async function and class reached by ``ast.walk``. A body that
    consisted solely of a docstring is left empty.
    """

    def _drop_leading_docstring(statements: list[ast.stmt]) -> None:
        # A docstring is a bare expression statement wrapping a str constant.
        if not statements:
            return
        head = statements[0]
        if not isinstance(head, ast.Expr):
            return
        literal = getattr(head, "value", None)
        if isinstance(literal, ast.Constant) and isinstance(literal.value, str):
            del statements[0]

    _drop_leading_docstring(tree.body)
    scoped_kinds = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    for node in ast.walk(tree):
        if isinstance(node, scoped_kinds):
            _drop_leading_docstring(node.body)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _sanitize_unparsed_code(code: str, model_hint: str | None, symbol_hint: str | None) -> str:
    """Strip comments/imports from *code* and mask model-specific names.

    Text matched by ``_RE_COMMENT`` is removed, blank lines and lines matched
    by ``_RE_IMPORT`` are dropped, and every case-insensitive occurrence of a
    name hint (at least three characters long) is replaced with ``"Model"``.

    Hints come from *model_hint* (as given, without underscores, and with
    ``_RE_MODEL_HINT`` matches removed) and from the leading prefix of
    *symbol_hint* (same three variants).
    """
    without_comments = _RE_COMMENT.sub("", code)
    kept_lines = [
        text
        for text in without_comments.splitlines()
        if text.strip() and not _RE_IMPORT.match(text)
    ]
    result = "\n".join(kept_lines)

    candidates: set[str] = set()
    if model_hint:
        candidates.update(
            (
                model_hint,
                model_hint.replace("_", ""),
                _RE_MODEL_HINT.sub("", model_hint),
            )
        )
    if symbol_hint:
        found = _RE_LEADING_PREFIX.match(symbol_hint) or _RE_ALPHANUM.match(symbol_hint)
        stem = found.group(1) if found else ""
        if stem:
            candidates.update(
                (
                    stem,
                    stem.replace("_", ""),
                    _RE_MODEL_HINT.sub("", stem),
                )
            )

    usable = {hint for hint in candidates if len(hint) >= 3}
    if not usable:
        return result

    # Longest alternatives first so a longer hint wins over its own prefix.
    ordered = sorted(usable, key=len, reverse=True)
    matcher = re.compile("|".join(re.escape(hint) for hint in ordered), re.IGNORECASE)
    return matcher.sub("Model", result)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
def _normalize_code_for_compare(source: str) -> str:
    """Collapse *source* into a single whitespace-trimmed string for comparison.

    Docstrings are removed via ``_strip_docstrings``; each remaining line is
    stripped of surrounding whitespace, blank lines are dropped, and the
    pieces are concatenated with no separator.
    """
    without_docs = _strip_docstrings(source)
    pieces = []
    for raw_line in without_docs.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            pieces.append(trimmed)
    return "".join(pieces)
|
|
|
|
| 526 |
definitions_kind: dict[str, str] = {}
|
| 527 |
lines = code.splitlines()
|
| 528 |
tree = ast.parse(code)
|
| 529 |
+
entries: list[tuple[str, str, ast.AST, ast.ClassDef | None]] = []
|
| 530 |
+
|
| 531 |
for node in ast.iter_child_nodes(tree):
|
| 532 |
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and granularity in ("definition", "method"):
|
| 533 |
segment = ast.get_source_segment(code, node)
|
|
|
|
| 539 |
continue
|
| 540 |
identifier = node.name
|
| 541 |
definitions_raw[identifier] = segment
|
|
|
|
|
|
|
| 542 |
definitions_kind[identifier] = "function"
|
| 543 |
+
entries.append((identifier, "function", node, None))
|
| 544 |
continue
|
| 545 |
|
| 546 |
if isinstance(node, ast.ClassDef):
|
|
|
|
| 549 |
start = max(0, node.lineno - 1)
|
| 550 |
end = node.end_lineno
|
| 551 |
class_segment = "\n".join(lines[start:end])
|
| 552 |
+
if not class_segment:
|
| 553 |
+
continue
|
|
|
|
|
|
|
| 554 |
|
| 555 |
if granularity == "definition":
|
|
|
|
|
|
|
| 556 |
identifier = node.name
|
| 557 |
definitions_raw[identifier] = class_segment
|
|
|
|
|
|
|
| 558 |
definitions_kind[identifier] = "class"
|
| 559 |
+
entries.append((identifier, "class", node, None))
|
| 560 |
continue
|
| 561 |
|
| 562 |
for child in node.body:
|
|
|
|
| 570 |
if not segment:
|
| 571 |
continue
|
| 572 |
method_name = child.name
|
|
|
|
| 573 |
identifier = f"{node.name}.{method_name}"
|
| 574 |
definitions_raw[identifier] = segment
|
|
|
|
|
|
|
| 575 |
definitions_kind[identifier] = "method"
|
| 576 |
+
entries.append((identifier, "method", child, node))
|
| 577 |
+
|
| 578 |
+
_strip_docstrings_in_tree(tree)
|
| 579 |
+
for identifier, kind, node, parent in entries:
|
| 580 |
+
try:
|
| 581 |
+
if kind == "method" and parent is not None:
|
| 582 |
+
parent_header = ast.unparse(parent).splitlines()[0]
|
| 583 |
+
combined = f"{parent_header}\n{ast.unparse(node)}"
|
| 584 |
+
sanitized = _sanitize_unparsed_code(combined, model_hint, parent.name)
|
| 585 |
+
else:
|
| 586 |
+
sanitized = _sanitize_unparsed_code(ast.unparse(node), model_hint, identifier)
|
| 587 |
+
except Exception:
|
| 588 |
+
sanitized = definitions_raw.get(identifier, "")
|
| 589 |
+
definitions_sanitized[identifier] = sanitized
|
| 590 |
+
|
| 591 |
return definitions_raw, definitions_sanitized, definitions_kind
|
| 592 |
|
| 593 |
def analyze_code(
|
|
|
|
| 663 |
}
|
| 664 |
for identifier, score in candidates:
|
| 665 |
relative_path, match_name = identifier.split(":", 1)
|
| 666 |
+
is_identical = False
|
| 667 |
+
match_segment = None
|
| 668 |
+
if exclude_identical or query_compare:
|
| 669 |
+
match_segment = _get_definition_segment(relative_path, match_name)
|
| 670 |
+
if match_segment is not None:
|
| 671 |
+
match_norm = _normalize_code_for_compare(match_segment)
|
| 672 |
+
query_norm = query_compare.get(query_identifier)
|
| 673 |
+
if query_norm and match_norm == query_norm:
|
| 674 |
+
is_identical = True
|
| 675 |
if len(entry_all["embedding"]) < top_k_per_item:
|
| 676 |
full_path, line = _resolve_definition_location(relative_path, match_name)
|
| 677 |
entry_all["embedding"].append(
|
|
|
|
| 682 |
"score": score,
|
| 683 |
"full_path": full_path,
|
| 684 |
"line": line,
|
| 685 |
+
"is_identical": is_identical,
|
| 686 |
}
|
| 687 |
)
|
| 688 |
+
if exclude_identical and is_identical:
|
| 689 |
+
identical_filtered += 1
|
| 690 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
full_path, line = _resolve_definition_location(relative_path, match_name)
|
| 692 |
entry["embedding"].append(
|
| 693 |
{
|
|
|
|
| 697 |
"score": score,
|
| 698 |
"full_path": full_path,
|
| 699 |
"line": line,
|
| 700 |
+
"is_identical": is_identical,
|
| 701 |
}
|
| 702 |
)
|
| 703 |
if len(entry["embedding"]) >= top_k_per_item and len(entry_all["embedding"]) >= top_k_per_item:
|
|
|
|
| 716 |
by_class: dict[str, dict[tuple[str, str], dict[str, object]]] = {}
|
| 717 |
for query_identifier, entry in result_map.items():
|
| 718 |
kind = entry.get("kind", "function")
|
| 719 |
+
if kind == "function":
|
| 720 |
+
matches = entry.get("embedding", [])
|
| 721 |
+
if not matches:
|
| 722 |
+
continue
|
| 723 |
+
best_per_cand: dict[tuple[str, str], dict[str, object]] = {}
|
| 724 |
+
for match in matches:
|
| 725 |
+
rel = match.get("relative_path")
|
| 726 |
+
mname = match.get("match_name")
|
| 727 |
+
score = match.get("score")
|
| 728 |
+
if rel is None or not mname or score is None:
|
| 729 |
+
continue
|
| 730 |
+
if "." in mname:
|
| 731 |
+
continue
|
| 732 |
+
ckey = (rel, mname)
|
| 733 |
+
prev = best_per_cand.get(ckey)
|
| 734 |
+
if prev is None or float(score) > float(prev.get("score", -1.0)):
|
| 735 |
+
best_per_cand[ckey] = match
|
| 736 |
+
for ckey, match in best_per_cand.items():
|
| 737 |
+
slot = by_class.setdefault(query_identifier, {}).setdefault(
|
| 738 |
+
ckey,
|
| 739 |
+
{
|
| 740 |
+
"relative_path": ckey[0],
|
| 741 |
+
"class_name": ckey[1],
|
| 742 |
+
"scores": [],
|
| 743 |
+
"contributors": [],
|
| 744 |
+
},
|
| 745 |
+
)
|
| 746 |
+
slot["scores"].append(float(match["score"]))
|
| 747 |
+
slot["contributors"].append(
|
| 748 |
+
{"query": query_identifier, "match": match["identifier"], "score": float(match["score"])}
|
| 749 |
+
)
|
| 750 |
+
continue
|
| 751 |
qcls = query_class_key(query_identifier, kind)
|
| 752 |
matches = entry.get("embedding", [])
|
| 753 |
if not matches:
|
|
|
|
| 783 |
by_class_out: dict[str, list[dict[str, object]]] = {}
|
| 784 |
for qcls, cand_map in by_class.items():
|
| 785 |
q_method_count = len(
|
| 786 |
+
[
|
| 787 |
+
key
|
| 788 |
+
for key, kind in definitions_kind.items()
|
| 789 |
+
if kind == "method"
|
| 790 |
+
and key.startswith(f"{qcls}.")
|
| 791 |
+
and key.split(".")[-1] not in BOILERPLATE_NAMES
|
| 792 |
+
]
|
| 793 |
)
|
| 794 |
q_method_count = max(1, q_method_count)
|
| 795 |
rows = []
|
| 796 |
for _, slot in cand_map.items():
|
| 797 |
+
filtered_contributors = [
|
| 798 |
+
item
|
| 799 |
+
for item in slot["contributors"]
|
| 800 |
+
if str(item.get("query", "")).split(".")[-1] not in BOILERPLATE_NAMES
|
| 801 |
+
]
|
| 802 |
+
base_score, coverage_count = _calculate_reconstruction_score(
|
| 803 |
+
filtered_contributors, q_method_count
|
| 804 |
+
)
|
| 805 |
coverage_ratio = coverage_count / float(q_method_count)
|
|
|
|
| 806 |
contributors = sorted(slot["contributors"], key=lambda x: float(x["score"]), reverse=True)[:5]
|
| 807 |
rows.append(
|
| 808 |
{
|
app/main.py
CHANGED
|
@@ -157,12 +157,26 @@ def _get_structural_flow(node: ast.AST) -> str:
|
|
| 157 |
flow: list[str] = []
|
| 158 |
for child in ast.walk(node):
|
| 159 |
if isinstance(child, ast.Call):
|
| 160 |
-
name =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
if name:
|
| 162 |
flow.append(name)
|
|
|
|
|
|
|
|
|
|
| 163 |
elif isinstance(child, (ast.If, ast.While, ast.For)):
|
| 164 |
-
flow.append(
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
|
| 168 |
def _extract_ast(source: str, symbol: str) -> tuple[str | None, dict[str, object] | None]:
|
|
|
|
| 157 |
flow: list[str] = []
|
| 158 |
for child in ast.walk(node):
|
| 159 |
if isinstance(child, ast.Call):
|
| 160 |
+
name = None
|
| 161 |
+
if isinstance(child.func, ast.Attribute) and isinstance(child.func.value, ast.Name):
|
| 162 |
+
if child.func.value.id == "self":
|
| 163 |
+
name = f"self.{child.func.attr}"
|
| 164 |
+
if name is None:
|
| 165 |
+
name = _call_name(child.func)
|
| 166 |
if name:
|
| 167 |
flow.append(name)
|
| 168 |
+
elif isinstance(child, ast.Attribute):
|
| 169 |
+
if isinstance(child.value, ast.Name) and child.value.id == "self":
|
| 170 |
+
flow.append(f"self.{child.attr}")
|
| 171 |
elif isinstance(child, (ast.If, ast.While, ast.For)):
|
| 172 |
+
flow.append("[LOGIC]")
|
| 173 |
+
elif isinstance(child, ast.Return):
|
| 174 |
+
flow.append("Return")
|
| 175 |
+
reduced: list[str] = []
|
| 176 |
+
for item in flow:
|
| 177 |
+
if not reduced or reduced[-1] != item:
|
| 178 |
+
reduced.append(item)
|
| 179 |
+
return " -> ".join(reduced[:20])
|
| 180 |
|
| 181 |
|
| 182 |
def _extract_ast(source: str, symbol: str) -> tuple[str | None, dict[str, object] | None]:
|
static/app.js
CHANGED
|
@@ -162,6 +162,63 @@ function formatSummary(summary) {
|
|
| 162 |
return parts.join(" · ") || "No structural summary.";
|
| 163 |
}
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
function setAst(queryAst, matchAst, querySummary, matchSummary) {
|
| 166 |
if (astQueryEl) {
|
| 167 |
astQueryEl.textContent = queryAst || "AST not found.";
|
|
@@ -180,6 +237,26 @@ function setAst(queryAst, matchAst, querySummary, matchSummary) {
|
|
| 180 |
const matchFlow = matchSummary?.flow ? matchSummary.flow : "unavailable";
|
| 181 |
flowComparisonEl.textContent = `Selected flow:\n${queryFlow}\n\nMatch flow:\n${matchFlow}`;
|
| 182 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
}
|
| 184 |
|
| 185 |
async function loadAst(symbol, matchIdentifier) {
|
|
@@ -287,7 +364,7 @@ function renderBlueprint(byClass) {
|
|
| 287 |
item.addEventListener("click", () => {
|
| 288 |
document.querySelectorAll(".blueprint-item").forEach((el) => el.classList.remove("is-active"));
|
| 289 |
item.classList.add("is-active");
|
| 290 |
-
const matchIdentifier = top.
|
| 291 |
const activeName = document.getElementById("activeModuleName");
|
| 292 |
if (activeName) {
|
| 293 |
activeName.textContent = `${qcls} vs ${top.class_name}`;
|
|
|
|
| 162 |
return parts.join(" · ") || "No structural summary.";
|
| 163 |
}
|
| 164 |
|
| 165 |
+
/**
 * Escape text for safe insertion into HTML markup.
 *
 * Replaces the five HTML-significant characters (& < > " ') with their
 * entity forms. The ampersand is handled first so entities produced by the
 * later replacements are not escaped a second time.
 *
 * @param {*} text Value to escape; falsy values yield an empty string.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
  if (!text) return "";
  // Coerce so a truthy non-string (e.g. a number) does not throw on .replace.
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#039;");
}
|
| 174 |
+
|
| 175 |
+
/**
 * Line-based diff of two texts using a longest-common-subsequence table.
 *
 * @param {string} text1 Original text; a falsy value is treated as no lines.
 * @param {string} text2 Updated text; a falsy value is treated as no lines.
 * @returns {Array<{type: string, text: string}>} Rows in document order, each
 *   tagged "same", "add" (only in text2) or "del" (only in text1).
 */
function diffLCS(text1, text2) {
  const before = text1 ? text1.split("\n") : [];
  const after = text2 ? text2.split("\n") : [];
  const rowCount = before.length;
  const colCount = after.length;

  // table[r][c] holds the LCS length of before[0..r) and after[0..c).
  const table = [];
  for (let r = 0; r <= rowCount; r += 1) {
    table.push(new Array(colCount + 1).fill(0));
  }
  for (let r = 1; r <= rowCount; r += 1) {
    for (let c = 1; c <= colCount; c += 1) {
      table[r][c] =
        before[r - 1] === after[c - 1]
          ? table[r - 1][c - 1] + 1
          : Math.max(table[r - 1][c], table[r][c - 1]);
    }
  }

  // Walk back from the bottom-right corner; rows are gathered last-to-first
  // and flipped once at the end.
  const gathered = [];
  let row = rowCount;
  let col = colCount;
  while (row > 0 || col > 0) {
    const linesMatch = row > 0 && col > 0 && before[row - 1] === after[col - 1];
    if (linesMatch) {
      gathered.push({ type: "same", text: before[row - 1] });
      row -= 1;
      col -= 1;
      continue;
    }
    // On a tie the addition is consumed first while walking backwards.
    const takeAddition = col > 0 && (row === 0 || table[row][col - 1] >= table[row - 1][col]);
    if (takeAddition) {
      gathered.push({ type: "add", text: after[col - 1] });
      col -= 1;
    } else {
      gathered.push({ type: "del", text: before[row - 1] });
      row -= 1;
    }
  }
  return gathered.reverse();
}
|
| 210 |
+
|
| 211 |
+
/**
 * Render a line diff of two texts as an HTML string.
 *
 * Each row from diffLCS becomes a `.diff-row` element, with `diff-add` or
 * `diff-del` appended for added/removed lines, wrapping a `.code-line`
 * element that holds the escaped line text. Empty lines are rendered as a
 * single space.
 *
 * @param {string} text1 Original text.
 * @param {string} text2 Updated text.
 * @returns {string} Concatenated row markup.
 */
function renderDiff(text1, text2) {
  const rows = [];
  for (const entry of diffLCS(text1, text2)) {
    let modifier = "";
    if (entry.type === "add") {
      modifier = "diff-add";
    } else if (entry.type === "del") {
      modifier = "diff-del";
    }
    const safe = escapeHtml(entry.text || " ");
    rows.push(`<div class="diff-row ${modifier}"><div class="code-line">${safe}</div></div>`);
  }
  return rows.join("");
}
|
| 221 |
+
|
| 222 |
function setAst(queryAst, matchAst, querySummary, matchSummary) {
|
| 223 |
if (astQueryEl) {
|
| 224 |
astQueryEl.textContent = queryAst || "AST not found.";
|
|
|
|
| 237 |
const matchFlow = matchSummary?.flow ? matchSummary.flow : "unavailable";
|
| 238 |
flowComparisonEl.textContent = `Selected flow:\n${queryFlow}\n\nMatch flow:\n${matchFlow}`;
|
| 239 |
}
|
| 240 |
+
|
| 241 |
+
const diffContainer = document.querySelector(".code-diff-view");
|
| 242 |
+
if (diffContainer) {
|
| 243 |
+
diffContainer.innerHTML = "";
|
| 244 |
+
if (queryAst && matchAst) {
|
| 245 |
+
const diffWrapper = document.createElement("div");
|
| 246 |
+
diffWrapper.className = "diff-wrapper";
|
| 247 |
+
diffWrapper.innerHTML = renderDiff(queryAst, matchAst);
|
| 248 |
+
diffContainer.appendChild(diffWrapper);
|
| 249 |
+
} else {
|
| 250 |
+
const left = document.createElement("pre");
|
| 251 |
+
left.className = "code-block";
|
| 252 |
+
left.textContent = queryAst || "";
|
| 253 |
+
const right = document.createElement("pre");
|
| 254 |
+
right.className = "code-block";
|
| 255 |
+
right.textContent = matchAst || "";
|
| 256 |
+
diffContainer.appendChild(left);
|
| 257 |
+
diffContainer.appendChild(right);
|
| 258 |
+
}
|
| 259 |
+
}
|
| 260 |
}
|
| 261 |
|
| 262 |
async function loadAst(symbol, matchIdentifier) {
|
|
|
|
| 364 |
item.addEventListener("click", () => {
|
| 365 |
document.querySelectorAll(".blueprint-item").forEach((el) => el.classList.remove("is-active"));
|
| 366 |
item.classList.add("is-active");
|
| 367 |
+
const matchIdentifier = top.identifier;
|
| 368 |
const activeName = document.getElementById("activeModuleName");
|
| 369 |
if (activeName) {
|
| 370 |
activeName.textContent = `${qcls} vs ${top.class_name}`;
|
static/styles.css
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap");
|
| 2 |
|
| 3 |
:root {
|
| 4 |
-
--bg: #
|
| 5 |
-
--panel: #
|
| 6 |
-
--ink: #
|
| 7 |
-
--muted: #
|
| 8 |
-
--accent: #
|
| 9 |
-
--accent-2: #
|
| 10 |
-
--accent-3: #
|
| 11 |
-
--shadow: rgba(
|
| 12 |
}
|
| 13 |
|
| 14 |
* {
|
|
@@ -19,9 +19,9 @@ body {
|
|
| 19 |
margin: 0;
|
| 20 |
font-family: "Space Grotesk", system-ui, sans-serif;
|
| 21 |
color: var(--ink);
|
| 22 |
-
background: radial-gradient(circle at
|
| 23 |
-
radial-gradient(circle at
|
| 24 |
-
radial-gradient(circle at 70% 80%, #
|
| 25 |
var(--bg);
|
| 26 |
}
|
| 27 |
|
|
@@ -341,7 +341,7 @@ textarea {
|
|
| 341 |
|
| 342 |
.code-diff-view {
|
| 343 |
display: grid;
|
| 344 |
-
grid-template-columns:
|
| 345 |
gap: 16px;
|
| 346 |
}
|
| 347 |
|
|
@@ -359,6 +359,37 @@ textarea {
|
|
| 359 |
white-space: pre-wrap;
|
| 360 |
}
|
| 361 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
.structural-notes {
|
| 363 |
display: grid;
|
| 364 |
grid-template-columns: repeat(2, minmax(0, 1fr));
|
|
|
|
| 1 |
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap");
|
| 2 |
|
| 3 |
:root {
|
| 4 |
+
--bg: #f4f4f2;
|
| 5 |
+
--panel: #f9f9f7;
|
| 6 |
+
--ink: #1f1f1c;
|
| 7 |
+
--muted: #6f6b66;
|
| 8 |
+
--accent: #8d7b6e;
|
| 9 |
+
--accent-2: #8791a3;
|
| 10 |
+
--accent-3: #7e9a8b;
|
| 11 |
+
--shadow: rgba(31, 31, 28, 0.08);
|
| 12 |
}
|
| 13 |
|
| 14 |
* {
|
|
|
|
| 19 |
margin: 0;
|
| 20 |
font-family: "Space Grotesk", system-ui, sans-serif;
|
| 21 |
color: var(--ink);
|
| 22 |
+
background: radial-gradient(circle at 15% 20%, #e9e5df 0%, transparent 55%),
|
| 23 |
+
radial-gradient(circle at 80% 10%, #ece7e1 0%, transparent 40%),
|
| 24 |
+
radial-gradient(circle at 70% 80%, #e6e2dc 0%, transparent 45%),
|
| 25 |
var(--bg);
|
| 26 |
}
|
| 27 |
|
|
|
|
| 341 |
|
| 342 |
.code-diff-view {
|
| 343 |
display: grid;
|
| 344 |
+
grid-template-columns: 1fr;
|
| 345 |
gap: 16px;
|
| 346 |
}
|
| 347 |
|
|
|
|
| 359 |
white-space: pre-wrap;
|
| 360 |
}
|
| 361 |
|
| 362 |
+
.diff-wrapper {
|
| 363 |
+
display: flex;
|
| 364 |
+
flex-direction: column;
|
| 365 |
+
background: #fff;
|
| 366 |
+
border: 1px solid #e3d6c8;
|
| 367 |
+
border-radius: 12px;
|
| 368 |
+
overflow: hidden;
|
| 369 |
+
font-family: "Space Grotesk", monospace;
|
| 370 |
+
font-size: 12px;
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
.diff-row {
|
| 374 |
+
display: block;
|
| 375 |
+
border-bottom: 1px solid #f0e8dc;
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
.code-line {
|
| 379 |
+
padding: 2px 8px;
|
| 380 |
+
white-space: pre;
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
.diff-del {
|
| 384 |
+
background-color: #ffe6e6;
|
| 385 |
+
text-decoration: line-through;
|
| 386 |
+
opacity: 0.7;
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
.diff-add {
|
| 390 |
+
background-color: #e6ffec;
|
| 391 |
+
}
|
| 392 |
+
|
| 393 |
.structural-notes {
|
| 394 |
display: grid;
|
| 395 |
grid-template-columns: repeat(2, minmax(0, 1fr));
|