Molbap HF Staff committed on
Commit
4fe7080
·
verified ·
1 Parent(s): 3ab7d12

Update app with better diff, new style

Browse files
app/__pycache__/detector.cpython-312.pyc CHANGED
Binary files a/app/__pycache__/detector.cpython-312.pyc and b/app/__pycache__/detector.cpython-312.pyc differ
 
app/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-312.pyc and b/app/__pycache__/main.cpython-312.pyc differ
 
app/detector.py CHANGED
@@ -3,7 +3,7 @@ import json
3
  import math
4
  import os
5
  import re
6
- from typing import Callable
7
  from functools import cache
8
  from pathlib import Path
9
 
@@ -15,7 +15,18 @@ from transformers import AutoModel, AutoTokenizer
15
 
16
  import transformers
17
 
18
- MODELS_ROOT = Path(transformers.__file__).resolve().parent / "models"
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"
21
  BATCH_SIZE = 16
@@ -24,28 +35,46 @@ HUB_DATASET_DEFAULT = "Molbap/modular-detector-embeddings"
24
  BOILERPLATE_NAMES = {
25
  "__init__",
26
  "_init_weights",
 
 
27
  "get_input_embeddings",
28
  "set_input_embeddings",
 
 
 
 
 
 
 
 
29
  }
30
 
 
 
 
 
 
 
 
31
 
32
  def _sanitize_for_embedding(code: str, model_hint: str | None, symbol_hint: str | None) -> str:
33
  code = _strip_docstrings(code)
 
34
  base = "\n".join(
35
- line for line in re.sub(r"#.*", "", code).splitlines() if not re.match(r"\s*(from|import)\s+", line)
36
  )
37
  variants = set()
38
  if model_hint:
39
  variants.add(model_hint)
40
  variants.add(model_hint.replace("_", ""))
41
- variants.add(re.sub(r"\d+", "", model_hint))
42
  if symbol_hint:
43
- match = re.match(r"^([A-Z][a-z0-9]+)", symbol_hint) or re.match(r"^([A-Za-z0-9]+)", symbol_hint)
44
  prefix = match.group(1) if match else ""
45
  if prefix:
46
  variants.add(prefix)
47
  variants.add(prefix.replace("_", ""))
48
- variants.add(re.sub(r"\d+", "", prefix))
49
  variants |= {variant.lower() for variant in list(variants)}
50
  sanitized = base
51
  for variant in sorted({x for x in variants if len(x) >= 3}, key=len, reverse=True):
@@ -54,11 +83,11 @@ def _sanitize_for_embedding(code: str, model_hint: str | None, symbol_hint: str
54
 
55
 
56
  def _normalize(value: str | None) -> str:
57
- return re.sub(r"[^a-z0-9]+", "", value.lower()) if value else ""
58
 
59
 
60
  def _leading_prefix(name: str) -> str:
61
- match = re.match(r"^([A-Z][a-z0-9]+)", name) or re.match(r"^([A-Za-z0-9]+)", name)
62
  return match.group(1) if match else ""
63
 
64
 
@@ -95,11 +124,22 @@ def _infer_model_prefixes(definitions_kind: dict[str, str]) -> set[str]:
95
  return prefixes
96
 
97
 
98
- def _calculate_reconstruction_score(scores: list[float], coverage_ratio: float) -> float:
99
- if not scores:
100
- return 0.0
101
- avg_sim = float(sum(scores)) / float(len(scores))
102
- return avg_sim * (coverage_ratio**0.5)
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  def _normalize_source_path(path: str | None) -> str | None:
@@ -144,6 +184,43 @@ def _strip_docstrings(source: str) -> str:
144
  return source
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  def _normalize_code_for_compare(source: str) -> str:
148
  stripped = _strip_docstrings(source)
149
  return "".join(line.strip() for line in stripped.splitlines() if line.strip())
@@ -449,6 +526,8 @@ class CodeSimilarityAnalyzer:
449
  definitions_kind: dict[str, str] = {}
450
  lines = code.splitlines()
451
  tree = ast.parse(code)
 
 
452
  for node in ast.iter_child_nodes(tree):
453
  if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and granularity in ("definition", "method"):
454
  segment = ast.get_source_segment(code, node)
@@ -460,9 +539,8 @@ class CodeSimilarityAnalyzer:
460
  continue
461
  identifier = node.name
462
  definitions_raw[identifier] = segment
463
- sanitized = _sanitize_for_embedding(segment, model_hint, node.name)
464
- definitions_sanitized[identifier] = sanitized
465
  definitions_kind[identifier] = "function"
 
466
  continue
467
 
468
  if isinstance(node, ast.ClassDef):
@@ -471,19 +549,14 @@ class CodeSimilarityAnalyzer:
471
  start = max(0, node.lineno - 1)
472
  end = node.end_lineno
473
  class_segment = "\n".join(lines[start:end])
474
- class_header = ""
475
- if class_segment:
476
- class_header = class_segment.splitlines()[0].strip()
477
- class_context = class_header
478
 
479
  if granularity == "definition":
480
- if not class_segment:
481
- continue
482
  identifier = node.name
483
  definitions_raw[identifier] = class_segment
484
- sanitized = _sanitize_for_embedding(class_segment, model_hint, node.name)
485
- definitions_sanitized[identifier] = sanitized
486
  definitions_kind[identifier] = "class"
 
487
  continue
488
 
489
  for child in node.body:
@@ -497,12 +570,24 @@ class CodeSimilarityAnalyzer:
497
  if not segment:
498
  continue
499
  method_name = child.name
500
- combined = f"{class_context}\n{segment}" if class_context else segment
501
  identifier = f"{node.name}.{method_name}"
502
  definitions_raw[identifier] = segment
503
- sanitized = _sanitize_for_embedding(combined, model_hint, node.name)
504
- definitions_sanitized[identifier] = sanitized
505
  definitions_kind[identifier] = "method"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
  return definitions_raw, definitions_sanitized, definitions_kind
507
 
508
  def analyze_code(
@@ -578,6 +663,15 @@ class CodeSimilarityAnalyzer:
578
  }
579
  for identifier, score in candidates:
580
  relative_path, match_name = identifier.split(":", 1)
 
 
 
 
 
 
 
 
 
581
  if len(entry_all["embedding"]) < top_k_per_item:
582
  full_path, line = _resolve_definition_location(relative_path, match_name)
583
  entry_all["embedding"].append(
@@ -588,16 +682,12 @@ class CodeSimilarityAnalyzer:
588
  "score": score,
589
  "full_path": full_path,
590
  "line": line,
 
591
  }
592
  )
593
- if exclude_identical:
594
- match_segment = _get_definition_segment(relative_path, match_name)
595
- if match_segment is not None:
596
- match_norm = _normalize_code_for_compare(match_segment)
597
- query_norm = query_compare.get(query_identifier)
598
- if query_norm and match_norm == query_norm:
599
- identical_filtered += 1
600
- continue
601
  full_path, line = _resolve_definition_location(relative_path, match_name)
602
  entry["embedding"].append(
603
  {
@@ -607,6 +697,7 @@ class CodeSimilarityAnalyzer:
607
  "score": score,
608
  "full_path": full_path,
609
  "line": line,
 
610
  }
611
  )
612
  if len(entry["embedding"]) >= top_k_per_item and len(entry_all["embedding"]) >= top_k_per_item:
@@ -625,6 +716,38 @@ class CodeSimilarityAnalyzer:
625
  by_class: dict[str, dict[tuple[str, str], dict[str, object]]] = {}
626
  for query_identifier, entry in result_map.items():
627
  kind = entry.get("kind", "function")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628
  qcls = query_class_key(query_identifier, kind)
629
  matches = entry.get("embedding", [])
630
  if not matches:
@@ -660,15 +783,26 @@ class CodeSimilarityAnalyzer:
660
  by_class_out: dict[str, list[dict[str, object]]] = {}
661
  for qcls, cand_map in by_class.items():
662
  q_method_count = len(
663
- [key for key, kind in definitions_kind.items() if kind == "method" and key.startswith(f"{qcls}.")]
 
 
 
 
 
 
664
  )
665
  q_method_count = max(1, q_method_count)
666
  rows = []
667
  for _, slot in cand_map.items():
668
- scores = sorted(slot["scores"], reverse=True)
669
- coverage_count = len(scores)
 
 
 
 
 
 
670
  coverage_ratio = coverage_count / float(q_method_count)
671
- base_score = _calculate_reconstruction_score(scores, coverage_ratio)
672
  contributors = sorted(slot["contributors"], key=lambda x: float(x["score"]), reverse=True)[:5]
673
  rows.append(
674
  {
 
3
  import math
4
  import os
5
  import re
6
+ from typing import Callable, Iterable
7
  from functools import cache
8
  from pathlib import Path
9
 
 
15
 
16
  import transformers
17
 
18
# Location of the transformers `models` package. A source checkout can be
# pointed to via the TRANSFORMERS_REPO environment variable; otherwise we
# fall back to the installed library's own `models` directory.
_LIB_PATH = Path(transformers.__file__).resolve().parent
_ENV_REPO = os.getenv("TRANSFORMERS_REPO")
MODELS_ROOT = _LIB_PATH / "models"
if _ENV_REPO:
    _env_path = Path(_ENV_REPO)
    # Prefer the canonical source layout, then a bare `models` directory.
    _candidate = _env_path / "src" / "transformers" / "models"
    if _candidate.exists():
        MODELS_ROOT = _candidate
    else:
        _fallback = _env_path / "models"
        if _fallback.exists():
            MODELS_ROOT = _fallback
30
 
31
  EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"
32
  BATCH_SIZE = 16
 
35
# Method names that carry no model-specific signal (framework boilerplate);
# they are filtered out when scoring class-reconstruction coverage.
BOILERPLATE_NAMES = {
    "__init__",
    "_init_weights",
    "__repr__",
    "extra_repr",
    "get_input_embeddings",
    "set_input_embeddings",
    "get_output_embeddings",
    "set_output_embeddings",
    "tie_weights",
    "post_init",
    "forward",
    "init_weights",
    "reset_parameters",
    "training",
}
51
 
52
# Patterns compiled once at module level so the sanitization hot paths do not
# recompile them on every call.
_RE_COMMENT = re.compile(r"#.*")  # a '#' comment through end of line
_RE_IMPORT = re.compile(r"\s*(from|import)\s+")  # an import statement line
_RE_MODEL_HINT = re.compile(r"\d+")  # digits dropped from model hints
_RE_LEADING_PREFIX = re.compile(r"^([A-Z][a-z0-9]+)")  # leading CamelCase word
_RE_ALPHANUM = re.compile(r"^([A-Za-z0-9]+)")  # fallback: leading alphanumeric run
_RE_NORMALIZE = re.compile(r"[^a-z0-9]+")  # chars removed by _normalize
58
+
59
 
60
  def _sanitize_for_embedding(code: str, model_hint: str | None, symbol_hint: str | None) -> str:
61
  code = _strip_docstrings(code)
62
+ cleaned = _RE_COMMENT.sub("", code)
63
  base = "\n".join(
64
+ line for line in cleaned.splitlines() if line.strip() and not _RE_IMPORT.match(line)
65
  )
66
  variants = set()
67
  if model_hint:
68
  variants.add(model_hint)
69
  variants.add(model_hint.replace("_", ""))
70
+ variants.add(_RE_MODEL_HINT.sub("", model_hint))
71
  if symbol_hint:
72
+ match = _RE_LEADING_PREFIX.match(symbol_hint) or _RE_ALPHANUM.match(symbol_hint)
73
  prefix = match.group(1) if match else ""
74
  if prefix:
75
  variants.add(prefix)
76
  variants.add(prefix.replace("_", ""))
77
+ variants.add(_RE_MODEL_HINT.sub("", prefix))
78
  variants |= {variant.lower() for variant in list(variants)}
79
  sanitized = base
80
  for variant in sorted({x for x in variants if len(x) >= 3}, key=len, reverse=True):
 
83
 
84
 
85
def _normalize(value: str | None) -> str:
    """Lowercase *value* and strip every non-alphanumeric character."""
    if not value:
        return ""
    return _RE_NORMALIZE.sub("", value.lower())
87
 
88
 
89
def _leading_prefix(name: str) -> str:
    """Return the leading CamelCase word of *name*, falling back to the
    leading alphanumeric run; empty string when neither pattern matches."""
    for pattern in (_RE_LEADING_PREFIX, _RE_ALPHANUM):
        found = pattern.match(name)
        if found:
            return found.group(1)
    return ""
92
 
93
 
 
124
  return prefixes
125
 
126
 
127
+ def _calculate_reconstruction_score(
128
+ contributors: Iterable[dict[str, object]],
129
+ query_method_count: int,
130
+ ) -> tuple[float, int]:
131
+ if query_method_count <= 0:
132
+ return 0.0, 0
133
+ best_scores: dict[str, float] = {}
134
+ for contributor in contributors:
135
+ query_name = contributor.get("query")
136
+ if not query_name:
137
+ continue
138
+ score = float(contributor.get("score", 0.0))
139
+ if score > best_scores.get(str(query_name), 0.0):
140
+ best_scores[str(query_name)] = score
141
+ total_similarity = sum(best_scores.values())
142
+ return total_similarity / float(query_method_count), len(best_scores)
143
 
144
 
145
  def _normalize_source_path(path: str | None) -> str | None:
 
184
  return source
185
 
186
 
187
+ def _strip_docstrings_in_tree(tree: ast.AST) -> None:
188
+ def strip_in_body(body: list[ast.stmt]) -> None:
189
+ if not body:
190
+ return
191
+ first = body[0]
192
+ if isinstance(first, ast.Expr) and isinstance(getattr(first, "value", None), ast.Constant):
193
+ if isinstance(first.value.value, str):
194
+ body.pop(0)
195
+
196
+ strip_in_body(tree.body)
197
+ for node in ast.walk(tree):
198
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
199
+ strip_in_body(node.body)
200
+
201
+
202
def _sanitize_unparsed_code(code: str, model_hint: str | None, symbol_hint: str | None) -> str:
    """Strip comments, imports, and blank lines from *code*, then replace
    model-identifying name hints with the neutral token ``Model``
    (case-insensitively)."""
    without_comments = _RE_COMMENT.sub("", code)
    kept_lines = [
        line
        for line in without_comments.splitlines()
        if line.strip() and not _RE_IMPORT.match(line)
    ]
    base = "\n".join(kept_lines)

    hints: set[str] = set()

    def add_hint_forms(token: str) -> None:
        # Record the token plus its underscore-free and digit-free variants.
        hints.add(token)
        hints.add(token.replace("_", ""))
        hints.add(_RE_MODEL_HINT.sub("", token))

    if model_hint:
        add_hint_forms(model_hint)
    if symbol_hint:
        head = _RE_LEADING_PREFIX.match(symbol_hint) or _RE_ALPHANUM.match(symbol_hint)
        prefix = head.group(1) if head else ""
        if prefix:
            add_hint_forms(prefix)

    # Very short hints would over-match; drop them before substitution.
    hints = {hint for hint in hints if len(hint) >= 3}
    if hints:
        # Longest-first alternation so longer hints win over their substrings.
        ordered = sorted(hints, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(hint) for hint in ordered), re.IGNORECASE)
        base = pattern.sub("Model", base)
    return base
222
+
223
+
224
  def _normalize_code_for_compare(source: str) -> str:
225
  stripped = _strip_docstrings(source)
226
  return "".join(line.strip() for line in stripped.splitlines() if line.strip())
 
526
  definitions_kind: dict[str, str] = {}
527
  lines = code.splitlines()
528
  tree = ast.parse(code)
529
+ entries: list[tuple[str, str, ast.AST, ast.ClassDef | None]] = []
530
+
531
  for node in ast.iter_child_nodes(tree):
532
  if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and granularity in ("definition", "method"):
533
  segment = ast.get_source_segment(code, node)
 
539
  continue
540
  identifier = node.name
541
  definitions_raw[identifier] = segment
 
 
542
  definitions_kind[identifier] = "function"
543
+ entries.append((identifier, "function", node, None))
544
  continue
545
 
546
  if isinstance(node, ast.ClassDef):
 
549
  start = max(0, node.lineno - 1)
550
  end = node.end_lineno
551
  class_segment = "\n".join(lines[start:end])
552
+ if not class_segment:
553
+ continue
 
 
554
 
555
  if granularity == "definition":
 
 
556
  identifier = node.name
557
  definitions_raw[identifier] = class_segment
 
 
558
  definitions_kind[identifier] = "class"
559
+ entries.append((identifier, "class", node, None))
560
  continue
561
 
562
  for child in node.body:
 
570
  if not segment:
571
  continue
572
  method_name = child.name
 
573
  identifier = f"{node.name}.{method_name}"
574
  definitions_raw[identifier] = segment
 
 
575
  definitions_kind[identifier] = "method"
576
+ entries.append((identifier, "method", child, node))
577
+
578
+ _strip_docstrings_in_tree(tree)
579
+ for identifier, kind, node, parent in entries:
580
+ try:
581
+ if kind == "method" and parent is not None:
582
+ parent_header = ast.unparse(parent).splitlines()[0]
583
+ combined = f"{parent_header}\n{ast.unparse(node)}"
584
+ sanitized = _sanitize_unparsed_code(combined, model_hint, parent.name)
585
+ else:
586
+ sanitized = _sanitize_unparsed_code(ast.unparse(node), model_hint, identifier)
587
+ except Exception:
588
+ sanitized = definitions_raw.get(identifier, "")
589
+ definitions_sanitized[identifier] = sanitized
590
+
591
  return definitions_raw, definitions_sanitized, definitions_kind
592
 
593
  def analyze_code(
 
663
  }
664
  for identifier, score in candidates:
665
  relative_path, match_name = identifier.split(":", 1)
666
+ is_identical = False
667
+ match_segment = None
668
+ if exclude_identical or query_compare:
669
+ match_segment = _get_definition_segment(relative_path, match_name)
670
+ if match_segment is not None:
671
+ match_norm = _normalize_code_for_compare(match_segment)
672
+ query_norm = query_compare.get(query_identifier)
673
+ if query_norm and match_norm == query_norm:
674
+ is_identical = True
675
  if len(entry_all["embedding"]) < top_k_per_item:
676
  full_path, line = _resolve_definition_location(relative_path, match_name)
677
  entry_all["embedding"].append(
 
682
  "score": score,
683
  "full_path": full_path,
684
  "line": line,
685
+ "is_identical": is_identical,
686
  }
687
  )
688
+ if exclude_identical and is_identical:
689
+ identical_filtered += 1
690
+ continue
 
 
 
 
 
691
  full_path, line = _resolve_definition_location(relative_path, match_name)
692
  entry["embedding"].append(
693
  {
 
697
  "score": score,
698
  "full_path": full_path,
699
  "line": line,
700
+ "is_identical": is_identical,
701
  }
702
  )
703
  if len(entry["embedding"]) >= top_k_per_item and len(entry_all["embedding"]) >= top_k_per_item:
 
716
  by_class: dict[str, dict[tuple[str, str], dict[str, object]]] = {}
717
  for query_identifier, entry in result_map.items():
718
  kind = entry.get("kind", "function")
719
+ if kind == "function":
720
+ matches = entry.get("embedding", [])
721
+ if not matches:
722
+ continue
723
+ best_per_cand: dict[tuple[str, str], dict[str, object]] = {}
724
+ for match in matches:
725
+ rel = match.get("relative_path")
726
+ mname = match.get("match_name")
727
+ score = match.get("score")
728
+ if rel is None or not mname or score is None:
729
+ continue
730
+ if "." in mname:
731
+ continue
732
+ ckey = (rel, mname)
733
+ prev = best_per_cand.get(ckey)
734
+ if prev is None or float(score) > float(prev.get("score", -1.0)):
735
+ best_per_cand[ckey] = match
736
+ for ckey, match in best_per_cand.items():
737
+ slot = by_class.setdefault(query_identifier, {}).setdefault(
738
+ ckey,
739
+ {
740
+ "relative_path": ckey[0],
741
+ "class_name": ckey[1],
742
+ "scores": [],
743
+ "contributors": [],
744
+ },
745
+ )
746
+ slot["scores"].append(float(match["score"]))
747
+ slot["contributors"].append(
748
+ {"query": query_identifier, "match": match["identifier"], "score": float(match["score"])}
749
+ )
750
+ continue
751
  qcls = query_class_key(query_identifier, kind)
752
  matches = entry.get("embedding", [])
753
  if not matches:
 
783
  by_class_out: dict[str, list[dict[str, object]]] = {}
784
  for qcls, cand_map in by_class.items():
785
  q_method_count = len(
786
+ [
787
+ key
788
+ for key, kind in definitions_kind.items()
789
+ if kind == "method"
790
+ and key.startswith(f"{qcls}.")
791
+ and key.split(".")[-1] not in BOILERPLATE_NAMES
792
+ ]
793
  )
794
  q_method_count = max(1, q_method_count)
795
  rows = []
796
  for _, slot in cand_map.items():
797
+ filtered_contributors = [
798
+ item
799
+ for item in slot["contributors"]
800
+ if str(item.get("query", "")).split(".")[-1] not in BOILERPLATE_NAMES
801
+ ]
802
+ base_score, coverage_count = _calculate_reconstruction_score(
803
+ filtered_contributors, q_method_count
804
+ )
805
  coverage_ratio = coverage_count / float(q_method_count)
 
806
  contributors = sorted(slot["contributors"], key=lambda x: float(x["score"]), reverse=True)[:5]
807
  rows.append(
808
  {
app/main.py CHANGED
@@ -157,12 +157,26 @@ def _get_structural_flow(node: ast.AST) -> str:
157
  flow: list[str] = []
158
  for child in ast.walk(node):
159
  if isinstance(child, ast.Call):
160
- name = _call_name(child.func)
 
 
 
 
 
161
  if name:
162
  flow.append(name)
 
 
 
163
  elif isinstance(child, (ast.If, ast.While, ast.For)):
164
- flow.append(f"Control({type(child).__name__})")
165
- return " -> ".join(flow[:15])
 
 
 
 
 
 
166
 
167
 
168
  def _extract_ast(source: str, symbol: str) -> tuple[str | None, dict[str, object] | None]:
 
157
  flow: list[str] = []
158
  for child in ast.walk(node):
159
  if isinstance(child, ast.Call):
160
+ name = None
161
+ if isinstance(child.func, ast.Attribute) and isinstance(child.func.value, ast.Name):
162
+ if child.func.value.id == "self":
163
+ name = f"self.{child.func.attr}"
164
+ if name is None:
165
+ name = _call_name(child.func)
166
  if name:
167
  flow.append(name)
168
+ elif isinstance(child, ast.Attribute):
169
+ if isinstance(child.value, ast.Name) and child.value.id == "self":
170
+ flow.append(f"self.{child.attr}")
171
  elif isinstance(child, (ast.If, ast.While, ast.For)):
172
+ flow.append("[LOGIC]")
173
+ elif isinstance(child, ast.Return):
174
+ flow.append("Return")
175
+ reduced: list[str] = []
176
+ for item in flow:
177
+ if not reduced or reduced[-1] != item:
178
+ reduced.append(item)
179
+ return " -> ".join(reduced[:20])
180
 
181
 
182
  def _extract_ast(source: str, symbol: str) -> tuple[str | None, dict[str, object] | None]:
static/app.js CHANGED
@@ -162,6 +162,63 @@ function formatSummary(summary) {
162
  return parts.join(" · ") || "No structural summary.";
163
  }
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  function setAst(queryAst, matchAst, querySummary, matchSummary) {
166
  if (astQueryEl) {
167
  astQueryEl.textContent = queryAst || "AST not found.";
@@ -180,6 +237,26 @@ function setAst(queryAst, matchAst, querySummary, matchSummary) {
180
  const matchFlow = matchSummary?.flow ? matchSummary.flow : "unavailable";
181
  flowComparisonEl.textContent = `Selected flow:\n${queryFlow}\n\nMatch flow:\n${matchFlow}`;
182
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  }
184
 
185
  async function loadAst(symbol, matchIdentifier) {
@@ -287,7 +364,7 @@ function renderBlueprint(byClass) {
287
  item.addEventListener("click", () => {
288
  document.querySelectorAll(".blueprint-item").forEach((el) => el.classList.remove("is-active"));
289
  item.classList.add("is-active");
290
- const matchIdentifier = top.top_contributors?.[0]?.match || top.identifier;
291
  const activeName = document.getElementById("activeModuleName");
292
  if (activeName) {
293
  activeName.textContent = `${qcls} vs ${top.class_name}`;
 
162
  return parts.join(" · ") || "No structural summary.";
163
  }
164
 
165
+ function escapeHtml(text) {
166
+ if (!text) return "";
167
+ return text
168
+ .replace(/&/g, "&amp;")
169
+ .replace(/</g, "&lt;")
170
+ .replace(/>/g, "&gt;")
171
+ .replace(/\"/g, "&quot;")
172
+ .replace(/'/g, "&#039;");
173
+ }
174
+
175
+ function diffLCS(text1, text2) {
176
+ const lines1 = text1 ? text1.split("\n") : [];
177
+ const lines2 = text2 ? text2.split("\n") : [];
178
+ const n = lines1.length;
179
+ const m = lines2.length;
180
+ const dp = Array.from({ length: n + 1 }, () => Array(m + 1).fill(0));
181
+
182
+ for (let i = 1; i <= n; i += 1) {
183
+ for (let j = 1; j <= m; j += 1) {
184
+ if (lines1[i - 1] === lines2[j - 1]) {
185
+ dp[i][j] = dp[i - 1][j - 1] + 1;
186
+ } else {
187
+ dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
188
+ }
189
+ }
190
+ }
191
+
192
+ let i = n;
193
+ let j = m;
194
+ const result = [];
195
+ while (i > 0 || j > 0) {
196
+ if (i > 0 && j > 0 && lines1[i - 1] === lines2[j - 1]) {
197
+ result.unshift({ type: "same", text: lines1[i - 1] });
198
+ i -= 1;
199
+ j -= 1;
200
+ } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
201
+ result.unshift({ type: "add", text: lines2[j - 1] });
202
+ j -= 1;
203
+ } else {
204
+ result.unshift({ type: "del", text: lines1[i - 1] });
205
+ i -= 1;
206
+ }
207
+ }
208
+ return result;
209
+ }
210
+
211
// Render the LCS diff of two texts as HTML rows. Added lines get the
// .diff-add class, removed lines .diff-del, unchanged lines no extra class;
// row text is HTML-escaped before insertion.
function renderDiff(text1, text2) {
  const classFor = { add: "diff-add", del: "diff-del", same: "" };
  const rows = diffLCS(text1, text2).map((part) => {
    const cls = classFor[part.type] ?? "";
    // Empty lines are rendered as a single space so the row keeps its height.
    const safe = escapeHtml(part.text || " ");
    return `<div class="diff-row ${cls}"><div class="code-line">${safe}</div></div>`;
  });
  return rows.join("");
}
221
+
222
  function setAst(queryAst, matchAst, querySummary, matchSummary) {
223
  if (astQueryEl) {
224
  astQueryEl.textContent = queryAst || "AST not found.";
 
237
  const matchFlow = matchSummary?.flow ? matchSummary.flow : "unavailable";
238
  flowComparisonEl.textContent = `Selected flow:\n${queryFlow}\n\nMatch flow:\n${matchFlow}`;
239
  }
240
+
241
+ const diffContainer = document.querySelector(".code-diff-view");
242
+ if (diffContainer) {
243
+ diffContainer.innerHTML = "";
244
+ if (queryAst && matchAst) {
245
+ const diffWrapper = document.createElement("div");
246
+ diffWrapper.className = "diff-wrapper";
247
+ diffWrapper.innerHTML = renderDiff(queryAst, matchAst);
248
+ diffContainer.appendChild(diffWrapper);
249
+ } else {
250
+ const left = document.createElement("pre");
251
+ left.className = "code-block";
252
+ left.textContent = queryAst || "";
253
+ const right = document.createElement("pre");
254
+ right.className = "code-block";
255
+ right.textContent = matchAst || "";
256
+ diffContainer.appendChild(left);
257
+ diffContainer.appendChild(right);
258
+ }
259
+ }
260
  }
261
 
262
  async function loadAst(symbol, matchIdentifier) {
 
364
  item.addEventListener("click", () => {
365
  document.querySelectorAll(".blueprint-item").forEach((el) => el.classList.remove("is-active"));
366
  item.classList.add("is-active");
367
+ const matchIdentifier = top.identifier;
368
  const activeName = document.getElementById("activeModuleName");
369
  if (activeName) {
370
  activeName.textContent = `${qcls} vs ${top.class_name}`;
static/styles.css CHANGED
@@ -1,14 +1,14 @@
1
  @import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap");
2
 
3
  :root {
4
- --bg: #f6f0e6;
5
- --panel: #fff7ee;
6
- --ink: #1b1b1b;
7
- --muted: #6b5f55;
8
- --accent: #d6572b;
9
- --accent-2: #2b6fd6;
10
- --accent-3: #1b8d57;
11
- --shadow: rgba(27, 27, 27, 0.1);
12
  }
13
 
14
  * {
@@ -19,9 +19,9 @@ body {
19
  margin: 0;
20
  font-family: "Space Grotesk", system-ui, sans-serif;
21
  color: var(--ink);
22
- background: radial-gradient(circle at 20% 20%, #ffe4c7 0%, transparent 55%),
23
- radial-gradient(circle at 85% 15%, #f5d2e8 0%, transparent 40%),
24
- radial-gradient(circle at 70% 80%, #d8f0e2 0%, transparent 45%),
25
  var(--bg);
26
  }
27
 
@@ -341,7 +341,7 @@ textarea {
341
 
342
  .code-diff-view {
343
  display: grid;
344
- grid-template-columns: repeat(2, minmax(0, 1fr));
345
  gap: 16px;
346
  }
347
 
@@ -359,6 +359,37 @@ textarea {
359
  white-space: pre-wrap;
360
  }
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  .structural-notes {
363
  display: grid;
364
  grid-template-columns: repeat(2, minmax(0, 1fr));
 
1
  @import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap");
2
 
3
  :root {
4
+ --bg: #f4f4f2;
5
+ --panel: #f9f9f7;
6
+ --ink: #1f1f1c;
7
+ --muted: #6f6b66;
8
+ --accent: #8d7b6e;
9
+ --accent-2: #8791a3;
10
+ --accent-3: #7e9a8b;
11
+ --shadow: rgba(31, 31, 28, 0.08);
12
  }
13
 
14
  * {
 
19
  margin: 0;
20
  font-family: "Space Grotesk", system-ui, sans-serif;
21
  color: var(--ink);
22
+ background: radial-gradient(circle at 15% 20%, #e9e5df 0%, transparent 55%),
23
+ radial-gradient(circle at 80% 10%, #ece7e1 0%, transparent 40%),
24
+ radial-gradient(circle at 70% 80%, #e6e2dc 0%, transparent 45%),
25
  var(--bg);
26
  }
27
 
 
341
 
342
  .code-diff-view {
343
  display: grid;
344
+ grid-template-columns: 1fr;
345
  gap: 16px;
346
  }
347
 
 
359
  white-space: pre-wrap;
360
  }
361
 
362
/* Unified diff panel: stacked rows inside a rounded, bordered card. */
.diff-wrapper {
  display: flex;
  flex-direction: column;
  background: #fff;
  border: 1px solid #e3d6c8;
  border-radius: 12px;
  overflow: hidden;
  font-family: "Space Grotesk", monospace;
  font-size: 12px;
}

/* One diff line; hairline separator between consecutive rows. */
.diff-row {
  display: block;
  border-bottom: 1px solid #f0e8dc;
}

/* Preserve code whitespace exactly (no wrapping). */
.code-line {
  padding: 2px 8px;
  white-space: pre;
}

/* Removed lines: red tint, struck through, dimmed. */
.diff-del {
  background-color: #ffe6e6;
  text-decoration: line-through;
  opacity: 0.7;
}

/* Added lines: green tint. */
.diff-add {
  background-color: #e6ffec;
}
+
393
  .structural-notes {
394
  display: grid;
395
  grid-template-columns: repeat(2, minmax(0, 1fr));