#!/usr/bin/env python3
"""Codebase Topological Mapping POC — tokenize feather itself,
run through Engram activation patterns, build file similarity graph.
Lightweight: uses text features as proxy for Engram activations.
"""
import json, os, re, math
from collections import defaultdict
from pathlib import Path

REPO = Path.home() / "work" / "feather"
OUT_DIR = REPO / "docs"

print("[CODEMAP] Analyzing feather codebase...")

# Collect all .py files
files = sorted(REPO.rglob("*.py"))
# Exclude hidden dirs (.venv, .git, ...), underscore-prefixed files, and tiny/huge files
files = [f for f in files
         if not any(p.startswith(".") for p in f.relative_to(REPO).parts)
         and not f.name.startswith("_")]
files = [f for f in files if 100 < f.stat().st_size < 100000]
print(f"[CODEMAP] {len(files)} source files")

# Build term-frequency vectors (words as Engram proxy)
# Note: tokens are lowercased before filtering, so all stopwords must be lowercase.
stopwords = {"the", "a", "an", "in", "on", "of", "to", "for", "and", "or",
             "is", "are", "was", "were", "be", "been", "being", "have",
             "has", "had", "do", "does", "did", "but", "if", "so", "with",
             "at", "by", "from", "as", "it", "its", "this", "that", "not",
             "import", "def", "class", "return", "self", "none",
             "true", "false", "raise", "pass", "elif", "else", "try",
             "except", "finally", "yield", "lambda", "assert",
             "break", "continue", "del", "global", "nonlocal"}
vocab = {}
doc_vectors = {}  # file -> {term: count}
for f in files:
    try:
        text = f.read_text(errors="replace")
    except Exception:
        continue
    # Tokenize: Python identifiers
    tokens = re.findall(r'[a-zA-Z_][a-zA-Z_0-9]*', text)
    tokens = [t.lower() for t in tokens if t.lower() not in stopwords and len(t) > 2]
    counter = {}
    for t in tokens:
        counter[t] = counter.get(t, 0) + 1
        if t not in vocab:
            vocab[t] = len(vocab)
    if counter:
        doc_vectors[str(f.relative_to(REPO))] = counter

print(f"[CODEMAP] {len(doc_vectors)} files with content, {len(vocab)} unique terms")

# Build TF-IDF weighted vectors
n_docs = len(doc_vectors)
df = {}
for v in doc_vectors.values():
    for t in v:
        df[t] = df.get(t, 0) + 1

fnames = list(doc_vectors.keys())
n = len(fnames)
# Precompute each file's TF-IDF weights (smoothed IDF) and vector norm once
weights = {}
norms = {}
for name, tf in doc_vectors.items():
    w = {t: c * math.log((n_docs + 1) / (df.get(t, n_docs) + 1) + 1) for t, c in tf.items()}
    weights[name] = w
    norms[name] = math.sqrt(sum(v * v for v in w.values()))

# Similarity matrix (file-file cosine over the TF-IDF vectors; the norm on each
# side uses the same weighting as the dot product)
sim_matrix = []
for i in range(n):
    w_i, norm_i = weights[fnames[i]], norms[fnames[i]]
    sims = []
    for j in range(n):
        w_j, norm_j = weights[fnames[j]], norms[fnames[j]]
        dot = sum(w_i[t] * w_j[t] for t in set(w_i) & set(w_j))
        sims.append(dot / max(norm_i * norm_j, 1e-10))
    sim_matrix.append(sims)
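
# For reference, the weighting and similarity computed above (this simply restates
# what the code does; it is not an external spec):
#   w(t, d)     = tf(t, d) * log((N + 1) / (df(t) + 1) + 1)
#   sim(d1, d2) = <w(d1), w(d2)> / (||w(d1)|| * ||w(d2)||)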

# Group files by directory (1-2 path levels) as a proxy for module clusters
dir_groups = defaultdict(list)
for f in fnames:
    parts = f.split("/")
    if len(parts) >= 3:
        group = "/".join(parts[:2])
    elif len(parts) >= 2:
        group = parts[0]
    else:
        group = "root"
    dir_groups[group].append(f)

# Average intra-group vs inter-group similarity
# (a pair counts as intra when both files sit under the same top-level directory)
intra_sims = []
inter_sims = []
for i in range(n):
    for j in range(i + 1, n):
        sim = sim_matrix[i][j]
        fi_parts = fnames[i].split("/")
        fj_parts = fnames[j].split("/")
        same_group = len(fi_parts) >= 2 and len(fj_parts) >= 2 and fi_parts[0] == fj_parts[0]
        if same_group:
            intra_sims.append(sim)
        else:
            inter_sims.append(sim)
mean_intra = sum(intra_sims) / max(len(intra_sims), 1)
mean_inter = sum(inter_sims) / max(len(inter_sims), 1)
print(f"[CODEMAP] Intra-module similarity: {mean_intra:.4f}")
print(f"[CODEMAP] Inter-module similarity: {mean_inter:.4f}")

# Topological structure: which files are "hub" files (high total degree)
# Degree = sum of similarities to all other files
degrees = [sum(row) for row in sim_matrix]
top_hubs = sorted(zip(degrees, fnames), reverse=True)[:10]
print("[CODEMAP] Hub files (topological centers):")
for d, f in top_hubs:
    print(f"  {f}: total_sim={d:.2f}")

# Build module-level graph (average pairwise file similarity between modules)
fname_idx = {name: i for i, name in enumerate(fnames)}
module_sims = {}
keys = sorted(dir_groups.keys())
for i in range(len(keys)):
    for j in range(i, len(keys)):
        s = 0.0
        c = 0
        for fi in dir_groups[keys[i]]:
            for fj in dir_groups[keys[j]]:
                if fi == fj:
                    continue
                s += sim_matrix[fname_idx[fi]][fname_idx[fj]]
                c += 1
        if c > 0:
            module_sims[f"{keys[i]}-{keys[j]}"] = s / c
top_module_edges = sorted(module_sims.items(), key=lambda x: -x[1])[:15]
print("[CODEMAP] Top module-module connections:")
for edge, s in top_module_edges:
    print(f"  {edge}: sim={s:.4f}")

results = {
    "n_files": int(n), "n_terms": int(len(vocab)),
    "intra_module_similarity": float(mean_intra),
    "inter_module_similarity": float(mean_inter),
    "similarity_ratio_intra_vs_inter": float(mean_intra / max(mean_inter, 1e-10)),
    "top_hubs": [(str(f), float(d)) for d, f in top_hubs],
    "top_module_connections": [(str(e), float(s)) for e, s in top_module_edges[:10]],
    "interpretation": (
        "Codebase topology: files within modules are "
        f"{mean_intra / max(mean_inter, 1e-10):.1f}x more similar than files across modules. "
        "This mirrors the Engram's expected behavior: modules form simplicial "
        "clusters, cross-module imports form 1-skeleton edges."
    ) if mean_intra > 0 else "Insufficient data.",
}
OUT_DIR.mkdir(parents=True, exist_ok=True)  # make sure docs/ exists before writing
with open(OUT_DIR / "results_codemap.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"[CODEMAP] Saved {OUT_DIR / 'results_codemap.json'}")