#!/usr/bin/env python3 """Codebase Topological Mapping POC — tokenize feather itself, run through Engram activation patterns, build file similarity graph. Lightweight: uses text features as proxy for Engram activations. """ import json, os, re, math from pathlib import Path REPO = Path.home() / "work" / "feather" OUT_DIR = REPO / "docs" print("[CODEMAP] Analyzing feather codebase...") # Collect all .py files files = sorted(REPO.rglob("*.py")) # Exclude venv, hidden dirs, build artifacts files = [f for f in files if ".venv" not in f.parts and not f.name.startswith("_")] files = [f for f in files if f.stat().st_size > 100 and f.stat().st_size < 100000] print(f"[CODEMAP] {len(files)} source files") # Build term-frequency vectors (words as Engram proxy) stopwords = {"the", "a", "an", "in", "on", "of", "to", "for", "and", "or", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "but", "if", "so", "with", "at", "by", "from", "as", "it", "its", "this", "that", "not", "import", "from", "def", "class", "return", "self", "None", "True", "False", "raise", "pass", "elif", "else", "try", "except", "finally", "yield", "lambda", "with", "as", "assert", "break", "continue", "del", "global", "nonlocal"} vocab = {} doc_vectors = {} # file -> {term: count} for f in files: try: text = f.read_text(errors="replace") except Exception: continue # Tokenize: Python identifiers tokens = re.findall(r'[a-zA-Z_][a-zA-Z_0-9]*', text) tokens = [t.lower() for t in tokens if t.lower() not in stopwords and len(t) > 2] counter = {} for t in tokens: counter[t] = counter.get(t, 0) + 1 if t not in vocab: vocab[t] = len(vocab) if counter: doc_vectors[str(f.relative_to(REPO))] = counter print(f"[CODEMAP] {len(doc_vectors)} files with content, {len(vocab)} unique terms") # Build TF-IDF weighted vectors n_docs = len(doc_vectors) df = {} for v in doc_vectors.values(): for t in v: df[t] = df.get(t, 0) + 1 # Similarity matrix (file-file via cosine) fnames = list(doc_vectors.keys()) n = len(fnames) sim_matrix = [] for i in range(n): vi = doc_vectors[fnames[i]] # TF-IDF for file i w_i = {} for t, c in vi.items(): w_i[t] = c * math.log((n_docs + 1) / (df.get(t, n_docs) + 1) + 1) norm_i = math.sqrt(sum(v*v for v in w_i.values())) sims = [] for j in range(n): vj = doc_vectors[fnames[j]] dot = sum(w_i.get(t, 0) * (vj[t] * math.log((n_docs + 1) / (df.get(t, n_docs) + 1) + 1)) for t in set(w_i) & set(vj)) norm_j = math.sqrt(sum(v*v for v in vj.values())) sims.append(dot / max(norm_i * norm_j, 1e-10)) sim_matrix.append(sims) # Extract module clusters via spectral-like grouping # Sort files into directories from collections import defaultdict dir_groups = defaultdict(list) for f in fnames: parts = f.split("/") if len(parts) >= 3: group = "/".join(parts[:2]) elif len(parts) >= 2: group = parts[0] else: group = "root" dir_groups[group].append(f) # Average intra-group vs inter-group similarity intra_sims = [] inter_sims = [] for i in range(n): for j in range(i+1, n): sim = sim_matrix[i][j] fi, fj = fnames[i], fnames[j] fi_parts = fi.split("/") fj_parts = fj.split("/") same_group = len(fi_parts) >= 2 and len(fj_parts) >= 2 and fi_parts[0] == fj_parts[0] if same_group: intra_sims.append(sim) else: inter_sims.append(sim) mean_intra = sum(intra_sims) / max(len(intra_sims), 1) mean_inter = sum(inter_sims) / max(len(inter_sims), 1) print(f"[CODEMAP] Intra-module similarity: {mean_intra:.4f}") print(f"[CODEMAP] Inter-module similarity: {mean_inter:.4f}") # Topological structure: which files are "hub" files (high total degree) # Degree = sum of similarities to other files degrees = [sum(row) for row in sim_matrix] top_hubs = sorted(zip(degrees, fnames), reverse=True)[:10] print(f"[CODEMAP] Hub files (topological centers):") for d, f in top_hubs: print(f" {f}: total_sim={d:.2f}") # Build module-level graph module_sims = {} keys = sorted(dir_groups.keys()) for i in range(len(keys)): for j in range(i, len(keys)): files_i = dir_groups[keys[i]] files_j = dir_groups[keys[j]] s = 0; c = 0 for fi in files_i: for fj in files_j: if fi == fj: continue fi_idx = fnames.index(fi) fj_idx = fnames.index(fj) s += sim_matrix[fi_idx][fj_idx] c += 1 if c > 0: module_sims[f"{keys[i]}-{keys[j]}"] = s / c top_module_edges = sorted(module_sims.items(), key=lambda x: -x[1])[:15] print(f"[CODEMAP] Top module-module connections:") for edge, s in top_module_edges: print(f" {edge}: sim={s:.4f}") results = { "n_files": int(n), "n_terms": int(len(vocab)), "intra_module_similarity": float(mean_intra), "inter_module_similarity": float(mean_inter), "similarity_ratio_intra_vs_inter": float(mean_intra / max(mean_inter, 1e-10)), "top_hubs": [(str(f), float(d)) for d, f in top_hubs], "top_module_connections": [(str(e), float(s)) for e, s in top_module_edges[:10]], "interpretation": ( "Codebase topology: files within modules are " + f"{mean_intra/mean_inter:.1f}x more similar than files across modules. " "This mirrors the Engram's expected behavior: modules form simplicial " "clusters, cross-module imports form 1-skeleton edges." ) if mean_intra > 0 else "Insufficient data.", } with open(OUT_DIR / "results_codemap.json", "w") as f: json.dump(results, f, indent=2) print(f"[CODEMAP] Saved results_codemap.json")