#!/usr/bin/env python3
"""Codebase Topological Mapping POC — tokenize feather itself,
run through Engram activation patterns, build file similarity graph.
Lightweight: uses text features as proxy for Engram activations.
"""
import json, math, re
from pathlib import Path
from collections import defaultdict
REPO = Path.home() / "work" / "feather"
OUT_DIR = REPO / "docs"
print("[CODEMAP] Analyzing feather codebase...")
# Collect all .py files
files = sorted(REPO.rglob("*.py"))
# Exclude the venv, underscore-prefixed files (e.g. __init__.py), and size outliers
files = [f for f in files if ".venv" not in f.parts and not f.name.startswith("_")]
files = [f for f in files if 100 < f.stat().st_size < 100000]
print(f"[CODEMAP] {len(files)} source files")
# Build term-frequency vectors (words as Engram proxy)
stopwords = {"the", "a", "an", "in", "on", "of", "to", "for", "and", "or",
"is", "are", "was", "were", "be", "been", "being", "have",
"has", "had", "do", "does", "did", "but", "if", "so", "with",
"at", "by", "from", "as", "it", "its", "this", "that", "not",
"import", "from", "def", "class", "return", "self", "None",
"True", "False", "raise", "pass", "elif", "else", "try",
"except", "finally", "yield", "lambda", "with", "as", "assert",
"break", "continue", "del", "global", "nonlocal"}
vocab = {}
doc_vectors = {}  # file -> {term: count}
for f in files:
    try:
        text = f.read_text(errors="replace")
    except Exception:
        continue
    # Tokenize: Python identifiers
    tokens = re.findall(r'[a-zA-Z_][a-zA-Z_0-9]*', text)
    tokens = [t.lower() for t in tokens if t.lower() not in stopwords and len(t) > 2]
    counter = {}
    for t in tokens:
        counter[t] = counter.get(t, 0) + 1
        if t not in vocab:
            vocab[t] = len(vocab)
    if counter:
        doc_vectors[str(f.relative_to(REPO))] = counter
print(f"[CODEMAP] {len(doc_vectors)} files with content, {len(vocab)} unique terms")
# Document frequencies (df) for the TF-IDF weighting
n_docs = len(doc_vectors)
df = {}
for v in doc_vectors.values():
    for t in v:
        df[t] = df.get(t, 0) + 1
# Similarity matrix (file-file cosine over TF-IDF weights); weight vectors
# and norms are precomputed once so dots and norms use the same weighting
def idf(t):
    # Smoothed IDF, positive even for terms that appear in every file
    return math.log((n_docs + 1) / (df.get(t, n_docs) + 1) + 1)

fnames = list(doc_vectors.keys())
n = len(fnames)
weights = [{t: c * idf(t) for t, c in doc_vectors[f].items()} for f in fnames]
norms = [math.sqrt(sum(v * v for v in w.values())) for w in weights]
sim_matrix = []
for i in range(n):
    w_i = weights[i]
    sims = []
    for j in range(n):
        w_j = weights[j]
        dot = sum(w_i[t] * w_j[t] for t in w_i.keys() & w_j.keys())
        sims.append(dot / max(norms[i] * norms[j], 1e-10))
    sim_matrix.append(sims)
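# Sanity check: with dot products and norms taken from the same TF-IDF
# weights, every file should be maximally similar to itself (~1.0)
for i in range(n):
    assert abs(sim_matrix[i][i] - 1.0) < 1e-6, fnames[i]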
# Extract module clusters: group files by directory prefix (up to two levels)
dir_groups = defaultdict(list)
for f in fnames:
    parts = f.split("/")
    if len(parts) >= 3:
        group = "/".join(parts[:2])
    elif len(parts) >= 2:
        group = parts[0]
    else:
        group = "root"
    dir_groups[group].append(f)
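# Grouping examples (illustrative paths, not actual repo files):
#   "pkg/sub/mod.py" -> "pkg/sub", "pkg/mod.py" -> "pkg", "mod.py" -> "root"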
# Average intra-group vs inter-group similarity; group membership here is
# judged by top-level directory, so root-level files always count as inter
intra_sims = []
inter_sims = []
for i in range(n):
    for j in range(i + 1, n):
        sim = sim_matrix[i][j]
        fi_parts = fnames[i].split("/")
        fj_parts = fnames[j].split("/")
        same_group = (len(fi_parts) >= 2 and len(fj_parts) >= 2
                      and fi_parts[0] == fj_parts[0])
        if same_group:
            intra_sims.append(sim)
        else:
            inter_sims.append(sim)
mean_intra = sum(intra_sims) / max(len(intra_sims), 1)
mean_inter = sum(inter_sims) / max(len(inter_sims), 1)
print(f"[CODEMAP] Intra-module similarity: {mean_intra:.4f}")
print(f"[CODEMAP] Inter-module similarity: {mean_inter:.4f}")
# Topological structure: "hub" files have high weighted degree,
# where degree = sum of similarities to all other files
degrees = [sum(row) for row in sim_matrix]
top_hubs = sorted(zip(degrees, fnames), reverse=True)[:10]
print("[CODEMAP] Hub files (topological centers):")
for d, f in top_hubs:
    print(f"  {f}: total_sim={d:.2f}")
# Build module-level graph: average pairwise file similarity per module pair
idx = {f: i for i, f in enumerate(fnames)}
module_sims = {}
keys = sorted(dir_groups.keys())
for i in range(len(keys)):
    for j in range(i, len(keys)):
        s = 0.0
        c = 0
        for fi in dir_groups[keys[i]]:
            for fj in dir_groups[keys[j]]:
                if fi == fj:
                    continue
                s += sim_matrix[idx[fi]][idx[fj]]
                c += 1
        if c > 0:
            module_sims[f"{keys[i]}-{keys[j]}"] = s / c
top_module_edges = sorted(module_sims.items(), key=lambda x: -x[1])[:15]
print("[CODEMAP] Top module-module connections:")
for edge, s in top_module_edges:
    print(f"  {edge}: sim={s:.4f}")
ratio = mean_intra / max(mean_inter, 1e-10)
results = {
    "n_files": int(n), "n_terms": int(len(vocab)),
    "intra_module_similarity": float(mean_intra),
    "inter_module_similarity": float(mean_inter),
    "similarity_ratio_intra_vs_inter": float(ratio),
    "top_hubs": [(str(f), float(d)) for d, f in top_hubs],
    "top_module_connections": [(str(e), float(s)) for e, s in top_module_edges[:10]],
    "interpretation": (
        "Codebase topology: files within modules are "
        f"{ratio:.1f}x more similar than files across modules. "
        "This mirrors the Engram's expected behavior: modules form simplicial "
        "clusters, cross-module imports form 1-skeleton edges."
    ) if mean_intra > 0 else "Insufficient data.",
}
OUT_DIR.mkdir(parents=True, exist_ok=True)
with open(OUT_DIR / "results_codemap.json", "w") as f:
    json.dump(results, f, indent=2)
print("[CODEMAP] Saved results_codemap.json")