# feather-a10g-large-runtime / overlay/scripts/experiment_codemap.py
# Provenance: uploaded by icarus112 — "Update Feather a10g-large training
# runtime image" (commit e5cf7c3, verified).
#!/usr/bin/env python3
"""Codebase Topological Mapping POC — tokenize feather itself,
run through Engram activation patterns, build file similarity graph.
Lightweight: uses text features as proxy for Engram activations.
"""
import json, os, re, math
from pathlib import Path
# Paths: the feather checkout to analyze and where the JSON report goes.
REPO = Path.home() / "work" / "feather"
OUT_DIR = REPO / "docs"

print("[CODEMAP] Analyzing feather codebase...")

# Collect candidate .py files. Skip the virtualenv and underscore-prefixed
# filenames; keep only files in a (100, 100000)-byte size window.
# NOTE(review): `startswith("_")` also drops __init__.py / __main__.py and
# does not actually skip hidden *directories* as the comment implied — confirm
# that is intended.
files = []
for candidate in sorted(REPO.rglob("*.py")):
    if ".venv" in candidate.parts or candidate.name.startswith("_"):
        continue
    size = candidate.stat().st_size  # stat once, not twice per file
    if 100 < size < 100000:
        files.append(candidate)
print(f"[CODEMAP] {len(files)} source files")
# Build term-frequency vectors (words as Engram proxy)
# Stopwords: English function words plus Python keywords and constants.
# All entries are lowercase because tokens are lowercased before the
# membership test — the previous set carried "None"/"True"/"False"
# capitalized, which could never match a lowercased token. Duplicate
# entries ("from", "with", "as") removed; set semantics are unchanged.
stopwords = {
    "the", "a", "an", "in", "on", "of", "to", "for", "and", "or",
    "is", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "do", "does", "did", "but", "if", "so", "with",
    "at", "by", "from", "as", "it", "its", "this", "that", "not",
    "import", "def", "class", "return", "self", "none",
    "true", "false", "raise", "pass", "elif", "else", "try",
    "except", "finally", "yield", "lambda", "assert",
    "break", "continue", "del", "global", "nonlocal",
}
vocab = {}        # term -> first-seen index (only its size is reported)
doc_vectors = {}  # relative path -> {term: raw occurrence count}
for src_path in files:
    try:
        source = src_path.read_text(errors="replace")
    except Exception:
        continue
    # Tokenize on Python-identifier shapes; lowercase, drop stopwords
    # and anything shorter than three characters.
    terms = [
        tok.lower()
        for tok in re.findall(r'[a-zA-Z_][a-zA-Z_0-9]*', source)
        if tok.lower() not in stopwords and len(tok) > 2
    ]
    counts = {}
    for term in terms:
        counts[term] = counts.get(term, 0) + 1
        vocab.setdefault(term, len(vocab))
    if counts:
        doc_vectors[str(src_path.relative_to(REPO))] = counts
print(f"[CODEMAP] {len(doc_vectors)} files with content, {len(vocab)} unique terms")
# Document frequency per term (how many files contain it), for IDF below.
n_docs = len(doc_vectors)
df = {}
for counts in doc_vectors.values():
    for term in counts:
        if term in df:
            df[term] += 1
        else:
            df[term] = 1
# Similarity matrix: file-file cosine over TF-IDF-weighted term vectors.
fnames = list(doc_vectors.keys())
n = len(fnames)


def _idf(term):
    """Smoothed inverse document frequency for *term*."""
    return math.log((n_docs + 1) / (df.get(term, n_docs) + 1) + 1)


# Precompute each file's TF-IDF weight vector and its norm exactly once.
# BUGFIX: the previous version normalized file j by its *raw count* norm
# while the dot product used TF-IDF weights, so the result was not a true
# cosine (even self-similarity != 1). It also rebuilt j's weights inside
# the inner loop on every pass. Both sides now use weighted norms.
_weights = []
_norms = []
for name in fnames:
    w = {t: c * _idf(t) for t, c in doc_vectors[name].items()}
    _weights.append(w)
    _norms.append(math.sqrt(sum(v * v for v in w.values())))

# Cosine similarity is symmetric: fill the upper triangle and mirror it.
sim_matrix = [[0.0] * n for _ in range(n)]
for i in range(n):
    wi = _weights[i]
    for j in range(i, n):
        wj = _weights[j]
        # Sparse dot product — iterate the smaller vector.
        small, big = (wi, wj) if len(wi) <= len(wj) else (wj, wi)
        dot = sum(v * big.get(t, 0.0) for t, v in small.items())
        sim = dot / max(_norms[i] * _norms[j], 1e-10)
        sim_matrix[i][j] = sim
        sim_matrix[j][i] = sim
# Group files by directory prefix (two levels for deep paths) to get a
# module-level view. NOTE(review): assumes POSIX "/" separators in the
# relative-path strings — confirm on non-POSIX hosts.
from collections import defaultdict

dir_groups = defaultdict(list)
for name in fnames:
    segments = name.split("/")
    if len(segments) >= 3:
        group_key = "/".join(segments[:2])
    elif len(segments) >= 2:
        group_key = segments[0]
    else:
        group_key = "root"
    dir_groups[group_key].append(name)
# Average similarity within a top-level directory vs. across directories.
# NOTE(review): "same group" here compares only the *first* path segment,
# unlike dir_groups above which uses a two-level prefix — confirm intended.
intra_sims = []
inter_sims = []
for i in range(n):
    left = fnames[i].split("/")
    for j in range(i + 1, n):
        right = fnames[j].split("/")
        same_top = len(left) >= 2 and len(right) >= 2 and left[0] == right[0]
        bucket = intra_sims if same_top else inter_sims
        bucket.append(sim_matrix[i][j])
mean_intra = sum(intra_sims) / max(len(intra_sims), 1)
mean_inter = sum(inter_sims) / max(len(inter_sims), 1)
print(f"[CODEMAP] Intra-module similarity: {mean_intra:.4f}")
print(f"[CODEMAP] Inter-module similarity: {mean_inter:.4f}")
# "Hub" files: topological centers ranked by total similarity degree
# (row sum of the matrix; includes the diagonal self-similarity term).
degrees = [sum(row) for row in sim_matrix]
top_hubs = sorted(zip(degrees, fnames), reverse=True)[:10]
print("[CODEMAP] Hub files (topological centers):")
for degree, hub_name in top_hubs:
    print(f" {hub_name}: total_sim={degree:.2f}")
# Module-level graph: mean pairwise similarity between every pair of
# directory groups (a group paired with itself gives its internal cohesion).
# BUGFIX(perf): the previous version called fnames.index() for every file
# pair — an O(n) scan inside a quadruple-nested loop. Precompute the index.
index_of = {name: i for i, name in enumerate(fnames)}
module_sims = {}
keys = sorted(dir_groups.keys())
for a in range(len(keys)):
    for b in range(a, len(keys)):
        total = 0.0
        count = 0
        for fi in dir_groups[keys[a]]:
            for fj in dir_groups[keys[b]]:
                if fi == fj:
                    continue  # skip self-pairs inside a group
                total += sim_matrix[index_of[fi]][index_of[fj]]
                count += 1
        if count > 0:
            module_sims[f"{keys[a]}-{keys[b]}"] = total / count
top_module_edges = sorted(module_sims.items(), key=lambda x: -x[1])[:15]
print("[CODEMAP] Top module-module connections:")
for edge, s in top_module_edges:
    print(f" {edge}: sim={s:.4f}")
# Persist the summary as JSON under docs/.
# BUGFIX: the interpretation string previously divided by mean_inter
# directly (L140) and could raise ZeroDivisionError when no inter-module
# pairs exist; it now uses the same guarded ratio as the JSON field.
ratio = mean_intra / max(mean_inter, 1e-10)
results = {
    "n_files": int(n),
    "n_terms": int(len(vocab)),
    "intra_module_similarity": float(mean_intra),
    "inter_module_similarity": float(mean_inter),
    "similarity_ratio_intra_vs_inter": float(ratio),
    "top_hubs": [(str(f), float(d)) for d, f in top_hubs],
    "top_module_connections": [(str(e), float(s)) for e, s in top_module_edges[:10]],
    "interpretation": (
        "Codebase topology: files within modules are "
        f"{ratio:.1f}x more similar than files across modules. "
        "This mirrors the Engram's expected behavior: modules form simplicial "
        "clusters, cross-module imports form 1-skeleton edges."
    ) if mean_intra > 0 else "Insufficient data.",
}
# docs/ may not exist on a fresh checkout — create it before writing.
OUT_DIR.mkdir(parents=True, exist_ok=True)
with open(OUT_DIR / "results_codemap.json", "w") as f:
    json.dump(results, f, indent=2)
print("[CODEMAP] Saved results_codemap.json")