File size: 5,878 Bytes
e5cf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python3
"""Codebase Topological Mapping POC — tokenize feather itself,
run through Engram activation patterns, build file similarity graph.
Lightweight: uses text features as proxy for Engram activations.
"""
import json, os, re, math
from pathlib import Path

REPO = Path.home() / "work" / "feather"
OUT_DIR = REPO / "docs"

print("[CODEMAP] Analyzing feather codebase...")

# Collect all .py files, excluding the venv, underscore-prefixed modules,
# and files that are trivially small (<=100 B) or very large (>=100 kB).
# NOTE(review): name.startswith("_") also drops __init__.py / __main__.py —
# confirm that is intended.
files = []
for f in sorted(REPO.rglob("*.py")):
    if ".venv" in f.parts or f.name.startswith("_"):
        continue
    size = f.stat().st_size  # stat once per file (original called it twice)
    if 100 < size < 100000:
        files.append(f)
print(f"[CODEMAP] {len(files)} source files")

# Build term-frequency vectors (words as Engram proxy).
# Term filter: English stopwords plus Python keywords / builtin singletons.
# Tokens are lowercased BEFORE this membership test (see the tokenize loop),
# so every entry must be lowercase: the original listed "None"/"True"/"False"
# capitalized, which therefore never matched; it also listed "from"/"with"/
# "as" twice. Fixed here.
stopwords = {
    "the", "a", "an", "in", "on", "of", "to", "for", "and", "or",
    "is", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "do", "does", "did", "but", "if", "so", "with",
    "at", "by", "from", "as", "it", "its", "this", "that", "not",
    "import", "def", "class", "return", "self",
    "none", "true", "false", "raise", "pass", "elif", "else", "try",
    "except", "finally", "yield", "lambda", "assert",
    "break", "continue", "del", "global", "nonlocal",
}

vocab = {}        # term -> first-seen index (insertion order)
doc_vectors = {}  # relative path -> {term: raw count}

for src in files:
    try:
        code = src.read_text(errors="replace")
    except Exception:
        continue
    # Tokenize on Python-identifier-shaped runs, then lowercase once.
    words = [w.lower() for w in re.findall(r'[a-zA-Z_][a-zA-Z_0-9]*', code)]
    freq = {}
    for word in words:
        if len(word) <= 2 or word in stopwords:
            continue
        freq[word] = freq.get(word, 0) + 1
        vocab.setdefault(word, len(vocab))
    if freq:
        doc_vectors[str(src.relative_to(REPO))] = freq

print(f"[CODEMAP] {len(doc_vectors)} files with content, {len(vocab)} unique terms")

# Document frequency: in how many files does each term appear?
n_docs = len(doc_vectors)
df = {}
for counts in doc_vectors.values():
    for term in counts:
        df[term] = df.get(term, 0) + 1

# Similarity matrix (file-file cosine over TF-IDF weights).
fnames = list(doc_vectors.keys())
n = len(fnames)

def _idf(term):
    """Smoothed inverse document frequency; unseen terms fall back to n_docs."""
    return math.log((n_docs + 1) / (df.get(term, n_docs) + 1) + 1)

# Precompute each file's TF-IDF weight vector and its L2 norm once.
# The original recomputed file i's weights per row and file j's weights per
# cell of the O(n^2) pair loop.
weights = {}
norms = {}
for fname in fnames:
    w = {t: c * _idf(t) for t, c in doc_vectors[fname].items()}
    weights[fname] = w
    norms[fname] = math.sqrt(sum(v * v for v in w.values()))

sim_matrix = []
for i in range(n):
    w_i, norm_i = weights[fnames[i]], norms[fnames[i]]
    sims = []
    for j in range(n):
        w_j, norm_j = weights[fnames[j]], norms[fnames[j]]
        # Sparse dot product: iterate the smaller vector.
        small, big = (w_i, w_j) if len(w_i) <= len(w_j) else (w_j, w_i)
        dot = sum(wv * big[t] for t, wv in small.items() if t in big)
        # BUG FIX: the original divided by file j's RAW-COUNT norm while
        # weighting j's dot-product terms by TF-IDF, so the "cosine" was
        # inconsistently scaled. Both norms now use TF-IDF weights, making
        # the diagonal exactly 1 and off-diagonal values true cosines.
        sims.append(dot / max(norm_i * norm_j, 1e-10))
    sim_matrix.append(sims)

# Extract module clusters via spectral-like grouping:
# bucket files by directory (up to two path levels deep).
from collections import defaultdict
dir_groups = defaultdict(list)
for fname in fnames:
    segments = fname.split("/")
    if len(segments) >= 3:
        dir_groups["/".join(segments[:2])].append(fname)
    elif len(segments) >= 2:
        dir_groups[segments[0]].append(fname)
    else:
        dir_groups["root"].append(fname)

# Average intra-group vs inter-group similarity over all unordered pairs.
# NOTE(review): "same group" here means same TOP-LEVEL directory only, which
# is coarser than the two-level grouping used for dir_groups above — confirm
# that mismatch is intentional.
intra_sims = []
inter_sims = []
for i in range(n):
    parts_i = fnames[i].split("/")
    for j in range(i + 1, n):
        parts_j = fnames[j].split("/")
        shared_top = (len(parts_i) >= 2 and len(parts_j) >= 2
                      and parts_i[0] == parts_j[0])
        bucket = intra_sims if shared_top else inter_sims
        bucket.append(sim_matrix[i][j])

mean_intra = sum(intra_sims) / max(len(intra_sims), 1)
mean_inter = sum(inter_sims) / max(len(inter_sims), 1)
print(f"[CODEMAP] Intra-module similarity: {mean_intra:.4f}")
print(f"[CODEMAP] Inter-module similarity: {mean_inter:.4f}")

# Topological structure: "hub" files have high total degree, where a file's
# degree is the sum of its similarity row (self term included).
degrees = [sum(row) for row in sim_matrix]
top_hubs = sorted(zip(degrees, fnames), reverse=True)[:10]
print(f"[CODEMAP] Hub files (topological centers):")
for degree, name in top_hubs:
    print(f"  {name}: total_sim={degree:.2f}")

# Build module-level graph: mean pairwise file similarity between (and
# within) directory groups, keyed "groupA-groupB".
module_sims = {}
keys = sorted(dir_groups.keys())
# Hoist the filename -> row-index lookup out of the loops: the original
# called fnames.index() (a linear scan) for every file pair, turning the
# group-pair aggregation into an accidental extra O(n) factor.
idx = {name: k for k, name in enumerate(fnames)}
for i in range(len(keys)):
    for j in range(i, len(keys)):
        total = 0.0
        count = 0
        for fi in dir_groups[keys[i]]:
            for fj in dir_groups[keys[j]]:
                if fi == fj:
                    continue
                total += sim_matrix[idx[fi]][idx[fj]]
                count += 1
        if count > 0:
            module_sims[f"{keys[i]}-{keys[j]}"] = total / count

top_module_edges = sorted(module_sims.items(), key=lambda x: -x[1])[:15]
print(f"[CODEMAP] Top module-module connections:")
for edge, s in top_module_edges:
    print(f"  {edge}: sim={s:.4f}")

# Persist a machine-readable summary for downstream docs/tooling.
results = {
    "n_files": int(n), "n_terms": int(len(vocab)),
    "intra_module_similarity": float(mean_intra),
    "inter_module_similarity": float(mean_inter),
    "similarity_ratio_intra_vs_inter": float(mean_intra / max(mean_inter, 1e-10)),
    "top_hubs": [(str(f), float(d)) for d, f in top_hubs],
    "top_module_connections": [(str(e), float(s)) for e, s in top_module_edges[:10]],
    "interpretation": (
        "Codebase topology: files within modules are " +
        # BUG FIX: guard the divisor — mean_inter can be 0.0 even when
        # mean_intra > 0 (e.g. inter_sims is empty because every file shares
        # one top-level dir), which raised ZeroDivisionError in the original.
        f"{mean_intra / max(mean_inter, 1e-10):.1f}x more similar than files across modules. "
        "This mirrors the Engram's expected behavior: modules form simplicial "
        "clusters, cross-module imports form 1-skeleton edges."
    ) if mean_intra > 0 else "Insufficient data.",
}
OUT_DIR.mkdir(parents=True, exist_ok=True)  # docs/ may not exist on a fresh clone
with open(OUT_DIR / "results_codemap.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"[CODEMAP] Saved results_codemap.json")