# feather-a10g-large-runtime / overlay/scripts/experiment_codemap.py
# Provenance: uploaded by icarus112 — "Update Feather a10g-large training
# runtime image" (commit e5cf7c3, verified).
#!/usr/bin/env python3
"""Codebase Topological Mapping POC — tokenize feather itself,
run through Engram activation patterns, build file similarity graph.
Lightweight: uses text features as proxy for Engram activations.
"""
import json, os, re, math
from pathlib import Path
# Paths: the feather checkout to analyze and where the JSON report goes.
REPO = Path.home() / "work" / "feather"
OUT_DIR = REPO / "docs"

print("[CODEMAP] Analyzing feather codebase...")

# Collect candidate .py files. Skip the virtualenv and underscore-prefixed
# filenames; keep only files in a (100, 100000)-byte size window.
# NOTE(review): `startswith("_")` also drops __init__.py / __main__.py and
# does not actually skip hidden *directories* as the comment implied — confirm
# that is intended.
files = []
for candidate in sorted(REPO.rglob("*.py")):
    if ".venv" in candidate.parts or candidate.name.startswith("_"):
        continue
    size = candidate.stat().st_size  # stat once, not twice per file
    if 100 < size < 100000:
        files.append(candidate)
print(f"[CODEMAP] {len(files)} source files")
# Build term-frequency vectors (words as Engram proxy)
# Stopwords: English function words plus Python keywords and constants.
# All entries are lowercase because tokens are lowercased before the
# membership test — the previous set carried "None"/"True"/"False"
# capitalized, which could never match a lowercased token. Duplicate
# entries ("from", "with", "as") removed; set semantics are unchanged.
stopwords = {
    "the", "a", "an", "in", "on", "of", "to", "for", "and", "or",
    "is", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "do", "does", "did", "but", "if", "so", "with",
    "at", "by", "from", "as", "it", "its", "this", "that", "not",
    "import", "def", "class", "return", "self", "none",
    "true", "false", "raise", "pass", "elif", "else", "try",
    "except", "finally", "yield", "lambda", "assert",
    "break", "continue", "del", "global", "nonlocal",
}
vocab = {}        # term -> first-seen index (only its size is reported)
doc_vectors = {}  # relative path -> {term: raw occurrence count}
for src_path in files:
    try:
        source = src_path.read_text(errors="replace")
    except Exception:
        continue
    # Tokenize on Python-identifier shapes; lowercase, drop stopwords
    # and anything shorter than three characters.
    terms = [
        tok.lower()
        for tok in re.findall(r'[a-zA-Z_][a-zA-Z_0-9]*', source)
        if tok.lower() not in stopwords and len(tok) > 2
    ]
    counts = {}
    for term in terms:
        counts[term] = counts.get(term, 0) + 1
        vocab.setdefault(term, len(vocab))
    if counts:
        doc_vectors[str(src_path.relative_to(REPO))] = counts
print(f"[CODEMAP] {len(doc_vectors)} files with content, {len(vocab)} unique terms")
# Document frequency per term (how many files contain it), for IDF below.
n_docs = len(doc_vectors)
df = {}
for counts in doc_vectors.values():
    for term in counts:
        if term in df:
            df[term] += 1
        else:
            df[term] = 1
# Similarity matrix: file-file cosine over TF-IDF-weighted term vectors.
fnames = list(doc_vectors.keys())
n = len(fnames)


def _idf(term):
    """Smoothed inverse document frequency for *term*."""
    return math.log((n_docs + 1) / (df.get(term, n_docs) + 1) + 1)


# Precompute each file's TF-IDF weight vector and its norm exactly once.
# BUGFIX: the previous version normalized file j by its *raw count* norm
# while the dot product used TF-IDF weights, so the result was not a true
# cosine (even self-similarity != 1). It also rebuilt j's weights inside
# the inner loop on every pass. Both sides now use weighted norms.
_weights = []
_norms = []
for name in fnames:
    w = {t: c * _idf(t) for t, c in doc_vectors[name].items()}
    _weights.append(w)
    _norms.append(math.sqrt(sum(v * v for v in w.values())))

# Cosine similarity is symmetric: fill the upper triangle and mirror it.
sim_matrix = [[0.0] * n for _ in range(n)]
for i in range(n):
    wi = _weights[i]
    for j in range(i, n):
        wj = _weights[j]
        # Sparse dot product — iterate the smaller vector.
        small, big = (wi, wj) if len(wi) <= len(wj) else (wj, wi)
        dot = sum(v * big.get(t, 0.0) for t, v in small.items())
        sim = dot / max(_norms[i] * _norms[j], 1e-10)
        sim_matrix[i][j] = sim
        sim_matrix[j][i] = sim
# Group files by directory prefix (two levels for deep paths) to get a
# module-level view. NOTE(review): assumes POSIX "/" separators in the
# relative-path strings — confirm on non-POSIX hosts.
from collections import defaultdict

dir_groups = defaultdict(list)
for name in fnames:
    segments = name.split("/")
    if len(segments) >= 3:
        group_key = "/".join(segments[:2])
    elif len(segments) >= 2:
        group_key = segments[0]
    else:
        group_key = "root"
    dir_groups[group_key].append(name)
# Average similarity within a top-level directory vs. across directories.
# NOTE(review): "same group" here compares only the *first* path segment,
# unlike dir_groups above which uses a two-level prefix — confirm intended.
intra_sims = []
inter_sims = []
for i in range(n):
    left = fnames[i].split("/")
    for j in range(i + 1, n):
        right = fnames[j].split("/")
        same_top = len(left) >= 2 and len(right) >= 2 and left[0] == right[0]
        bucket = intra_sims if same_top else inter_sims
        bucket.append(sim_matrix[i][j])
mean_intra = sum(intra_sims) / max(len(intra_sims), 1)
mean_inter = sum(inter_sims) / max(len(inter_sims), 1)
print(f"[CODEMAP] Intra-module similarity: {mean_intra:.4f}")
print(f"[CODEMAP] Inter-module similarity: {mean_inter:.4f}")
# "Hub" files: topological centers ranked by total similarity degree
# (row sum of the matrix; includes the diagonal self-similarity term).
degrees = [sum(row) for row in sim_matrix]
top_hubs = sorted(zip(degrees, fnames), reverse=True)[:10]
print("[CODEMAP] Hub files (topological centers):")
for degree, hub_name in top_hubs:
    print(f" {hub_name}: total_sim={degree:.2f}")
# Module-level graph: mean pairwise similarity between every pair of
# directory groups (a group paired with itself gives its internal cohesion).
# BUGFIX(perf): the previous version called fnames.index() for every file
# pair — an O(n) scan inside a quadruple-nested loop. Precompute the index.
index_of = {name: i for i, name in enumerate(fnames)}
module_sims = {}
keys = sorted(dir_groups.keys())
for a in range(len(keys)):
    for b in range(a, len(keys)):
        total = 0.0
        count = 0
        for fi in dir_groups[keys[a]]:
            for fj in dir_groups[keys[b]]:
                if fi == fj:
                    continue  # skip self-pairs inside a group
                total += sim_matrix[index_of[fi]][index_of[fj]]
                count += 1
        if count > 0:
            module_sims[f"{keys[a]}-{keys[b]}"] = total / count
top_module_edges = sorted(module_sims.items(), key=lambda x: -x[1])[:15]
print("[CODEMAP] Top module-module connections:")
for edge, s in top_module_edges:
    print(f" {edge}: sim={s:.4f}")
# Persist the summary as JSON under docs/.
# BUGFIX: the interpretation string previously divided by mean_inter
# directly (L140) and could raise ZeroDivisionError when no inter-module
# pairs exist; it now uses the same guarded ratio as the JSON field.
ratio = mean_intra / max(mean_inter, 1e-10)
results = {
    "n_files": int(n),
    "n_terms": int(len(vocab)),
    "intra_module_similarity": float(mean_intra),
    "inter_module_similarity": float(mean_inter),
    "similarity_ratio_intra_vs_inter": float(ratio),
    "top_hubs": [(str(f), float(d)) for d, f in top_hubs],
    "top_module_connections": [(str(e), float(s)) for e, s in top_module_edges[:10]],
    "interpretation": (
        "Codebase topology: files within modules are "
        f"{ratio:.1f}x more similar than files across modules. "
        "This mirrors the Engram's expected behavior: modules form simplicial "
        "clusters, cross-module imports form 1-skeleton edges."
    ) if mean_intra > 0 else "Insufficient data.",
}
# docs/ may not exist on a fresh checkout — create it before writing.
OUT_DIR.mkdir(parents=True, exist_ok=True)
with open(OUT_DIR / "results_codemap.json", "w") as f:
    json.dump(results, f, indent=2)
print("[CODEMAP] Saved results_codemap.json")