# NOTE(review): the three lines below are pasted commit/UI metadata, not code;
# commented out so the file parses as Python. Original text preserved:
# GitHub Copilot
# Upgrade to Protocol 26: Gödel-Zeta Datastore & Recursive Manifold
# 66b508d
"""
logos/indexer.py - The Codebase Aggregator
Protocol 4: Automated Context indexing
This script creates a monolithic "Dump File" of the entire project to facilitate
LLM ingestion and token indexing.
Workflow:
1. Traverses the project root.
2. Filters for "Signal" (Source code, Docs) vs "Noise" (Venv, Data, Cache).
3. Aggregates content into a structured markdown file.
4. Provides Token Estimation.
Output: 'project_context_dump.md'
"""
import os
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pathspec
from logos.connectors import get_connector
# Configuration
PROJECT_ROOT: str = "."  # walk starts here; run the script from the repo root
OUTPUT_FILE: str = "project_context_dump.md"  # aggregated dump is written here

# Whitelist: Only these extensions contain "Logic/Knowledge"
INCLUDE_EXTENSIONS: set = {
    '.py',   # Source Logic
    '.md',   # Documentation & Knowledge Base
    '.txt',  # Requirements / Notes
    '.bat',  # Orchestration
    '.sh',   # Scripts
    '.yaml', # Config
    '.json'  # Config (Be careful with data files)
}

# Blacklist: Always ignore these directories (pruned in-place during os.walk)
IGNORE_DIRS: set = {
    '.git',
    '.venv',
    '__pycache__',
    'node_modules',
    '_archive',           # Legacy code
    'LOGOS Screenshots',  # Binary Data
    'LOGOS Notes',        # Binary Data (Ingested via ingest_knowledge.py)
    '.gemini',            # Agent memory
    'artifacts'           # Agent outputs
}

# Max file size to include (avoid dumping huge data files by accident)
MAX_FILE_SIZE: int = 100 * 1024  # 100KB
def load_gitignore(root):
    """Return a pathspec.PathSpec built from *root*/.gitignore, or None if absent.

    The returned spec is used by main() to honour the project's own
    exclusion rules on top of the hard-coded blacklists.
    """
    gitignore = os.path.join(root, ".gitignore")
    if os.path.exists(gitignore):
        # Explicit encoding: .gitignore is plain text and the platform
        # default (locale) encoding can mis-decode it, e.g. on Windows.
        with open(gitignore, 'r', encoding="utf-8") as f:
            return pathspec.PathSpec.from_lines('gitwildmatch', f)
    return None
def estimate_tokens(text):
    """Roughly estimate the LLM token count of *text*.

    Uses the common heuristic that one token is about four characters
    of English text; the result is a floor, not an exact count.
    """
    chars_per_token = 4
    return len(text) // chars_per_token
def main():
    """Aggregate all whitelisted project files into OUTPUT_FILE.

    Applies four filters, in order: directory blacklist (IGNORE_DIRS),
    extension whitelist (INCLUDE_EXTENSIONS), the project's .gitignore
    rules (if any), and the MAX_FILE_SIZE cap. Prints per-file progress
    and a final token-estimate summary.
    """
    print("--- LOGOS Indexing Protocol ---")
    print(f"Root: {os.path.abspath(PROJECT_ROOT)}")
    spec = load_gitignore(PROJECT_ROOT)
    total_files = 0
    total_chars = 0
    out_abs = os.path.abspath(OUTPUT_FILE)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        # Header
        out.write("# LOGOS Project Context Dump\n")
        out.write(f"Generated: {out_abs}\n\n")
        for root, dirs, files in os.walk(PROJECT_ROOT):
            # 1. Prune blacklisted directories in-place so os.walk never descends.
            dirs[:] = [d for d in dirs if d not in IGNORE_DIRS]
            for file in files:
                ext = os.path.splitext(file)[1].lower()
                # 2. Extension whitelist.
                if ext not in INCLUDE_EXTENSIONS:
                    continue
                filepath = os.path.join(root, file)
                relpath = os.path.relpath(filepath, PROJECT_ROOT)
                # BUGFIX: never ingest the dump itself. It is an '.md' file in
                # the project root, so on a second run it would otherwise be
                # re-indexed into itself, bloating every subsequent dump.
                if os.path.abspath(filepath) == out_abs:
                    continue
                # 3. Respect .gitignore exclusions, when a spec was loaded.
                if spec and spec.match_file(relpath):
                    continue
                # 4. Size cap: avoid dumping huge data files by accident.
                try:
                    size = os.path.getsize(filepath)
                    if size > MAX_FILE_SIZE:
                        print(f"[SKIP] Too large: {relpath} ({size/1024:.1f}KB)")
                        continue
                    # 5. Ingestion (errors='ignore' tolerates stray non-UTF-8 bytes).
                    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read()
                    # Format: banner + fenced code block tagged with the extension.
                    out.write("================================================================================\n")
                    out.write(f"FILE: {relpath}\n")
                    out.write("================================================================================\n")
                    out.write(f"```{ext[1:]}\n")
                    out.write(content)
                    out.write("\n```\n\n")
                    total_files += 1
                    total_chars += len(content)
                    print(f"[INDEX] Added: {relpath}")
                except Exception as e:
                    # Best-effort boundary: a single unreadable file must not
                    # abort the whole dump; warn and move on.
                    print(f"[WARN] Could not read {relpath}: {e}")
    # Stats (same ~4-chars-per-token heuristic as estimate_tokens).
    tokens = total_chars // 4
    print("\n--- Indexing Complete ---")
    print(f"Files Processed: {total_files}")
    print(f"Total Characters: {total_chars:,}")
    print(f"Estimated Tokens: {tokens:,}")
    print(f"Output: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()