""" logos/indexer.py - The Codebase Aggregator Protocol 4: Automated Context indexing This script creates a monolithic "Dump File" of the entire project to facilitate LLM ingestion and token indexing. Workflow: 1. Traverses the project root. 2. Filters for "Signal" (Source code, Docs) vs "Noise" (Venv, Data, Cache). 3. Aggregates content into a structured markdown file. 4. Provides Token Estimation. Output: 'project_context_dump.md' """ import os import sys import os sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) import pathspec from logos.connectors import get_connector # Configuration PROJECT_ROOT = "." OUTPUT_FILE = "project_context_dump.md" # Whitelist: Only these extensions contain "Logic/Knowledge" INCLUDE_EXTENSIONS = { '.py', # Source Logic '.md', # Documentation & Knowledge Base '.txt', # Requirements / Notes '.bat', # Orchestration '.sh', # Scripts '.yaml', # Config '.json' # Config (Be careful with data files) } # Blacklist: Always ignore these directories IGNORE_DIRS = { '.git', '.venv', '__pycache__', 'node_modules', '_archive', # Legacy code 'LOGOS Screenshots', # Binary Data 'LOGOS Notes', # Binary Data (Ingested via ingest_knowledge.py) '.gemini', # Agent memory 'artifacts' # Agent outputs } # Max file size to include (avoid dumping huge data files by accident) MAX_FILE_SIZE = 100 * 1024 # 100KB def load_gitignore(root): """Load .gitignore to respect project exclusions.""" gitignore = os.path.join(root, ".gitignore") if os.path.exists(gitignore): with open(gitignore, 'r') as f: return pathspec.PathSpec.from_lines('gitwildmatch', f) return None def estimate_tokens(text): """Rough estimation: 4 chars ~= 1 token""" return len(text) // 4 def main(): print(f"--- LOGOS Indexing Protocol ---") print(f"Root: {os.path.abspath(PROJECT_ROOT)}") spec = load_gitignore(PROJECT_ROOT) total_files = 0 total_chars = 0 with open(OUTPUT_FILE, "w", encoding="utf-8") as out: # Header out.write(f"# LOGOS Project Context Dump\n") out.write(f"Generated: {os.path.abspath(OUTPUT_FILE)}\n\n") for root, dirs, files in os.walk(PROJECT_ROOT): # 1. Filtering Directories dirs[:] = [d for d in dirs if d not in IGNORE_DIRS] for file in files: ext = os.path.splitext(file)[1].lower() # 2. Filtering Extensions if ext not in INCLUDE_EXTENSIONS: continue filepath = os.path.join(root, file) relpath = os.path.relpath(filepath, PROJECT_ROOT) # 3. Filtering GitIgnore if spec and spec.match_file(relpath): continue # 4. Filtering Size try: size = os.path.getsize(filepath) if size > MAX_FILE_SIZE: print(f"[SKIP] Too large: {relpath} ({size/1024:.1f}KB)") continue # 5. Ingestion with open(filepath, "r", encoding="utf-8", errors="ignore") as f: content = f.read() # Format out.write(f"================================================================================\n") out.write(f"FILE: {relpath}\n") out.write(f"================================================================================\n") out.write(f"```{ext[1:]}\n") out.write(content) out.write(f"\n```\n\n") total_files += 1 total_chars += len(content) print(f"[INDEX] Added: {relpath}") except Exception as e: print(f"[WARN] Could not read {relpath}: {e}") # Stats tokens = total_chars // 4 print(f"\n--- Indexing Complete ---") print(f"Files Processed: {total_files}") print(f"Total Characters: {total_chars:,}") print(f"Estimated Tokens: {tokens:,}") print(f"Output: {OUTPUT_FILE}") if __name__ == "__main__": main()