Spaces:
Runtime error
Runtime error
| """ | |
| logos/indexer.py - The Codebase Aggregator | |
| Protocol 4: Automated Context Indexing | |
| This script creates a monolithic "Dump File" of the entire project to facilitate | |
| LLM ingestion and token indexing. | |
| Workflow: | |
| 1. Traverses the project root. | |
| 2. Filters for "Signal" (Source code, Docs) vs "Noise" (Venv, Data, Cache). | |
| 3. Aggregates content into a structured markdown file. | |
| 4. Provides Token Estimation. | |
| Output: 'project_context_dump.md' | |
| """ | |
| import os | |
| import sys | |
| import os | |
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
| import pathspec | |
| from logos.connectors import get_connector | |
# --- Indexer settings -------------------------------------------------------
# Directory the walk starts from, and the markdown file the dump is written to.
PROJECT_ROOT = "."
OUTPUT_FILE = "project_context_dump.md"

# Allow-list of file extensions; anything else is skipped.
INCLUDE_EXTENSIONS = {
    ".bat",   # orchestration
    ".json",  # config (be careful: data files share this extension)
    ".md",    # documentation & knowledge base
    ".py",    # source logic
    ".sh",    # scripts
    ".txt",   # requirements / notes
    ".yaml",  # config
}

# Deny-list of directory names that are never descended into.
IGNORE_DIRS = {
    ".gemini",            # agent memory
    ".git",
    ".venv",
    "LOGOS Notes",        # binary data (ingested via ingest_knowledge.py)
    "LOGOS Screenshots",  # binary data
    "__pycache__",
    "_archive",           # legacy code
    "artifacts",          # agent outputs
    "node_modules",
}

# Upper bound on the size of a single file, in bytes (100KB), so that large
# data files are not dumped by accident.
MAX_FILE_SIZE = 100 * 1024
def load_gitignore(root):
    """Load the ``.gitignore`` found directly under *root*.

    Args:
        root: Directory expected to contain the ``.gitignore`` file.

    Returns:
        A ``pathspec.PathSpec`` built from the file's lines with the
        'gitwildmatch' pattern style, or ``None`` when *root* holds no
        regular ``.gitignore`` file.
    """
    gitignore = os.path.join(root, ".gitignore")
    # isfile (not exists): a directory that happens to be named ".gitignore"
    # is treated as "no ignore file" instead of crashing in open().
    if not os.path.isfile(gitignore):
        return None
    # Decode explicitly so the result does not depend on the platform locale;
    # "utf-8-sig" also drops a leading BOM that would otherwise become part of
    # the first pattern, and errors="replace" keeps a stray byte from aborting
    # the whole run.
    with open(gitignore, "r", encoding="utf-8-sig", errors="replace") as f:
        return pathspec.PathSpec.from_lines('gitwildmatch', f)
def estimate_tokens(text):
    """Approximate the token count of *text*, assuming ~4 characters per token."""
    chars_per_token = 4
    whole_tokens, _remainder = divmod(len(text), chars_per_token)
    return whole_tokens
def main():
    """Aggregate the project's text sources into a single markdown dump.

    Walks ``PROJECT_ROOT`` (pruning every directory named in ``IGNORE_DIRS``)
    and keeps a file only when its extension is in ``INCLUDE_EXTENSIONS``, it
    is not matched by the root ``.gitignore``, it is not the dump file itself,
    and it is no larger than ``MAX_FILE_SIZE`` bytes. Each kept file is
    appended to ``OUTPUT_FILE`` under a banner and inside a fenced block.
    Progress lines and a final summary (file count, character count, token
    estimate) are printed to stdout.
    """
    print("--- LOGOS Indexing Protocol ---")
    print(f"Root: {os.path.abspath(PROJECT_ROOT)}")

    spec = load_gitignore(PROJECT_ROOT)

    # The dump is written inside the tree being walked and carries an indexed
    # extension (.md); without this guard the script would ingest its own
    # (partial or previous) output.
    output_key = os.path.normcase(os.path.abspath(OUTPUT_FILE))
    banner = "================================================================================\n"

    total_files = 0
    total_chars = 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        # Header
        out.write("# LOGOS Project Context Dump\n")
        out.write(f"Generated: {os.path.abspath(OUTPUT_FILE)}\n\n")

        for root, dirs, files in os.walk(PROJECT_ROOT):
            # 1. Filtering directories. Assigning through dirs[:] prunes the
            #    walk in place; sorting makes the dump order reproducible.
            dirs[:] = sorted(d for d in dirs if d not in IGNORE_DIRS)

            for file in sorted(files):
                ext = os.path.splitext(file)[1].lower()

                # 2. Filtering extensions
                if ext not in INCLUDE_EXTENSIONS:
                    continue

                filepath = os.path.join(root, file)
                relpath = os.path.relpath(filepath, PROJECT_ROOT)

                # Never index the dump file itself.
                if os.path.normcase(os.path.abspath(filepath)) == output_key:
                    continue

                # 3. Filtering .gitignore
                if spec and spec.match_file(relpath):
                    continue

                # 4. Filtering size / 5. Ingestion. Only the filesystem
                #    reads are guarded, so a failure while writing the dump
                #    is not misreported as an unreadable source file.
                try:
                    size = os.path.getsize(filepath)
                    if size > MAX_FILE_SIZE:
                        print(f"[SKIP] Too large: {relpath} ({size/1024:.1f}KB)")
                        continue
                    # errors="ignore" drops undecodable bytes rather than
                    # failing on files that are not valid UTF-8.
                    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read()
                except OSError as e:
                    print(f"[WARN] Could not read {relpath}: {e}")
                    continue

                # Format: banner, path, banner, then the content fenced with
                # the extension (minus its dot) as the language tag.
                out.write(banner)
                out.write(f"FILE: {relpath}\n")
                out.write(banner)
                out.write(f"```{ext[1:]}\n")
                out.write(content)
                out.write("\n```\n\n")

                total_files += 1
                total_chars += len(content)
                print(f"[INDEX] Added: {relpath}")

    # Stats (rough estimation: 4 chars ~= 1 token)
    tokens = total_chars // 4
    print("\n--- Indexing Complete ---")
    print(f"Files Processed: {total_files}")
    print(f"Total Characters: {total_chars:,}")
    print(f"Estimated Tokens: {tokens:,}")
    print(f"Output: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()