"""
logos/indexer.py - The Codebase Aggregator
Protocol 4: Automated Context Indexing

This script creates a monolithic "Dump File" of the entire project to facilitate 
LLM ingestion and token indexing.

Workflow:
1. Traverses the project root.
2. Filters for "Signal" (Source code, Docs) vs "Noise" (Venv, Data, Cache).
3. Aggregates content into a structured markdown file.
4. Provides Token Estimation.

Output: 'project_context_dump.md'
"""

import os
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pathspec
from logos.connectors import get_connector

# Configuration
# Both paths are relative, so they resolve against the current working
# directory at run time: the tree that is walked and the dump file written.
PROJECT_ROOT = "."
OUTPUT_FILE = "project_context_dump.md"

# Whitelist: Only these extensions contain "Logic/Knowledge".
# File extensions are lower-cased before the lookup, so entries here must be
# lower-case and include the leading dot.
INCLUDE_EXTENSIONS = {
    '.py',   # Source Logic
    '.md',   # Documentation & Knowledge Base
    '.txt',  # Requirements / Notes
    '.bat',  # Orchestration
    '.sh',   # Scripts
    '.yaml', # Config
    '.json'  # Config (Be careful with data files)
}

# Blacklist: Always ignore these directories.
# Matched against the bare directory name at every depth of the walk, so a
# directory with one of these names is pruned wherever it appears.
IGNORE_DIRS = {
    '.git',
    '.venv',
    '__pycache__',
    'node_modules',
    '_archive',        # Legacy code
    'LOGOS Screenshots', # Binary Data
    'LOGOS Notes',     # Binary Data (Ingested via ingest_knowledge.py)
    '.gemini',         # Agent memory
    'artifacts'        # Agent outputs
}

# Max file size to include (avoid dumping huge data files by accident).
# Compared against the on-disk size in bytes; files strictly larger are skipped.
MAX_FILE_SIZE = 100 * 1024 # 100KB (102,400 bytes)

def load_gitignore(root):
    """Build a path matcher from the ``.gitignore`` found directly in *root*.

    Args:
        root: Directory expected to contain the ``.gitignore`` file.

    Returns:
        A ``pathspec.PathSpec`` compiled from the file's lines with
        ``gitwildmatch`` semantics, or ``None`` when *root* holds no regular
        ``.gitignore`` file.
    """
    gitignore = os.path.join(root, ".gitignore")

    # isfile rather than exists: a directory that happens to be named
    # ".gitignore" would pass an existence check and then fail on open().
    if not os.path.isfile(gitignore):
        return None

    # Decode explicitly instead of relying on the platform's locale encoding.
    # "utf-8-sig" also drops a leading BOM, which would otherwise become part
    # of the first pattern and stop it from matching.
    with open(gitignore, "r", encoding="utf-8-sig") as f:
        return pathspec.PathSpec.from_lines('gitwildmatch', f)

def estimate_tokens(text):
    """Approximate the token count of *text* at roughly 4 characters per token.

    The result is rounded down, so inputs shorter than 4 characters yield 0.
    """
    char_count = len(text)
    whole_tokens, _remainder = divmod(char_count, 4)
    return whole_tokens

def main():
    """Walk PROJECT_ROOT and aggregate every eligible file into OUTPUT_FILE.

    A file is included when all of the following hold:
      * it does not live under a directory named in IGNORE_DIRS,
      * its lower-cased extension is in INCLUDE_EXTENSIONS,
      * it is not the dump file itself,
      * it is not matched by the root ``.gitignore`` (when one exists),
      * its size does not exceed MAX_FILE_SIZE bytes.

    Each included file is written as a banner with its relative path followed
    by its content in a fenced code block. Progress lines and final statistics
    (file count, character count, estimated tokens at 4 chars per token) are
    printed to stdout. Files that cannot be read are reported and skipped;
    failures writing the dump itself are not suppressed.
    """
    print("--- LOGOS Indexing Protocol ---")
    print(f"Root: {os.path.abspath(PROJECT_ROOT)}")

    spec = load_gitignore(PROJECT_ROOT)

    # The dump is normally created inside the tree being walked and carries an
    # included extension, so without this guard it would be read back into
    # itself while it is still being written.
    output_path = os.path.normcase(os.path.abspath(OUTPUT_FILE))

    banner = "================================================================================\n"

    total_files = 0
    total_chars = 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        # Header
        out.write("# LOGOS Project Context Dump\n")
        out.write(f"Generated: {os.path.abspath(OUTPUT_FILE)}\n\n")

        for root, dirs, files in os.walk(PROJECT_ROOT):
            # 1. Filtering Directories
            # Assigning through dirs[:] prunes the walk in place; sorting makes
            # the traversal (and therefore the dump) reproducible across runs.
            dirs[:] = sorted(d for d in dirs if d not in IGNORE_DIRS)

            for file in sorted(files):
                ext = os.path.splitext(file)[1].lower()

                # 2. Filtering Extensions
                if ext not in INCLUDE_EXTENSIONS:
                    continue

                filepath = os.path.join(root, file)
                relpath = os.path.relpath(filepath, PROJECT_ROOT)

                # Never ingest the dump into itself.
                if os.path.normcase(os.path.abspath(filepath)) == output_path:
                    continue

                # 3. Filtering GitIgnore
                if spec and spec.match_file(relpath):
                    continue

                # 4. Filtering Size / 5. Ingestion
                # Only the source-file operations are guarded: a failure while
                # writing the dump must surface instead of being reported as
                # an unreadable input and leaving a silently truncated dump.
                try:
                    size = os.path.getsize(filepath)
                    if size > MAX_FILE_SIZE:
                        print(f"[SKIP] Too large: {relpath} ({size/1024:.1f}KB)")
                        continue

                    # errors="ignore" drops undecodable bytes, so decoding
                    # itself cannot raise here.
                    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read()
                except OSError as e:
                    print(f"[WARN] Could not read {relpath}: {e}")
                    continue

                # Grow the fence until it no longer occurs in the content;
                # otherwise a file that itself contains a ``` fence (typical
                # for Markdown) would close the block early.
                fence = "```"
                while fence in content:
                    fence += "`"

                # Format
                out.write(banner)
                out.write(f"FILE: {relpath}\n")
                out.write(banner)
                out.write(f"{fence}{ext[1:]}\n")
                out.write(content)
                out.write(f"\n{fence}\n\n")

                total_files += 1
                total_chars += len(content)
                print(f"[INDEX] Added: {relpath}")

    # Stats
    tokens = total_chars // 4  # same 4-chars-per-token heuristic as estimate_tokens
    print("\n--- Indexing Complete ---")
    print(f"Files Processed: {total_files}")
    print(f"Total Characters: {total_chars:,}")
    print(f"Estimated Tokens: {tokens:,}")
    print(f"Output: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()