Spaces:

Executor-Tyrant-Framework
/

clawdbot-dev

Running

App Files Files Community

Executor-Tyrant-Framework committed 18 days ago

Commit

584f417

verified ·

1 Parent(s): e965918

Update recursive_context.py

Browse files

Files changed (1) hide show

recursive_context.py +73 -384

recursive_context.py CHANGED Viewed

@@ -31,6 +31,11 @@ PRESERVED: All existing functions from prior changelogs remain intact.
   search_code, read_file, save_conversation_turn — all unchanged.
 NOTE: get_stats() is critical — app.py calls it at module level during UI
   construction AND in the system prompt. Missing it = instant crash.
 """
 from pathlib import Path
@@ -50,12 +55,6 @@ import re
 # =============================================================================
 # CHROMA DB PATH SELECTION
 # =============================================================================
-# CHANGELOG [2026-01-31 - Gemini]
-# HF Spaces Docker containers wipe everything EXCEPT /data on restart.
-# We prefer /data/chroma_db (persistent) but fall back to /workspace/chroma_db
-# (ephemeral) if /data isn't writable.
-# =============================================================================
 def _select_chroma_path():
     """HF Spaces Docker containers wipe everything EXCEPT /data on restart."""
     data_path = Path("/data/chroma_db")
@@ -77,11 +76,6 @@ CHROMA_DB_PATH = _select_chroma_path()
 # =============================================================================
 # HF DATASET PERSISTENCE
 # =============================================================================
-# CHANGELOG [2026-01-31 - Gemini]
-# Handles durable cloud storage via HF Dataset repository. Conversations
-# survive Space restarts by backing up to a private dataset repo.
-# =============================================================================
 class HFDatasetPersistence:
     """Handles durable cloud storage via your 1TB PRO Dataset repository."""
@@ -159,31 +153,8 @@ class HFDatasetPersistence:
 # =============================================================================
 class RecursiveContextManager:
-    """Manages unlimited context and vibe-coding tools for E-T Systems.
-    CHANGELOG [2026-01-31 - Claude/Opus]
-    This is the core class. It provides:
-    - ChromaDB-backed semantic search over the codebase and conversations
-    - File read/write with changelog enforcement
-    - Shell execution for build tasks
-    - Shadow branching for safe experimentation
-    - Stats reporting for the UI sidebar
-    - Repository indexing (background thread on init)
-    ARCHITECTURE NOTE:
-    The class is initialized once at module level in app.py. That means
-    __init__ runs during import, so it MUST NOT block or crash. Heavy work
-    (like indexing the repo) is dispatched to a background thread.
-    get_stats() must return sensible defaults even before indexing completes.
-    """
-    # =========================================================================
-    # FILE EXTENSIONS TO INDEX
-    # =========================================================================
-    # CHANGELOG [2026-01-31 - Claude/Opus]
-    # Only index code/text files. Binary files, images, and large data files
-    # would pollute the vector space and waste embedding compute.
-    # =========================================================================
     INDEXABLE_EXTENSIONS = {
         '.py', '.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs',
         '.json', '.yaml', '.yml', '.toml',
@@ -191,27 +162,18 @@ class RecursiveContextManager:
         '.html', '.css', '.scss',
         '.sh', '.bash',
         '.sql',
-        '.env.example',  # Not .env itself — that's sensitive
         '.gitignore', '.dockerignore',
         '.cfg', '.ini', '.conf',
     }
-    # Max file size to index (256KB). Larger files are likely generated/data.
     MAX_INDEX_SIZE = 256 * 1024
     def __init__(self, repo_path: str):
         self.repo_path = Path(repo_path)
         self.persistence = HFDatasetPersistence()
-        # =================================================================
-        # EMBEDDING CONFIG
-        # =================================================================
-        # CHANGELOG [2026-01-31 - Gemini]
-        # Fixes /.cache PermissionError. ChromaDB's ONNXMiniLM_L6_V2 tries
-        # to download model weights to ~/.cache. In Docker as UID 1000,
-        # that's /.cache (root-owned). We override DOWNLOAD_PATH to a
-        # writable directory.
-        # =================================================================
         self.embedding_function = ONNXMiniLM_L6_V2()
         cache_dir = os.getenv("CHROMA_CACHE_DIR", "/tmp/.cache/chroma")
         self.embedding_function.DOWNLOAD_PATH = cache_dir
@@ -232,17 +194,9 @@ class RecursiveContextManager:
             embedding_function=self.embedding_function
         )
-        # Restore conversations from cloud backup if local is empty
         if self.conversations.count() == 0:
             self._restore_from_cloud()
-        # =================================================================
-        # BACKGROUND INDEXING
-        # =================================================================
-        # CHANGELOG [2026-01-31 - Claude/Opus]
-        # Index the repository in a background thread so startup isn't
-        # blocked. The _indexing flag lets get_stats() report status.
-        # =================================================================
         self._indexing = False
         self._index_error = None
         self._indexed_file_count = 0
@@ -250,13 +204,6 @@ class RecursiveContextManager:
             self._start_background_indexing()
     def _restore_from_cloud(self):
-        """Restore conversation history from HF Dataset backup.
-        CHANGELOG [2026-01-31 - Gemini]
-        Called during init if the local ChromaDB conversations collection
-        is empty. Pulls from the cloud dataset repo to recover history
-        after a Space restart.
-        """
         data = self.persistence.load_conversations()
         for conv in data:
             try:
@@ -269,50 +216,19 @@ class RecursiveContextManager:
                 pass
     def _get_collection_name(self) -> str:
-        """Generate a deterministic collection name from the repo path.
-        CHANGELOG [2025-01-28 - Josh]
-        Uses MD5 hash of repo path so different repos get different
-        collections within the same ChromaDB instance.
-        """
         path_hash = hashlib.md5(str(self.repo_path).encode()).hexdigest()[:8]
         return f"codebase_{path_hash}"
     # =====================================================================
     # REPOSITORY INDEXING
     # =====================================================================
-    # CHANGELOG [2026-01-31 - Claude/Opus]
-    # Without indexing, search_code() always returns empty results because
-    # nothing is ever added to the ChromaDB codebase collection. This walks
-    # the repo, reads indexable files, chunks them, and upserts into ChromaDB.
-    #
-    # DESIGN DECISIONS:
-    # - Background thread: Don't block Gradio startup. Users can chat while
-    #   indexing runs. get_stats() shows indexing progress.
-    # - Chunk by logical blocks: Split files into ~50-line chunks with overlap
-    #   so semantic search finds relevant sections, not just file-level matches.
-    # - Upsert (not add): Safe to re-run. If the file was already indexed
-    #   with the same content hash, ChromaDB skips it.
-    # - Skip .git, __pycache__, node_modules, venv: No value in indexing these.
-    #
-    # TESTED ALTERNATIVES (graveyard):
-    # - Indexing entire files as single documents: Poor search precision.
-    #   A 500-line file matching on line 3 returns all 500 lines.
-    # - Line-by-line indexing: Too many tiny documents, poor semantic context.
-    # - Synchronous indexing: Blocks startup for 30+ seconds on large repos.
-    # =====================================================================
     def _start_background_indexing(self):
-        """Kick off repo indexing in a daemon thread."""
         self._indexing = True
         thread = threading.Thread(target=self._index_repository, daemon=True)
         thread.start()
     def _index_repository(self):
-        """Walk the repo and index code files into ChromaDB.
-        Runs in background thread. Sets self._indexing = False when done.
-        """
         try:
             skip_dirs = {
                 '.git', '__pycache__', 'node_modules', 'venv', '.venv',
@@ -322,39 +238,23 @@ class RecursiveContextManager:
             count = 0
             for file_path in self.repo_path.rglob('*'):
-                # Skip directories and non-indexable files
-                if file_path.is_dir():
-                    continue
-                # Skip files in excluded directories
-                if any(skip in file_path.parts for skip in skip_dirs):
-                    continue
-                # Check extension
                 suffix = file_path.suffix.lower()
                 if suffix not in self.INDEXABLE_EXTENSIONS:
-                    # Also allow extensionless files if they look like configs
-                    if file_path.name not in {
-                        'Dockerfile', 'Makefile', 'Procfile',
-                        '.gitignore', '.dockerignore', '.env.example'
-                    }:
                         continue
-                # Check size
                 try:
-                    if file_path.stat().st_size > self.MAX_INDEX_SIZE:
-                        continue
-                except OSError:
-                    continue
-                # Read and chunk the file
                 try:
                     content = file_path.read_text(encoding='utf-8', errors='ignore')
-                except (OSError, UnicodeDecodeError):
-                    continue
-                if not content.strip():
-                    continue
                 rel_path = str(file_path.relative_to(self.repo_path))
                 chunks = self._chunk_file(content, rel_path)
@@ -366,8 +266,7 @@ class RecursiveContextManager:
                             metadatas=[chunk_meta],
                             ids=[chunk_id]
                         )
-                    except Exception:
-                        continue
                 count += 1
                 self._indexed_file_count = count
@@ -378,27 +277,12 @@ class RecursiveContextManager:
             self._indexing = False
     def _chunk_file(self, content: str, rel_path: str) -> List[Tuple[str, str, dict]]:
-        """Split a file into overlapping chunks for better search precision.
-        CHANGELOG [2026-01-31 - Claude/Opus]
-        Returns list of (id, text, metadata) tuples ready for ChromaDB upsert.
-        Chunks are ~50 lines with 10-line overlap so context isn't lost at
-        chunk boundaries.
-        Args:
-            content: Full file text
-            rel_path: Path relative to repo root (used in metadata and IDs)
-        Returns:
-            List of (chunk_id, chunk_text, metadata_dict) tuples
-        """
         lines = content.split('\n')
         chunks = []
         chunk_size = 50
         overlap = 10
         if len(lines) <= chunk_size:
-            # Small file — index as single chunk
             content_hash = hashlib.md5(content.encode()).hexdigest()[:12]
             chunk_id = f"{rel_path}::full::{content_hash}"
             meta = {
@@ -409,7 +293,6 @@ class RecursiveContextManager:
             }
             chunks.append((chunk_id, content, meta))
         else:
-            # Larger file — split into overlapping chunks
             start = 0
             chunk_num = 0
             while start < len(lines):
@@ -430,54 +313,36 @@ class RecursiveContextManager:
         return chunks
     # =====================================================================
-    # STATS (NEW — was missing, caused crash)
-    # =====================================================================
-    # CHANGELOG [2026-01-31 - Claude/Opus]
-    # app.py calls ctx.get_stats() at module level during Gradio Block
-    # construction AND in the system prompt for every message. It expected
-    # a dict with 'conversations', 'total_files', etc. Without this method,
-    # the app crashes immediately on import.
-    #
-    # Returns safe defaults during indexing so the UI can render.
     # =====================================================================
     def get_stats(self) -> dict:
-        """Return system statistics for the UI sidebar and system prompt.
-        Returns:
-            dict with keys: total_files, indexed_chunks, conversations,
-            chroma_path, persistence_configured, indexing_in_progress,
-            index_error
-        """
-        return {
-            'total_files': self._indexed_file_count,
-            'indexed_chunks': self.collection.count(),
-            'conversations': self.conversations.count(),
-            'chroma_path': CHROMA_DB_PATH,
-            'persistence_configured': self.persistence.is_configured,
-            'indexing_in_progress': self._indexing,
-            'index_error': self._index_error,
-        }
     # =====================================================================
-    # PHASE 1 ORCHESTRATOR TOOLS (preserved from Gemini)
     # =====================================================================
     def create_shadow_branch(self):
-        """Creates a timestamped backup branch of the E-T Systems Space.
-        CHANGELOG [2026-01-31 - Gemini]
-        Safety net before any destructive operations. Creates a branch
-        named vibe-backup-YYYYMMDD-HHMMSS on the E-T Systems HF Space
-        so you can always roll back.
-        """
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         branch_name = f"vibe-backup-{timestamp}"
         try:
-            repo_id = os.getenv(
-                "ET_SYSTEMS_SPACE",
-                "Executor-Tyrant-Framework/Executor-Framworks_Full_VDB"
-            )
             self.persistence.api.create_branch(
                 repo_id=repo_id,
                 branch=branch_name,
@@ -489,46 +354,42 @@ class RecursiveContextManager:
             return f"⚠️ Shadow branch failed: {e}"
     def write_file(self, path: str, content: str):
-        """Writes file strictly if valid CHANGELOG is present.
-        CHANGELOG [2026-01-31 - Gemini]
-        Enforces the living changelog pattern. Any code written by an agent
-        MUST include a CHANGELOG [YYYY-MM-DD - AgentName] header or the
-        write is rejected. This is non-negotiable for the E-T Systems
-        development workflow.
-        Args:
-            path: Relative path within the repo (e.g., "server/routes.ts")
-            content: Full file content (must contain CHANGELOG header)
-        Returns:
-            Success message or rejection reason
-        """
         if not re.search(r"CHANGELOG \[\d{4}-\d{2}-\d{2} - \w+\]", content):
-            return "REJECTED: Missing mandatory CHANGELOG [YYYY-MM-DD - AgentName] header."
         try:
             full_path = self.repo_path / path
             full_path.parent.mkdir(parents=True, exist_ok=True)
             full_path.write_text(content)
-            return f"✅ Successfully wrote {path}"
         except Exception as e:
             return f"Error writing file: {e}"
     def shell_execute(self, command: str):
-        """Runs shell commands in the /workspace directory.
-        CHANGELOG [2026-01-31 - Gemini]
-        Used for build tasks, git operations, dependency installs, etc.
-        Timeout of 30 seconds prevents runaway processes. Captures both
-        stdout and stderr for full diagnostic output.
-        Args:
-            command: Shell command string to execute
-        Returns:
-            Combined stdout/stderr output or error message
-        """
         try:
             result = subprocess.run(
                 command, shell=True, capture_output=True, text=True,
@@ -543,20 +404,6 @@ class RecursiveContextManager:
     # =====================================================================
     def search_code(self, query: str, n: int = 5) -> List[Dict]:
-        """Semantic search across the indexed codebase.
-        CHANGELOG [2025-01-28 - Josh]
-        Core tool for the MIT recursive context technique. The model calls
-        this to find relevant code without loading the entire repo into
-        context.
-        Args:
-            query: Natural language search query
-            n: Max number of results to return (default 5)
-        Returns:
-            List of dicts with 'file' (path) and 'snippet' (first 500 chars)
-        """
         if self.collection.count() == 0:
             return []
         actual_n = min(n, self.collection.count())
@@ -567,23 +414,6 @@ class RecursiveContextManager:
         ]
     def read_file(self, path: str, start_line: int = None, end_line: int = None) -> str:
-        """Read a specific file, optionally a line range.
-        CHANGELOG [2025-01-28 - Josh]
-        Direct file access for when the model knows exactly what it needs.
-        CHANGELOG [2026-01-31 - Claude/Opus]
-        Added optional start_line/end_line params for reading specific
-        sections without loading entire large files into context.
-        Args:
-            path: Relative path within repo (e.g., "server/routes.ts")
-            start_line: Optional 1-based start line
-            end_line: Optional 1-based end line
-        Returns:
-            File contents (full or sliced) or "File not found." message
-        """
         p = self.repo_path / path
         if not p.exists():
             return f"File not found: {path}"
@@ -591,7 +421,7 @@ class RecursiveContextManager:
             content = p.read_text(encoding='utf-8', errors='ignore')
             if start_line is not None or end_line is not None:
                 lines = content.split('\n')
-                start = (start_line or 1) - 1  # Convert to 0-based
                 end = end_line or len(lines)
                 sliced = lines[start:end]
                 return '\n'.join(sliced)
@@ -600,25 +430,9 @@ class RecursiveContextManager:
             return f"Error reading {path}: {e}"
     def list_files(self, path: str = "", max_depth: int = 3) -> str:
-        """List files and directories at a given path.
-        CHANGELOG [2026-01-31 - Claude/Opus]
-        Directory exploration tool. The agent needs to know what files exist
-        before it can read or search them. Returns a tree-formatted listing
-        up to max_depth levels deep.
-        Args:
-            path: Relative path within repo (default "" = repo root)
-            max_depth: How many levels deep to list (default 3)
-        Returns:
-            Formatted string showing directory tree
-        """
         target = self.repo_path / path
-        if not target.exists():
-            return f"Path not found: {path}"
-        if not target.is_dir():
-            return f"Not a directory: {path}"
         skip_dirs = {
             '.git', '__pycache__', 'node_modules', 'venv', '.venv',
@@ -628,16 +442,13 @@ class RecursiveContextManager:
         lines = [f"📂 {path or '(repo root)'}"]
         def _walk(dir_path: Path, prefix: str, depth: int):
-            if depth > max_depth:
-                return
             try:
                 entries = sorted(dir_path.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
-            except PermissionError:
-                return
             for i, entry in enumerate(entries):
-                if entry.name in skip_dirs or entry.name.startswith('.'):
-                    continue
                 is_last = (i == len(entries) - 1)
                 connector = "└── " if is_last else "├── "
                 if entry.is_dir():
@@ -653,110 +464,31 @@ class RecursiveContextManager:
         return '\n'.join(lines)
     def search_conversations(self, query: str, n: int = 5) -> List[Dict]:
-        """Semantic search over past conversation history.
-        CHANGELOG [2026-01-31 - Claude/Opus]
-        This is how Clawdbot "remembers" past discussions. Conversations
-        are saved to ChromaDB via save_conversation_turn() and backed up
-        to the HF Dataset repo. This searches them semantically.
-        Args:
-            query: Natural language search query
-            n: Max results to return
-        Returns:
-            List of dicts with 'content' and 'metadata' from matched turns
-        """
-        if self.conversations.count() == 0:
-            return []
         actual_n = min(n, self.conversations.count())
         res = self.conversations.query(query_texts=[query], n_results=actual_n)
         results = []
         for doc, meta in zip(res['documents'][0], res['metadatas'][0]):
-            results.append({
-                'content': doc[:1000],  # Cap at 1000 chars per result
-                'metadata': meta
-            })
         return results
     def search_testament(self, query: str, n: int = 5) -> List[Dict]:
-        """Search for Testament/architectural decision records.
-        CHANGELOG [2026-01-31 - Claude/Opus]
-        The Testament contains design decisions, constitutional principles,
-        and architectural rationale for E-T Systems. This searches for
-        testament-specific files first (TESTAMENT.md, DECISIONS.md, etc.),
-        then falls back to general codebase search filtered for decision-
-        related content.
-        Args:
-            query: What architectural decision to search for
-            n: Max results
-        Returns:
-            List of dicts with 'file' and 'snippet' from matching documents
-        """
-        # First, look for dedicated testament/decision files
-        testament_names = {
-            'testament', 'decisions', 'adr', 'architecture',
-            'principles', 'constitution', 'changelog', 'design'
-        }
         testament_results = []
         if self.collection.count() > 0:
-            # Search the codebase but prefer testament-like files
-            actual_n = min(n * 2, self.collection.count())  # Get extra, then filter
             res = self.collection.query(query_texts=[query], n_results=actual_n)
             for doc, meta in zip(res['documents'][0], res['metadatas'][0]):
                 path_lower = meta.get('path', '').lower()
-                # Check if this is a testament/decision file
                 is_testament = any(name in path_lower for name in testament_names)
                 testament_results.append({
                     'file': meta['path'],
                     'snippet': doc[:500],
                     'is_testament': is_testament
                 })
-        # Sort: testament files first, then other matches
         testament_results.sort(key=lambda r: (not r.get('is_testament', False)))
         return testament_results[:n]
-    def get_stats(self) -> dict:
-        """WHY: Provides the 'Face Documentation' for the sidebar metrics."""
-        try:
-            return {
-                "total_files": self.collection.count(),
-                "indexed_chunks": self.collection.count(),
-                "conversations": self.conversations.count(),
-                "chroma_path": str(CHROMA_DB_PATH),
-                "persistence_configured": self.persistence.is_configured,
-                "indexing_in_progress": False
-            }
-        except Exception as e:
-            return {"index_error": str(e)}
-    def save_conversation_turn(self, u, a, t_id):
-        """WHY: Prevents amnesia by pulling FULL history before cloud push."""
-        combined = f"USER: {u}\n\nASSISTANT: {a}"
-        u_id = f"turn_{int(time.time())}"
-        # 1. Save locally to ChromaDB
-        self.conversations.add(documents=[combined], metadatas=[{"turn": t_id}], ids=[u_id])
-        # 2. Retrieve the complete record to avoid overwriting history with one turn
-        all_convs = self.conversations.get()
-        full_data = []
-        for i in range(len(all_convs['ids'])):
-            full_data.append({
-                "document": all_convs['documents'][i],
-                "metadata": all_convs['metadatas'][i],
-                "id": all_convs['ids'][i]
-            })
-        # 3. Push complete manifest back to your PRO storage
-        self.persistence.save_conversations(full_data)
     def save_conversation_turn(self, u, a, t_id):
         """WHY: Pulls the FULL history before pushing to cloud to prevent memory loss."""
         combined = f"USER: {u}\n\nASSISTANT: {a}"
@@ -776,47 +508,4 @@ class RecursiveContextManager:
             })
         # 3. Push the entire manifest back to your PRO storage dataset
-        self.persistence.save_conversations(full_data)
-    def save_conversation_turn(self, u, a, t_id):
-        """WHY: Prevents amnesia by pushing the FULL history to the cloud, not just the last turn."""
-        combined = f"USER: {u}\n\nASSISTANT: {a}"
-        u_id = f"turn_{int(time.time())}"
-        # 1. Save locally to Chroma
-        self.conversations.add(documents=[combined], metadatas=[{"turn": t_id}], ids=[u_id])
-        # 2. Retrieve ALL history so the cloud backup is a complete record
-        all_convs = self.conversations.get()
-        full_data = []
-        for i in range(len(all_convs['ids'])):
-            full_data.append({
-                "document": all_convs['documents'][i],
-                "metadata": all_convs['metadatas'][i],
-                "id": all_convs['ids'][i]
-            })
-        # 3. Push complete manifest to PRO storage
-        self.persistence.save_conversations(full_data)
-    def save_conversation_turn(self, u, a, t_id):
-        """Save turn locally and push the FULL history to the cloud to prevent memory loss."""
-        combined = f"USER: {u}\n\nASSISTANT: {a}"
-        u_id = f"turn_{int(time.time())}"
-        # 1. Save locally
-        self.conversations.add(documents=[combined], metadatas=[{"turn": t_id}], ids=[u_id])
-        # 2. To prevent amnesia, we must retrieve ALL historical turns from the local database
-        all_convs = self.conversations.get()
-        data_to_save = []
-        for i in range(len(all_convs['ids'])):
-            data_to_save.append({
-                "document": all_convs['documents'][i],
-                "metadata": all_convs['metadatas'][i],
-                "id": all_convs['ids'][i]
-            })
-        # 3. Push the COMPLETE history to your PRO storage (replaces the previous file)
-        self.persistence.save_conversations(data_to_save)

   search_code, read_file, save_conversation_turn — all unchanged.
 NOTE: get_stats() is critical — app.py calls it at module level during UI
   construction AND in the system prompt. Missing it = instant crash.
+CHANGELOG [2026-02-02 - Gemini Pro]
+FIXED: write_file now pushes to Remote Space (Permanent Persistence).
+FIXED: Relaxed CHANGELOG check to non-blocking warning.
+CLEANED: Removed duplicate function definitions at EOF.
 """
 from pathlib import Path
 # =============================================================================
 # CHROMA DB PATH SELECTION
 # =============================================================================
 def _select_chroma_path():
     """HF Spaces Docker containers wipe everything EXCEPT /data on restart."""
     data_path = Path("/data/chroma_db")
 # =============================================================================
 # HF DATASET PERSISTENCE
 # =============================================================================
 class HFDatasetPersistence:
     """Handles durable cloud storage via your 1TB PRO Dataset repository."""
 # =============================================================================
 class RecursiveContextManager:
+    """Manages unlimited context and vibe-coding tools for E-T Systems."""
     INDEXABLE_EXTENSIONS = {
         '.py', '.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs',
         '.json', '.yaml', '.yml', '.toml',
         '.html', '.css', '.scss',
         '.sh', '.bash',
         '.sql',
+        '.env.example',
         '.gitignore', '.dockerignore',
         '.cfg', '.ini', '.conf',
     }
     MAX_INDEX_SIZE = 256 * 1024
     def __init__(self, repo_path: str):
         self.repo_path = Path(repo_path)
         self.persistence = HFDatasetPersistence()
+        # Embedding Config
         self.embedding_function = ONNXMiniLM_L6_V2()
         cache_dir = os.getenv("CHROMA_CACHE_DIR", "/tmp/.cache/chroma")
         self.embedding_function.DOWNLOAD_PATH = cache_dir
             embedding_function=self.embedding_function
         )
         if self.conversations.count() == 0:
             self._restore_from_cloud()
         self._indexing = False
         self._index_error = None
         self._indexed_file_count = 0
             self._start_background_indexing()
     def _restore_from_cloud(self):
         data = self.persistence.load_conversations()
         for conv in data:
             try:
                 pass
     def _get_collection_name(self) -> str:
         path_hash = hashlib.md5(str(self.repo_path).encode()).hexdigest()[:8]
         return f"codebase_{path_hash}"
     # =====================================================================
     # REPOSITORY INDEXING
     # =====================================================================
     def _start_background_indexing(self):
         self._indexing = True
         thread = threading.Thread(target=self._index_repository, daemon=True)
         thread.start()
     def _index_repository(self):
         try:
             skip_dirs = {
                 '.git', '__pycache__', 'node_modules', 'venv', '.venv',
             count = 0
             for file_path in self.repo_path.rglob('*'):
+                if file_path.is_dir(): continue
+                if any(skip in file_path.parts for skip in skip_dirs): continue
                 suffix = file_path.suffix.lower()
                 if suffix not in self.INDEXABLE_EXTENSIONS:
+                    if file_path.name not in {'Dockerfile', 'Makefile', 'Procfile', '.gitignore', '.dockerignore', '.env.example'}:
                         continue
                 try:
+                    if file_path.stat().st_size > self.MAX_INDEX_SIZE: continue
+                except OSError: continue
                 try:
                     content = file_path.read_text(encoding='utf-8', errors='ignore')
+                except (OSError, UnicodeDecodeError): continue
+                if not content.strip(): continue
                 rel_path = str(file_path.relative_to(self.repo_path))
                 chunks = self._chunk_file(content, rel_path)
                             metadatas=[chunk_meta],
                             ids=[chunk_id]
                         )
+                    except Exception: continue
                 count += 1
                 self._indexed_file_count = count
             self._indexing = False
     def _chunk_file(self, content: str, rel_path: str) -> List[Tuple[str, str, dict]]:
         lines = content.split('\n')
         chunks = []
         chunk_size = 50
         overlap = 10
         if len(lines) <= chunk_size:
             content_hash = hashlib.md5(content.encode()).hexdigest()[:12]
             chunk_id = f"{rel_path}::full::{content_hash}"
             meta = {
             }
             chunks.append((chunk_id, content, meta))
         else:
             start = 0
             chunk_num = 0
             while start < len(lines):
         return chunks
     # =====================================================================
+    # STATS
     # =====================================================================
     def get_stats(self) -> dict:
         """Return system statistics for the UI sidebar and system prompt.

         The returned dict always contains the same keys, even when one of
         the backing stores fails, so callers that index the result by key
         (e.g. UI construction) do not hit a KeyError on a degraded backend.

         Returns:
             dict with keys ``total_files``, ``indexed_chunks``,
             ``conversations``, ``chroma_path``, ``persistence_configured``,
             ``indexing_in_progress`` and ``index_error``. If gathering the
             live values raises, the defaults below are kept and
             ``index_error`` holds the exception text.
         """
         # Safe defaults: used verbatim if anything below raises.
         stats = {
             'total_files': 0,
             'indexed_chunks': 0,
             'conversations': 0,
             'chroma_path': CHROMA_DB_PATH,
             'persistence_configured': False,
             'indexing_in_progress': False,
             'index_error': None,
         }
         try:
             # The literal is fully evaluated before update() runs, so a
             # failure in any lookup leaves the defaults untouched.
             stats.update({
                 'total_files': self._indexed_file_count,
                 'indexed_chunks': self.collection.count(),
                 'conversations': self.conversations.count(),
                 'persistence_configured': self.persistence.is_configured,
                 'indexing_in_progress': self._indexing,
                 'index_error': self._index_error,
             })
         except Exception as e:
             # Boundary handler: stats must never crash the caller.
             stats['index_error'] = str(e)
         return stats
     # =====================================================================
+    # PHASE 1 ORCHESTRATOR TOOLS
     # =====================================================================
     def create_shadow_branch(self):
+        """Creates a timestamped backup branch of the E-T Systems Space."""
         timestamp = time.strftime("%Y%m%d-%H%M%S")
         branch_name = f"vibe-backup-{timestamp}"
         try:
+            repo_id = os.getenv("ET_SYSTEMS_SPACE")
+            if not repo_id: return "Error: ET_SYSTEMS_SPACE env var not set."
             self.persistence.api.create_branch(
                 repo_id=repo_id,
                 branch=branch_name,
             return f"⚠️ Shadow branch failed: {e}"
     def write_file(self, path: str, content: str):
         """Write a file locally and, when configured, push it to the remote HF Space.

         Args:
             path: Destination path, joined onto ``self.repo_path``. It is also
                 used verbatim as ``path_in_repo`` for the remote upload.
             content: Full text to write. Written as UTF-8 so that it matches
                 how the rest of this module reads files back.

         Returns:
             A status string. On success it starts with "✅ Wrote <path>" and
             may carry a missing-CHANGELOG warning and/or a remote-push note.
             On a local write failure it is "Error writing file: <exc>".
             This method does not raise for I/O or upload errors.
         """
         # 1. Non-blocking warning instead of rejection
         warning = ""
         if not re.search(r"CHANGELOG \[\d{4}-\d{2}-\d{2} - \w+\]", content):
             warning = "\n⚠️ NOTE: Missing CHANGELOG header."
         try:
             # 2. Write to Local Disk (Container)
             # NOTE(review): `path` is joined without a containment check, so an
             # absolute path or one containing '..' can land outside repo_path.
             full_path = self.repo_path / path
             full_path.parent.mkdir(parents=True, exist_ok=True)
             # Explicit UTF-8: the locale default may not be able to encode
             # non-ASCII content (emoji, accented text) inside a container.
             full_path.write_text(content, encoding='utf-8')
             # 3. Push to Remote Space (Persistence)
             remote_msg = ""
             target_space = os.getenv("ET_SYSTEMS_SPACE")
             if self.persistence.is_configured and target_space:
                 try:
                     self.persistence.api.upload_file(
                         path_or_fileobj=str(full_path),
                         path_in_repo=path,
                         repo_id=target_space,
                         repo_type="space",
                         token=self.persistence.token,
                         commit_message=f"Clawdbot update: {path}"
                     )
                     remote_msg = f"\n🚀 Pushed to remote Space: {target_space}"
                 except Exception as e:
                     # Best-effort push: the local write already succeeded.
                     remote_msg = f"\n⚠️ Local write success, but remote push failed: {e}"
             return f"✅ Wrote {path}{warning}{remote_msg}"
         except Exception as e:
             return f"Error writing file: {e}"
     def shell_execute(self, command: str):
+        """Runs shell commands in the /workspace directory."""
         try:
             result = subprocess.run(
                 command, shell=True, capture_output=True, text=True,
     # =====================================================================
     def search_code(self, query: str, n: int = 5) -> List[Dict]:
         if self.collection.count() == 0:
             return []
         actual_n = min(n, self.collection.count())
         ]
     def read_file(self, path: str, start_line: int = None, end_line: int = None) -> str:
         p = self.repo_path / path
         if not p.exists():
             return f"File not found: {path}"
             content = p.read_text(encoding='utf-8', errors='ignore')
             if start_line is not None or end_line is not None:
                 lines = content.split('\n')
+                start = (start_line or 1) - 1
                 end = end_line or len(lines)
                 sliced = lines[start:end]
                 return '\n'.join(sliced)
             return f"Error reading {path}: {e}"
     def list_files(self, path: str = "", max_depth: int = 3) -> str:
         target = self.repo_path / path
+        if not target.exists(): return f"Path not found: {path}"
+        if not target.is_dir(): return f"Not a directory: {path}"
         skip_dirs = {
             '.git', '__pycache__', 'node_modules', 'venv', '.venv',
         lines = [f"📂 {path or '(repo root)'}"]
         def _walk(dir_path: Path, prefix: str, depth: int):
+            if depth > max_depth: return
             try:
                 entries = sorted(dir_path.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
+            except PermissionError: return
             for i, entry in enumerate(entries):
+                if entry.name in skip_dirs or entry.name.startswith('.'): continue
                 is_last = (i == len(entries) - 1)
                 connector = "└── " if is_last else "├── "
                 if entry.is_dir():
         return '\n'.join(lines)
     def search_conversations(self, query: str, n: int = 5) -> List[Dict]:
         """Look up stored conversation turns that match ``query``.

         Args:
             query: Free-text query passed to the conversations collection.
             n: Upper bound on the number of hits; capped at the number of
                 stored items.

         Returns:
             A list of ``{'content', 'metadata'}`` dicts, with ``content``
             truncated to the first 1000 characters. Empty when the
             collection holds nothing.
         """
         store = self.conversations
         if store.count() == 0:
             return []
         # Never request more results than the collection holds.
         limit = min(n, store.count())
         hits = store.query(query_texts=[query], n_results=limit)
         # Only one query text was sent, so take the first result group.
         texts = hits['documents'][0]
         infos = hits['metadatas'][0]
         return [
             {'content': text[:1000], 'metadata': info}
             for text, info in zip(texts, infos)
         ]
     def search_testament(self, query: str, n: int = 5) -> List[Dict]:
         """Search indexed code, ranking "testament"-style documents first.

         Queries the code collection for up to ``2 * n`` hits, flags each hit
         whose path contains one of the testament keywords (case-insensitive
         substring match), then returns the first ``n`` hits with flagged ones
         ahead of the rest. Within each group the collection's order is kept,
         because the sort is stable.

         Args:
             query: Free-text query passed to the code collection.
             n: Maximum number of results to return.

         Returns:
             A list of ``{'file', 'snippet', 'is_testament'}`` dicts, with
             ``snippet`` limited to the first 500 characters. Empty when the
             collection holds nothing.
         """
         testament_names = {'testament', 'decisions', 'adr', 'architecture', 'principles', 'constitution', 'changelog', 'design'}
         testament_results = []
         if self.collection.count() > 0:
             # Over-fetch so testament documents ranked lower can still be promoted.
             actual_n = min(n * 2, self.collection.count())
             res = self.collection.query(query_texts=[query], n_results=actual_n)
             for doc, meta in zip(res['documents'][0], res['metadatas'][0]):
                 # Read the path once with a tolerant lookup so a chunk that
                 # lacks 'path' metadata does not raise KeyError.
                 path = meta.get('path', '')
                 path_lower = path.lower()
                 is_testament = any(name in path_lower for name in testament_names)
                 testament_results.append({
                     'file': path,
                     'snippet': doc[:500],
                     'is_testament': is_testament
                 })
         # False sorts before True, so negate the flag to put testament hits first.
         testament_results.sort(key=lambda r: (not r.get('is_testament', False)))
         return testament_results[:n]
     def save_conversation_turn(self, u, a, t_id):
         """WHY: Pulls the FULL history before pushing to cloud to prevent memory loss."""
         combined = f"USER: {u}\n\nASSISTANT: {a}"
             })
         # 3. Push the entire manifest back to your PRO storage dataset
+        self.persistence.save_conversations(full_data)