Executor-Tyrant-Framework committed on
Commit
584f417
·
verified ·
1 Parent(s): e965918

Update recursive_context.py

Browse files
Files changed (1) hide show
  1. recursive_context.py +73 -384
recursive_context.py CHANGED
@@ -31,6 +31,11 @@ PRESERVED: All existing functions from prior changelogs remain intact.
31
  search_code, read_file, save_conversation_turn — all unchanged.
32
  NOTE: get_stats() is critical — app.py calls it at module level during UI
33
  construction AND in the system prompt. Missing it = instant crash.
 
 
 
 
 
34
  """
35
 
36
  from pathlib import Path
@@ -50,12 +55,6 @@ import re
50
  # =============================================================================
51
  # CHROMA DB PATH SELECTION
52
  # =============================================================================
53
- # CHANGELOG [2026-01-31 - Gemini]
54
- # HF Spaces Docker containers wipe everything EXCEPT /data on restart.
55
- # We prefer /data/chroma_db (persistent) but fall back to /workspace/chroma_db
56
- # (ephemeral) if /data isn't writable.
57
- # =============================================================================
58
-
59
  def _select_chroma_path():
60
  """HF Spaces Docker containers wipe everything EXCEPT /data on restart."""
61
  data_path = Path("/data/chroma_db")
@@ -77,11 +76,6 @@ CHROMA_DB_PATH = _select_chroma_path()
77
  # =============================================================================
78
  # HF DATASET PERSISTENCE
79
  # =============================================================================
80
- # CHANGELOG [2026-01-31 - Gemini]
81
- # Handles durable cloud storage via HF Dataset repository. Conversations
82
- # survive Space restarts by backing up to a private dataset repo.
83
- # =============================================================================
84
-
85
  class HFDatasetPersistence:
86
  """Handles durable cloud storage via your 1TB PRO Dataset repository."""
87
 
@@ -159,31 +153,8 @@ class HFDatasetPersistence:
159
  # =============================================================================
160
 
161
  class RecursiveContextManager:
162
- """Manages unlimited context and vibe-coding tools for E-T Systems.
163
-
164
- CHANGELOG [2026-01-31 - Claude/Opus]
165
- This is the core class. It provides:
166
- - ChromaDB-backed semantic search over the codebase and conversations
167
- - File read/write with changelog enforcement
168
- - Shell execution for build tasks
169
- - Shadow branching for safe experimentation
170
- - Stats reporting for the UI sidebar
171
- - Repository indexing (background thread on init)
172
-
173
- ARCHITECTURE NOTE:
174
- The class is initialized once at module level in app.py. That means
175
- __init__ runs during import, so it MUST NOT block or crash. Heavy work
176
- (like indexing the repo) is dispatched to a background thread.
177
- get_stats() must return sensible defaults even before indexing completes.
178
- """
179
-
180
- # =========================================================================
181
- # FILE EXTENSIONS TO INDEX
182
- # =========================================================================
183
- # CHANGELOG [2026-01-31 - Claude/Opus]
184
- # Only index code/text files. Binary files, images, and large data files
185
- # would pollute the vector space and waste embedding compute.
186
- # =========================================================================
187
  INDEXABLE_EXTENSIONS = {
188
  '.py', '.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs',
189
  '.json', '.yaml', '.yml', '.toml',
@@ -191,27 +162,18 @@ class RecursiveContextManager:
191
  '.html', '.css', '.scss',
192
  '.sh', '.bash',
193
  '.sql',
194
- '.env.example', # Not .env itself — that's sensitive
195
  '.gitignore', '.dockerignore',
196
  '.cfg', '.ini', '.conf',
197
  }
198
 
199
- # Max file size to index (256KB). Larger files are likely generated/data.
200
  MAX_INDEX_SIZE = 256 * 1024
201
 
202
  def __init__(self, repo_path: str):
203
  self.repo_path = Path(repo_path)
204
  self.persistence = HFDatasetPersistence()
205
 
206
- # =================================================================
207
- # EMBEDDING CONFIG
208
- # =================================================================
209
- # CHANGELOG [2026-01-31 - Gemini]
210
- # Fixes /.cache PermissionError. ChromaDB's ONNXMiniLM_L6_V2 tries
211
- # to download model weights to ~/.cache. In Docker as UID 1000,
212
- # that's /.cache (root-owned). We override DOWNLOAD_PATH to a
213
- # writable directory.
214
- # =================================================================
215
  self.embedding_function = ONNXMiniLM_L6_V2()
216
  cache_dir = os.getenv("CHROMA_CACHE_DIR", "/tmp/.cache/chroma")
217
  self.embedding_function.DOWNLOAD_PATH = cache_dir
@@ -232,17 +194,9 @@ class RecursiveContextManager:
232
  embedding_function=self.embedding_function
233
  )
234
 
235
- # Restore conversations from cloud backup if local is empty
236
  if self.conversations.count() == 0:
237
  self._restore_from_cloud()
238
 
239
- # =================================================================
240
- # BACKGROUND INDEXING
241
- # =================================================================
242
- # CHANGELOG [2026-01-31 - Claude/Opus]
243
- # Index the repository in a background thread so startup isn't
244
- # blocked. The _indexing flag lets get_stats() report status.
245
- # =================================================================
246
  self._indexing = False
247
  self._index_error = None
248
  self._indexed_file_count = 0
@@ -250,13 +204,6 @@ class RecursiveContextManager:
250
  self._start_background_indexing()
251
 
252
  def _restore_from_cloud(self):
253
- """Restore conversation history from HF Dataset backup.
254
-
255
- CHANGELOG [2026-01-31 - Gemini]
256
- Called during init if the local ChromaDB conversations collection
257
- is empty. Pulls from the cloud dataset repo to recover history
258
- after a Space restart.
259
- """
260
  data = self.persistence.load_conversations()
261
  for conv in data:
262
  try:
@@ -269,50 +216,19 @@ class RecursiveContextManager:
269
  pass
270
 
271
  def _get_collection_name(self) -> str:
272
- """Generate a deterministic collection name from the repo path.
273
-
274
- CHANGELOG [2025-01-28 - Josh]
275
- Uses MD5 hash of repo path so different repos get different
276
- collections within the same ChromaDB instance.
277
- """
278
  path_hash = hashlib.md5(str(self.repo_path).encode()).hexdigest()[:8]
279
  return f"codebase_{path_hash}"
280
 
281
  # =====================================================================
282
  # REPOSITORY INDEXING
283
  # =====================================================================
284
- # CHANGELOG [2026-01-31 - Claude/Opus]
285
- # Without indexing, search_code() always returns empty results because
286
- # nothing is ever added to the ChromaDB codebase collection. This walks
287
- # the repo, reads indexable files, chunks them, and upserts into ChromaDB.
288
- #
289
- # DESIGN DECISIONS:
290
- # - Background thread: Don't block Gradio startup. Users can chat while
291
- # indexing runs. get_stats() shows indexing progress.
292
- # - Chunk by logical blocks: Split files into ~50-line chunks with overlap
293
- # so semantic search finds relevant sections, not just file-level matches.
294
- # - Upsert (not add): Safe to re-run. If the file was already indexed
295
- # with the same content hash, ChromaDB skips it.
296
- # - Skip .git, __pycache__, node_modules, venv: No value in indexing these.
297
- #
298
- # TESTED ALTERNATIVES (graveyard):
299
- # - Indexing entire files as single documents: Poor search precision.
300
- # A 500-line file matching on line 3 returns all 500 lines.
301
- # - Line-by-line indexing: Too many tiny documents, poor semantic context.
302
- # - Synchronous indexing: Blocks startup for 30+ seconds on large repos.
303
- # =====================================================================
304
 
305
  def _start_background_indexing(self):
306
- """Kick off repo indexing in a daemon thread."""
307
  self._indexing = True
308
  thread = threading.Thread(target=self._index_repository, daemon=True)
309
  thread.start()
310
 
311
  def _index_repository(self):
312
- """Walk the repo and index code files into ChromaDB.
313
-
314
- Runs in background thread. Sets self._indexing = False when done.
315
- """
316
  try:
317
  skip_dirs = {
318
  '.git', '__pycache__', 'node_modules', 'venv', '.venv',
@@ -322,39 +238,23 @@ class RecursiveContextManager:
322
  count = 0
323
 
324
  for file_path in self.repo_path.rglob('*'):
325
- # Skip directories and non-indexable files
326
- if file_path.is_dir():
327
- continue
328
-
329
- # Skip files in excluded directories
330
- if any(skip in file_path.parts for skip in skip_dirs):
331
- continue
332
 
333
- # Check extension
334
  suffix = file_path.suffix.lower()
335
  if suffix not in self.INDEXABLE_EXTENSIONS:
336
- # Also allow extensionless files if they look like configs
337
- if file_path.name not in {
338
- 'Dockerfile', 'Makefile', 'Procfile',
339
- '.gitignore', '.dockerignore', '.env.example'
340
- }:
341
  continue
342
 
343
- # Check size
344
  try:
345
- if file_path.stat().st_size > self.MAX_INDEX_SIZE:
346
- continue
347
- except OSError:
348
- continue
349
 
350
- # Read and chunk the file
351
  try:
352
  content = file_path.read_text(encoding='utf-8', errors='ignore')
353
- except (OSError, UnicodeDecodeError):
354
- continue
355
 
356
- if not content.strip():
357
- continue
358
 
359
  rel_path = str(file_path.relative_to(self.repo_path))
360
  chunks = self._chunk_file(content, rel_path)
@@ -366,8 +266,7 @@ class RecursiveContextManager:
366
  metadatas=[chunk_meta],
367
  ids=[chunk_id]
368
  )
369
- except Exception:
370
- continue
371
 
372
  count += 1
373
  self._indexed_file_count = count
@@ -378,27 +277,12 @@ class RecursiveContextManager:
378
  self._indexing = False
379
 
380
  def _chunk_file(self, content: str, rel_path: str) -> List[Tuple[str, str, dict]]:
381
- """Split a file into overlapping chunks for better search precision.
382
-
383
- CHANGELOG [2026-01-31 - Claude/Opus]
384
- Returns list of (id, text, metadata) tuples ready for ChromaDB upsert.
385
- Chunks are ~50 lines with 10-line overlap so context isn't lost at
386
- chunk boundaries.
387
-
388
- Args:
389
- content: Full file text
390
- rel_path: Path relative to repo root (used in metadata and IDs)
391
-
392
- Returns:
393
- List of (chunk_id, chunk_text, metadata_dict) tuples
394
- """
395
  lines = content.split('\n')
396
  chunks = []
397
  chunk_size = 50
398
  overlap = 10
399
 
400
  if len(lines) <= chunk_size:
401
- # Small file — index as single chunk
402
  content_hash = hashlib.md5(content.encode()).hexdigest()[:12]
403
  chunk_id = f"{rel_path}::full::{content_hash}"
404
  meta = {
@@ -409,7 +293,6 @@ class RecursiveContextManager:
409
  }
410
  chunks.append((chunk_id, content, meta))
411
  else:
412
- # Larger file — split into overlapping chunks
413
  start = 0
414
  chunk_num = 0
415
  while start < len(lines):
@@ -430,54 +313,36 @@ class RecursiveContextManager:
430
  return chunks
431
 
432
  # =====================================================================
433
- # STATS (NEW — was missing, caused crash)
434
- # =====================================================================
435
- # CHANGELOG [2026-01-31 - Claude/Opus]
436
- # app.py calls ctx.get_stats() at module level during Gradio Block
437
- # construction AND in the system prompt for every message. It expected
438
- # a dict with 'conversations', 'total_files', etc. Without this method,
439
- # the app crashes immediately on import.
440
- #
441
- # Returns safe defaults during indexing so the UI can render.
442
  # =====================================================================
443
 
444
  def get_stats(self) -> dict:
445
- """Return system statistics for the UI sidebar and system prompt.
446
-
447
- Returns:
448
- dict with keys: total_files, indexed_chunks, conversations,
449
- chroma_path, persistence_configured, indexing_in_progress,
450
- index_error
451
- """
452
- return {
453
- 'total_files': self._indexed_file_count,
454
- 'indexed_chunks': self.collection.count(),
455
- 'conversations': self.conversations.count(),
456
- 'chroma_path': CHROMA_DB_PATH,
457
- 'persistence_configured': self.persistence.is_configured,
458
- 'indexing_in_progress': self._indexing,
459
- 'index_error': self._index_error,
460
- }
461
 
462
  # =====================================================================
463
- # PHASE 1 ORCHESTRATOR TOOLS (preserved from Gemini)
464
  # =====================================================================
465
 
466
  def create_shadow_branch(self):
467
- """Creates a timestamped backup branch of the E-T Systems Space.
468
-
469
- CHANGELOG [2026-01-31 - Gemini]
470
- Safety net before any destructive operations. Creates a branch
471
- named vibe-backup-YYYYMMDD-HHMMSS on the E-T Systems HF Space
472
- so you can always roll back.
473
- """
474
  timestamp = time.strftime("%Y%m%d-%H%M%S")
475
  branch_name = f"vibe-backup-{timestamp}"
476
  try:
477
- repo_id = os.getenv(
478
- "ET_SYSTEMS_SPACE",
479
- "Executor-Tyrant-Framework/Executor-Framworks_Full_VDB"
480
- )
481
  self.persistence.api.create_branch(
482
  repo_id=repo_id,
483
  branch=branch_name,
@@ -489,46 +354,42 @@ class RecursiveContextManager:
489
  return f"⚠️ Shadow branch failed: {e}"
490
 
491
  def write_file(self, path: str, content: str):
492
- """Writes file strictly if valid CHANGELOG is present.
493
-
494
- CHANGELOG [2026-01-31 - Gemini]
495
- Enforces the living changelog pattern. Any code written by an agent
496
- MUST include a CHANGELOG [YYYY-MM-DD - AgentName] header or the
497
- write is rejected. This is non-negotiable for the E-T Systems
498
- development workflow.
499
-
500
- Args:
501
- path: Relative path within the repo (e.g., "server/routes.ts")
502
- content: Full file content (must contain CHANGELOG header)
503
-
504
- Returns:
505
- Success message or rejection reason
506
- """
507
  if not re.search(r"CHANGELOG \[\d{4}-\d{2}-\d{2} - \w+\]", content):
508
- return "REJECTED: Missing mandatory CHANGELOG [YYYY-MM-DD - AgentName] header."
509
 
510
  try:
 
511
  full_path = self.repo_path / path
512
  full_path.parent.mkdir(parents=True, exist_ok=True)
513
  full_path.write_text(content)
514
- return f"✅ Successfully wrote {path}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  except Exception as e:
516
  return f"Error writing file: {e}"
517
 
518
  def shell_execute(self, command: str):
519
- """Runs shell commands in the /workspace directory.
520
-
521
- CHANGELOG [2026-01-31 - Gemini]
522
- Used for build tasks, git operations, dependency installs, etc.
523
- Timeout of 30 seconds prevents runaway processes. Captures both
524
- stdout and stderr for full diagnostic output.
525
-
526
- Args:
527
- command: Shell command string to execute
528
-
529
- Returns:
530
- Combined stdout/stderr output or error message
531
- """
532
  try:
533
  result = subprocess.run(
534
  command, shell=True, capture_output=True, text=True,
@@ -543,20 +404,6 @@ class RecursiveContextManager:
543
  # =====================================================================
544
 
545
  def search_code(self, query: str, n: int = 5) -> List[Dict]:
546
- """Semantic search across the indexed codebase.
547
-
548
- CHANGELOG [2025-01-28 - Josh]
549
- Core tool for the MIT recursive context technique. The model calls
550
- this to find relevant code without loading the entire repo into
551
- context.
552
-
553
- Args:
554
- query: Natural language search query
555
- n: Max number of results to return (default 5)
556
-
557
- Returns:
558
- List of dicts with 'file' (path) and 'snippet' (first 500 chars)
559
- """
560
  if self.collection.count() == 0:
561
  return []
562
  actual_n = min(n, self.collection.count())
@@ -567,23 +414,6 @@ class RecursiveContextManager:
567
  ]
568
 
569
  def read_file(self, path: str, start_line: int = None, end_line: int = None) -> str:
570
- """Read a specific file, optionally a line range.
571
-
572
- CHANGELOG [2025-01-28 - Josh]
573
- Direct file access for when the model knows exactly what it needs.
574
-
575
- CHANGELOG [2026-01-31 - Claude/Opus]
576
- Added optional start_line/end_line params for reading specific
577
- sections without loading entire large files into context.
578
-
579
- Args:
580
- path: Relative path within repo (e.g., "server/routes.ts")
581
- start_line: Optional 1-based start line
582
- end_line: Optional 1-based end line
583
-
584
- Returns:
585
- File contents (full or sliced) or "File not found." message
586
- """
587
  p = self.repo_path / path
588
  if not p.exists():
589
  return f"File not found: {path}"
@@ -591,7 +421,7 @@ class RecursiveContextManager:
591
  content = p.read_text(encoding='utf-8', errors='ignore')
592
  if start_line is not None or end_line is not None:
593
  lines = content.split('\n')
594
- start = (start_line or 1) - 1 # Convert to 0-based
595
  end = end_line or len(lines)
596
  sliced = lines[start:end]
597
  return '\n'.join(sliced)
@@ -600,25 +430,9 @@ class RecursiveContextManager:
600
  return f"Error reading {path}: {e}"
601
 
602
  def list_files(self, path: str = "", max_depth: int = 3) -> str:
603
- """List files and directories at a given path.
604
-
605
- CHANGELOG [2026-01-31 - Claude/Opus]
606
- Directory exploration tool. The agent needs to know what files exist
607
- before it can read or search them. Returns a tree-formatted listing
608
- up to max_depth levels deep.
609
-
610
- Args:
611
- path: Relative path within repo (default "" = repo root)
612
- max_depth: How many levels deep to list (default 3)
613
-
614
- Returns:
615
- Formatted string showing directory tree
616
- """
617
  target = self.repo_path / path
618
- if not target.exists():
619
- return f"Path not found: {path}"
620
- if not target.is_dir():
621
- return f"Not a directory: {path}"
622
 
623
  skip_dirs = {
624
  '.git', '__pycache__', 'node_modules', 'venv', '.venv',
@@ -628,16 +442,13 @@ class RecursiveContextManager:
628
  lines = [f"📂 {path or '(repo root)'}"]
629
 
630
  def _walk(dir_path: Path, prefix: str, depth: int):
631
- if depth > max_depth:
632
- return
633
  try:
634
  entries = sorted(dir_path.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
635
- except PermissionError:
636
- return
637
 
638
  for i, entry in enumerate(entries):
639
- if entry.name in skip_dirs or entry.name.startswith('.'):
640
- continue
641
  is_last = (i == len(entries) - 1)
642
  connector = "└── " if is_last else "├── "
643
  if entry.is_dir():
@@ -653,110 +464,31 @@ class RecursiveContextManager:
653
  return '\n'.join(lines)
654
 
655
  def search_conversations(self, query: str, n: int = 5) -> List[Dict]:
656
- """Semantic search over past conversation history.
657
-
658
- CHANGELOG [2026-01-31 - Claude/Opus]
659
- This is how Clawdbot "remembers" past discussions. Conversations
660
- are saved to ChromaDB via save_conversation_turn() and backed up
661
- to the HF Dataset repo. This searches them semantically.
662
-
663
- Args:
664
- query: Natural language search query
665
- n: Max results to return
666
-
667
- Returns:
668
- List of dicts with 'content' and 'metadata' from matched turns
669
- """
670
- if self.conversations.count() == 0:
671
- return []
672
  actual_n = min(n, self.conversations.count())
673
  res = self.conversations.query(query_texts=[query], n_results=actual_n)
674
  results = []
675
  for doc, meta in zip(res['documents'][0], res['metadatas'][0]):
676
- results.append({
677
- 'content': doc[:1000], # Cap at 1000 chars per result
678
- 'metadata': meta
679
- })
680
  return results
681
 
682
  def search_testament(self, query: str, n: int = 5) -> List[Dict]:
683
- """Search for Testament/architectural decision records.
684
-
685
- CHANGELOG [2026-01-31 - Claude/Opus]
686
- The Testament contains design decisions, constitutional principles,
687
- and architectural rationale for E-T Systems. This searches for
688
- testament-specific files first (TESTAMENT.md, DECISIONS.md, etc.),
689
- then falls back to general codebase search filtered for decision-
690
- related content.
691
-
692
- Args:
693
- query: What architectural decision to search for
694
- n: Max results
695
-
696
- Returns:
697
- List of dicts with 'file' and 'snippet' from matching documents
698
- """
699
- # First, look for dedicated testament/decision files
700
- testament_names = {
701
- 'testament', 'decisions', 'adr', 'architecture',
702
- 'principles', 'constitution', 'changelog', 'design'
703
- }
704
-
705
  testament_results = []
706
  if self.collection.count() > 0:
707
- # Search the codebase but prefer testament-like files
708
- actual_n = min(n * 2, self.collection.count()) # Get extra, then filter
709
  res = self.collection.query(query_texts=[query], n_results=actual_n)
710
  for doc, meta in zip(res['documents'][0], res['metadatas'][0]):
711
  path_lower = meta.get('path', '').lower()
712
- # Check if this is a testament/decision file
713
  is_testament = any(name in path_lower for name in testament_names)
714
  testament_results.append({
715
  'file': meta['path'],
716
  'snippet': doc[:500],
717
  'is_testament': is_testament
718
  })
719
-
720
- # Sort: testament files first, then other matches
721
  testament_results.sort(key=lambda r: (not r.get('is_testament', False)))
722
  return testament_results[:n]
723
 
724
- def get_stats(self) -> dict:
725
- """WHY: Provides the 'Face Documentation' for the sidebar metrics."""
726
- try:
727
- return {
728
- "total_files": self.collection.count(),
729
- "indexed_chunks": self.collection.count(),
730
- "conversations": self.conversations.count(),
731
- "chroma_path": str(CHROMA_DB_PATH),
732
- "persistence_configured": self.persistence.is_configured,
733
- "indexing_in_progress": False
734
- }
735
- except Exception as e:
736
- return {"index_error": str(e)}
737
-
738
- def save_conversation_turn(self, u, a, t_id):
739
- """WHY: Prevents amnesia by pulling FULL history before cloud push."""
740
- combined = f"USER: {u}\n\nASSISTANT: {a}"
741
- u_id = f"turn_{int(time.time())}"
742
-
743
- # 1. Save locally to ChromaDB
744
- self.conversations.add(documents=[combined], metadatas=[{"turn": t_id}], ids=[u_id])
745
-
746
- # 2. Retrieve the complete record to avoid overwriting history with one turn
747
- all_convs = self.conversations.get()
748
- full_data = []
749
- for i in range(len(all_convs['ids'])):
750
- full_data.append({
751
- "document": all_convs['documents'][i],
752
- "metadata": all_convs['metadatas'][i],
753
- "id": all_convs['ids'][i]
754
- })
755
-
756
- # 3. Push complete manifest back to your PRO storage
757
- self.persistence.save_conversations(full_data)
758
-
759
-
760
  def save_conversation_turn(self, u, a, t_id):
761
  """WHY: Pulls the FULL history before pushing to cloud to prevent memory loss."""
762
  combined = f"USER: {u}\n\nASSISTANT: {a}"
@@ -776,47 +508,4 @@ class RecursiveContextManager:
776
  })
777
 
778
  # 3. Push the entire manifest back to your PRO storage dataset
779
- self.persistence.save_conversations(full_data)
780
-
781
- def save_conversation_turn(self, u, a, t_id):
782
- """WHY: Prevents amnesia by pushing the FULL history to the cloud, not just the last turn."""
783
- combined = f"USER: {u}\n\nASSISTANT: {a}"
784
- u_id = f"turn_{int(time.time())}"
785
-
786
- # 1. Save locally to Chroma
787
- self.conversations.add(documents=[combined], metadatas=[{"turn": t_id}], ids=[u_id])
788
-
789
- # 2. Retrieve ALL history so the cloud backup is a complete record
790
- all_convs = self.conversations.get()
791
- full_data = []
792
- for i in range(len(all_convs['ids'])):
793
- full_data.append({
794
- "document": all_convs['documents'][i],
795
- "metadata": all_convs['metadatas'][i],
796
- "id": all_convs['ids'][i]
797
- })
798
-
799
- # 3. Push complete manifest to PRO storage
800
- self.persistence.save_conversations(full_data)
801
-
802
-
803
- def save_conversation_turn(self, u, a, t_id):
804
- """Save turn locally and push the FULL history to the cloud to prevent memory loss."""
805
- combined = f"USER: {u}\n\nASSISTANT: {a}"
806
- u_id = f"turn_{int(time.time())}"
807
-
808
- # 1. Save locally
809
- self.conversations.add(documents=[combined], metadatas=[{"turn": t_id}], ids=[u_id])
810
-
811
- # 2. To prevent amnesia, we must retrieve ALL historical turns from the local database
812
- all_convs = self.conversations.get()
813
- data_to_save = []
814
- for i in range(len(all_convs['ids'])):
815
- data_to_save.append({
816
- "document": all_convs['documents'][i],
817
- "metadata": all_convs['metadatas'][i],
818
- "id": all_convs['ids'][i]
819
- })
820
-
821
- # 3. Push the COMPLETE history to your PRO storage (replaces the previous file)
822
- self.persistence.save_conversations(data_to_save)
 
31
  search_code, read_file, save_conversation_turn — all unchanged.
32
  NOTE: get_stats() is critical — app.py calls it at module level during UI
33
  construction AND in the system prompt. Missing it = instant crash.
34
+
35
+ CHANGELOG [2026-02-02 - Gemini Pro]
36
+ FIXED: write_file now pushes to Remote Space (Permanent Persistence).
37
+ FIXED: Relaxed CHANGELOG check to non-blocking warning.
38
+ CLEANED: Removed duplicate function definitions at EOF.
39
  """
40
 
41
  from pathlib import Path
 
55
  # =============================================================================
56
  # CHROMA DB PATH SELECTION
57
  # =============================================================================
 
 
 
 
 
 
58
  def _select_chroma_path():
59
  """HF Spaces Docker containers wipe everything EXCEPT /data on restart."""
60
  data_path = Path("/data/chroma_db")
 
76
  # =============================================================================
77
  # HF DATASET PERSISTENCE
78
  # =============================================================================
 
 
 
 
 
79
  class HFDatasetPersistence:
80
  """Handles durable cloud storage via your 1TB PRO Dataset repository."""
81
 
 
153
  # =============================================================================
154
 
155
  class RecursiveContextManager:
156
+ """Manages unlimited context and vibe-coding tools for E-T Systems."""
157
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  INDEXABLE_EXTENSIONS = {
159
  '.py', '.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs',
160
  '.json', '.yaml', '.yml', '.toml',
 
162
  '.html', '.css', '.scss',
163
  '.sh', '.bash',
164
  '.sql',
165
+ '.env.example',
166
  '.gitignore', '.dockerignore',
167
  '.cfg', '.ini', '.conf',
168
  }
169
 
 
170
  MAX_INDEX_SIZE = 256 * 1024
171
 
172
  def __init__(self, repo_path: str):
173
  self.repo_path = Path(repo_path)
174
  self.persistence = HFDatasetPersistence()
175
 
176
+ # Embedding Config
 
 
 
 
 
 
 
 
177
  self.embedding_function = ONNXMiniLM_L6_V2()
178
  cache_dir = os.getenv("CHROMA_CACHE_DIR", "/tmp/.cache/chroma")
179
  self.embedding_function.DOWNLOAD_PATH = cache_dir
 
194
  embedding_function=self.embedding_function
195
  )
196
 
 
197
  if self.conversations.count() == 0:
198
  self._restore_from_cloud()
199
 
 
 
 
 
 
 
 
200
  self._indexing = False
201
  self._index_error = None
202
  self._indexed_file_count = 0
 
204
  self._start_background_indexing()
205
 
206
  def _restore_from_cloud(self):
 
 
 
 
 
 
 
207
  data = self.persistence.load_conversations()
208
  for conv in data:
209
  try:
 
216
  pass
217
 
218
  def _get_collection_name(self) -> str:
 
 
 
 
 
 
219
  path_hash = hashlib.md5(str(self.repo_path).encode()).hexdigest()[:8]
220
  return f"codebase_{path_hash}"
221
 
222
  # =====================================================================
223
  # REPOSITORY INDEXING
224
  # =====================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  def _start_background_indexing(self):
 
227
  self._indexing = True
228
  thread = threading.Thread(target=self._index_repository, daemon=True)
229
  thread.start()
230
 
231
  def _index_repository(self):
 
 
 
 
232
  try:
233
  skip_dirs = {
234
  '.git', '__pycache__', 'node_modules', 'venv', '.venv',
 
238
  count = 0
239
 
240
  for file_path in self.repo_path.rglob('*'):
241
+ if file_path.is_dir(): continue
242
+ if any(skip in file_path.parts for skip in skip_dirs): continue
 
 
 
 
 
243
 
 
244
  suffix = file_path.suffix.lower()
245
  if suffix not in self.INDEXABLE_EXTENSIONS:
246
+ if file_path.name not in {'Dockerfile', 'Makefile', 'Procfile', '.gitignore', '.dockerignore', '.env.example'}:
 
 
 
 
247
  continue
248
 
 
249
  try:
250
+ if file_path.stat().st_size > self.MAX_INDEX_SIZE: continue
251
+ except OSError: continue
 
 
252
 
 
253
  try:
254
  content = file_path.read_text(encoding='utf-8', errors='ignore')
255
+ except (OSError, UnicodeDecodeError): continue
 
256
 
257
+ if not content.strip(): continue
 
258
 
259
  rel_path = str(file_path.relative_to(self.repo_path))
260
  chunks = self._chunk_file(content, rel_path)
 
266
  metadatas=[chunk_meta],
267
  ids=[chunk_id]
268
  )
269
+ except Exception: continue
 
270
 
271
  count += 1
272
  self._indexed_file_count = count
 
277
  self._indexing = False
278
 
279
  def _chunk_file(self, content: str, rel_path: str) -> List[Tuple[str, str, dict]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  lines = content.split('\n')
281
  chunks = []
282
  chunk_size = 50
283
  overlap = 10
284
 
285
  if len(lines) <= chunk_size:
 
286
  content_hash = hashlib.md5(content.encode()).hexdigest()[:12]
287
  chunk_id = f"{rel_path}::full::{content_hash}"
288
  meta = {
 
293
  }
294
  chunks.append((chunk_id, content, meta))
295
  else:
 
296
  start = 0
297
  chunk_num = 0
298
  while start < len(lines):
 
313
  return chunks
314
 
315
  # =====================================================================
316
+ # STATS
 
 
 
 
 
 
 
 
317
  # =====================================================================
318
 
319
  def get_stats(self) -> dict:
320
+ """Return system statistics for the UI sidebar and system prompt."""
321
+ try:
322
+ return {
323
+ 'total_files': self._indexed_file_count,
324
+ 'indexed_chunks': self.collection.count(),
325
+ 'conversations': self.conversations.count(),
326
+ 'chroma_path': CHROMA_DB_PATH,
327
+ 'persistence_configured': self.persistence.is_configured,
328
+ 'indexing_in_progress': self._indexing,
329
+ 'index_error': self._index_error,
330
+ }
331
+ except Exception as e:
332
+ return {"index_error": str(e)}
 
 
 
333
 
334
  # =====================================================================
335
+ # PHASE 1 ORCHESTRATOR TOOLS
336
  # =====================================================================
337
 
338
  def create_shadow_branch(self):
339
+ """Creates a timestamped backup branch of the E-T Systems Space."""
 
 
 
 
 
 
340
  timestamp = time.strftime("%Y%m%d-%H%M%S")
341
  branch_name = f"vibe-backup-{timestamp}"
342
  try:
343
+ repo_id = os.getenv("ET_SYSTEMS_SPACE")
344
+ if not repo_id: return "Error: ET_SYSTEMS_SPACE env var not set."
345
+
 
346
  self.persistence.api.create_branch(
347
  repo_id=repo_id,
348
  branch=branch_name,
 
354
  return f"⚠️ Shadow branch failed: {e}"
355
 
356
  def write_file(self, path: str, content: str):
357
+ """Writes file locally AND pushes to the remote HF Space."""
358
+ warning = ""
359
+ # 1. Non-blocking warning instead of rejection
 
 
 
 
 
 
 
 
 
 
 
 
360
  if not re.search(r"CHANGELOG \[\d{4}-\d{2}-\d{2} - \w+\]", content):
361
+ warning = "\n⚠️ NOTE: Missing CHANGELOG header."
362
 
363
  try:
364
+ # 2. Write to Local Disk (Container)
365
  full_path = self.repo_path / path
366
  full_path.parent.mkdir(parents=True, exist_ok=True)
367
  full_path.write_text(content)
368
+
369
+ # 3. Push to Remote Space (Persistence)
370
+ remote_msg = ""
371
+ target_space = os.getenv("ET_SYSTEMS_SPACE")
372
+
373
+ if self.persistence.is_configured and target_space:
374
+ try:
375
+ self.persistence.api.upload_file(
376
+ path_or_fileobj=str(full_path),
377
+ path_in_repo=path,
378
+ repo_id=target_space,
379
+ repo_type="space",
380
+ token=self.persistence.token,
381
+ commit_message=f"Clawdbot update: {path}"
382
+ )
383
+ remote_msg = f"\n🚀 Pushed to remote Space: {target_space}"
384
+ except Exception as e:
385
+ remote_msg = f"\n⚠️ Local write success, but remote push failed: {e}"
386
+
387
+ return f"✅ Wrote {path}{warning}{remote_msg}"
388
  except Exception as e:
389
  return f"Error writing file: {e}"
390
 
391
  def shell_execute(self, command: str):
392
+ """Runs shell commands in the /workspace directory."""
 
 
 
 
 
 
 
 
 
 
 
 
393
  try:
394
  result = subprocess.run(
395
  command, shell=True, capture_output=True, text=True,
 
404
  # =====================================================================
405
 
406
  def search_code(self, query: str, n: int = 5) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  if self.collection.count() == 0:
408
  return []
409
  actual_n = min(n, self.collection.count())
 
414
  ]
415
 
416
  def read_file(self, path: str, start_line: int = None, end_line: int = None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  p = self.repo_path / path
418
  if not p.exists():
419
  return f"File not found: {path}"
 
421
  content = p.read_text(encoding='utf-8', errors='ignore')
422
  if start_line is not None or end_line is not None:
423
  lines = content.split('\n')
424
+ start = (start_line or 1) - 1
425
  end = end_line or len(lines)
426
  sliced = lines[start:end]
427
  return '\n'.join(sliced)
 
430
  return f"Error reading {path}: {e}"
431
 
432
  def list_files(self, path: str = "", max_depth: int = 3) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  target = self.repo_path / path
434
+ if not target.exists(): return f"Path not found: {path}"
435
+ if not target.is_dir(): return f"Not a directory: {path}"
 
 
436
 
437
  skip_dirs = {
438
  '.git', '__pycache__', 'node_modules', 'venv', '.venv',
 
442
  lines = [f"📂 {path or '(repo root)'}"]
443
 
444
  def _walk(dir_path: Path, prefix: str, depth: int):
445
+ if depth > max_depth: return
 
446
  try:
447
  entries = sorted(dir_path.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
448
+ except PermissionError: return
 
449
 
450
  for i, entry in enumerate(entries):
451
+ if entry.name in skip_dirs or entry.name.startswith('.'): continue
 
452
  is_last = (i == len(entries) - 1)
453
  connector = "└── " if is_last else "├── "
454
  if entry.is_dir():
 
464
  return '\n'.join(lines)
465
 
466
  def search_conversations(self, query: str, n: int = 5) -> List[Dict]:
467
+ if self.conversations.count() == 0: return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  actual_n = min(n, self.conversations.count())
469
  res = self.conversations.query(query_texts=[query], n_results=actual_n)
470
  results = []
471
  for doc, meta in zip(res['documents'][0], res['metadatas'][0]):
472
+ results.append({'content': doc[:1000], 'metadata': meta})
 
 
 
473
  return results
474
 
475
  def search_testament(self, query: str, n: int = 5) -> List[Dict]:
476
+ testament_names = {'testament', 'decisions', 'adr', 'architecture', 'principles', 'constitution', 'changelog', 'design'}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  testament_results = []
478
  if self.collection.count() > 0:
479
+ actual_n = min(n * 2, self.collection.count())
 
480
  res = self.collection.query(query_texts=[query], n_results=actual_n)
481
  for doc, meta in zip(res['documents'][0], res['metadatas'][0]):
482
  path_lower = meta.get('path', '').lower()
 
483
  is_testament = any(name in path_lower for name in testament_names)
484
  testament_results.append({
485
  'file': meta['path'],
486
  'snippet': doc[:500],
487
  'is_testament': is_testament
488
  })
 
 
489
  testament_results.sort(key=lambda r: (not r.get('is_testament', False)))
490
  return testament_results[:n]
491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  def save_conversation_turn(self, u, a, t_id):
493
  """WHY: Pulls the FULL history before pushing to cloud to prevent memory loss."""
494
  combined = f"USER: {u}\n\nASSISTANT: {a}"
 
508
  })
509
 
510
  # 3. Push the entire manifest back to your PRO storage dataset
511
+ self.persistence.save_conversations(full_data)