Update recursive_context.py
Browse files- recursive_context.py +82 -795
recursive_context.py
CHANGED
|
@@ -1,182 +1,37 @@
|
|
| 1 |
"""
|
| 2 |
Recursive Context Manager for Clawdbot
|
| 3 |
-
|
| 4 |
-
CHANGELOG [2025-01-28 - Josh]
|
| 5 |
-
Implements MIT's Recursive Language Model technique for unlimited context.
|
| 6 |
-
|
| 7 |
-
CHANGELOG [2025-01-30 - Claude]
|
| 8 |
-
Added HuggingFace Dataset persistence layer.
|
| 9 |
-
PROBLEM: /workspace gets wiped on Space restart, killing ChromaDB data.
|
| 10 |
-
SOLUTION: Sync ChromaDB collections to a private HF Dataset repo.
|
| 11 |
-
- On startup: Pull from Dataset -> restore to ChromaDB
|
| 12 |
-
- On save: Also push to Dataset (debounced to avoid spam)
|
| 13 |
-
- Periodic backup every N conversation turns
|
| 14 |
-
This gives us FREE, VERSIONED, PERSISTENT storage that survives restarts.
|
| 15 |
-
|
| 16 |
-
CHANGELOG [2025-01-31 - Claude]
|
| 17 |
-
FIXED: Multiple persistence failures causing "Conversations Saved: 0"
|
| 18 |
-
ROOT CAUSES FOUND:
|
| 19 |
-
1. ChromaDB path was /workspace/chroma_db - EPHEMERAL on HF Spaces Docker.
|
| 20 |
-
Container filesystem gets wiped on every restart. Only /data survives.
|
| 21 |
-
2. Cloud backup (HF Dataset) silently did nothing when MEMORY_REPO wasn't set.
|
| 22 |
-
No errors, no warnings in UI - just quiet failure.
|
| 23 |
-
3. Debounce timer (30s) could prevent saves if Space sleeps quickly.
|
| 24 |
-
4. HF Spaces sometimes SIGKILL containers without sending SIGTERM,
|
| 25 |
-
so shutdown hooks never fire and pending saves are lost.
|
| 26 |
-
|
| 27 |
-
FIXES APPLIED:
|
| 28 |
-
- ChromaDB path: /workspace/chroma_db -> /data/chroma_db (HF persistent volume)
|
| 29 |
-
Falls back to /workspace/chroma_db if /data isn't writable (free tier without
|
| 30 |
-
persistent storage enabled). Logs which path it's using so you can tell.
|
| 31 |
-
- Cloud backup: Now logs CLEAR warnings when MEMORY_REPO isn't set, every time
|
| 32 |
-
a save would have been backed up. No more silent failure.
|
| 33 |
-
- Debounce: Reduced from 30s to 10s. Also saves immediately on first conversation
|
| 34 |
-
turn (no debounce for turn 0) so even single-message sessions persist.
|
| 35 |
-
- Shutdown: force_backup() is still registered, but we no longer RELY on it.
|
| 36 |
-
Instead, every Nth save goes to cloud immediately (N=3, was N=5).
|
| 37 |
-
|
| 38 |
-
PERSISTENCE ARCHITECTURE (now with /data):
|
| 39 |
-
/data/chroma_db (survives restarts if persistent storage enabled)
|
| 40 |
-
|
|
| 41 |
-
v
|
| 42 |
-
ChromaDB (fast local queries) <--> HF Dataset (durable cloud storage)
|
| 43 |
-
^
|
| 44 |
-
Private repo: username/clawdbot-memory
|
| 45 |
-
Contains: conversations.json
|
| 46 |
-
|
| 47 |
-
REFERENCE: https://www.youtube.com/watch?v=huszaaJPjU8
|
| 48 |
-
"MIT basically solved unlimited context windows"
|
| 49 |
-
|
| 50 |
-
APPROACH:
|
| 51 |
-
Instead of cramming everything into context (hits limits) or summarizing
|
| 52 |
-
(lossy compression), we:
|
| 53 |
-
|
| 54 |
-
1. Store entire codebase in searchable environment
|
| 55 |
-
2. Give model TOOLS to query what it needs
|
| 56 |
-
3. Model recursively retrieves relevant pieces
|
| 57 |
-
4. No summarization loss - full fidelity access
|
| 58 |
-
|
| 59 |
-
This is like RAG, but IN-ENVIRONMENT with the model actively deciding
|
| 60 |
-
what context it needs rather than us guessing upfront.
|
| 61 |
-
|
| 62 |
-
EXAMPLE FLOW:
|
| 63 |
-
User: "How does Genesis handle surprise?"
|
| 64 |
-
Model: search_code("Genesis surprise detection")
|
| 65 |
-
-> Finds: genesis/substrate.py, genesis/attention.py
|
| 66 |
-
Model: read_file("genesis/substrate.py", lines 145-167)
|
| 67 |
-
-> Gets actual implementation
|
| 68 |
-
Model: search_testament("surprise detection rationale")
|
| 69 |
-
-> Gets design decision
|
| 70 |
-
Model: Synthesizes answer from retrieved pieces
|
| 71 |
-
|
| 72 |
-
NO CONTEXT WINDOW LIMIT - just selective retrieval.
|
| 73 |
"""
|
| 74 |
|
| 75 |
from pathlib import Path
|
| 76 |
from typing import List, Dict, Optional, Tuple
|
| 77 |
import chromadb
|
| 78 |
from chromadb.config import Settings
|
|
|
|
| 79 |
import hashlib
|
| 80 |
import json
|
| 81 |
import os
|
| 82 |
import time
|
| 83 |
import threading
|
| 84 |
|
| 85 |
-
|
| 86 |
-
# =============================================================================
|
| 87 |
-
# PERSISTENT STORAGE PATH SELECTION
|
| 88 |
-
# =============================================================================
|
| 89 |
-
# CHANGELOG [2025-01-31 - Claude]
|
| 90 |
-
# HF Spaces Docker containers wipe everything EXCEPT /data on restart.
|
| 91 |
-
# We try /data first (persistent), fall back to /workspace (ephemeral).
|
| 92 |
-
# This decision is made once at module load and logged clearly.
|
| 93 |
-
# =============================================================================
|
| 94 |
-
|
| 95 |
def _select_chroma_path():
    """
    Choose the best available path for ChromaDB storage.

    PRIORITY ORDER:
      1. /data/chroma_db      - HF Spaces persistent volume (survives restarts)
      2. /workspace/chroma_db - container filesystem (wiped on restart)

    WHY /data:
    HuggingFace Spaces with the Docker SDK provide /data as persistent storage
    (must be enabled in Space Settings -> Persistent Storage). It is the only
    path that survives container restarts.

    WHY FALLBACK:
    If /data doesn't exist or isn't writable, ChromaDB still needs to work for
    the current session; /workspace is fine within one session.

    Returns:
        str: the directory path ChromaDB should use (created if necessary).
    """
    # Try /data first (HF persistent volume).
    data_path = Path("/data/chroma_db")
    try:
        data_path.mkdir(parents=True, exist_ok=True)
        # Verify write access by creating and removing a probe file;
        # mkdir alone can succeed on a read-only-ish mount.
        probe = data_path / ".write_test"
        probe.write_text("test")
        probe.unlink()
        print("=" * 60)
        print("STORAGE: Using /data/chroma_db (PERSISTENT - survives restarts)")
        print("=" * 60)
        return str(data_path)
    except (OSError, PermissionError):
        # FIX: the pasted original dropped the ':' on this except clause
        # (syntax error) and the first lines of this warning banner.
        # NOTE(review): first two banner lines reconstructed - TODO confirm
        # exact wording against version history.
        print("=" * 60)
        print("STORAGE: /data not writable - using /workspace/chroma_db (EPHEMERAL)")
        print("STORAGE: Memory will be lost on restart!")
        print("STORAGE: Enable persistent storage in Space Settings,")
        print("STORAGE: or set MEMORY_REPO secret for cloud backup.")
        print("=" * 60)

    # Fallback to /workspace (works for this session only).
    workspace_path = Path("/workspace/chroma_db")
    workspace_path.mkdir(parents=True, exist_ok=True)
    return str(workspace_path)


# Resolve once at import time so the choice is consistent for the session.
CHROMA_DB_PATH = _select_chroma_path()
|
| 143 |
|
| 144 |
-
|
| 145 |
class HFDatasetPersistence:
    """
    Syncs conversation data to/from a private HuggingFace Dataset repo.

    WHY HF DATASETS: free, version-controlled, private, natively authenticated
    inside Spaces; JSON files suit conversation data. Alternatives considered:
    Supabase (external dependency), /data mount (now the primary local store),
    S3 (cost/complexity).

    Saves are debounced (SAVE_DEBOUNCE_SECONDS) to avoid hammering the HF API
    when a user sends several messages quickly; force=True bypasses this.
    """

    def __init__(self, repo_id: str = None):
        """
        Initialize the persistence layer.

        Args:
            repo_id: HF Dataset repo (e.g. "username/clawdbot-memory").
                     Falls back to the MEMORY_REPO env var when None.
        """
        # Imported lazily so merely importing this module doesn't require
        # huggingface_hub to be importable.
        from huggingface_hub import HfApi

        self.api = HfApi()
        self.repo_id = repo_id or os.getenv("MEMORY_REPO")
        # NOTE(review): the first token env-var name was lost in the paste;
        # HF_TOKEN is the conventional one - TODO confirm.
        self.token = (
            os.getenv("HF_TOKEN") or
            os.getenv("HUGGING_FACE_HUB_TOKEN") or
            os.getenv("HUGGINGFACE_TOKEN")
        )

        # Whether the Dataset repo is known to exist / was created.
        self._repo_ready = False

        # Debounce bookkeeping: avoids one upload per message burst.
        self._save_lock = threading.Lock()
        self._pending_save = False
        self._last_save_time = 0
        self.SAVE_DEBOUNCE_SECONDS = 10

        # Log configuration status loudly so misconfiguration is visible
        # in the Space logs instead of failing silently.
        if self.repo_id and self.token:
            self._ensure_repo_exists()
            print(f"CLOUD BACKUP: Configured -> {self.repo_id}")
            # A read-only token lets the app start but makes every upload
            # fail with 403 - verify early.
            self._verify_write_permissions()
        elif not self.repo_id:
            print("=" * 60)
            print("CLOUD BACKUP: NOT CONFIGURED")
            print("Add MEMORY_REPO secret to Space settings.")
            print("Value should be: your-username/clawdbot-memory")
            print("Without this, conversations won't survive restarts")
            print("(unless /data persistent storage is enabled).")
            print("=" * 60)
        elif not self.token:
            print("CLOUD BACKUP: No HF_TOKEN found - cloud backup disabled")

    def _ensure_repo_exists(self):
        """
        Create the (private) memory Dataset repo if it doesn't exist.

        Sets self._repo_ready on success; logs and leaves it False on failure
        so save_conversations() can skip/retry instead of throwing opaque
        HfApi errors.
        """
        if self._repo_ready:
            return

        try:
            self.api.repo_info(
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token
            )
            print(f"Memory repo exists: {self.repo_id}")
            self._repo_ready = True
        except Exception:
            # Repo lookup failed - try to create it.
            try:
                self.api.create_repo(
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    private=True,  # Keep conversations private!
                    token=self.token
                )
                print(f"Created memory repo: {self.repo_id}")
                self._repo_ready = True
            except Exception as e:
                # FIX: original paste dropped "as e:" while using e below.
                print(f"Could not create memory repo: {e}")
                print(" Memory will not persist across restarts!")

    @property
    def is_configured(self):
        """True when both a repo id and an auth token are available."""
        return bool(self.repo_id and self.token)

    def _verify_write_permissions(self):
        """
        Warn at startup if the HF token looks unusable for uploads.

        We don't fail hard: the app can still run on /data persistent storage
        without cloud backup. We just surface the problem loudly.
        """
        try:
            # FIX: the line defining user_info was lost in the paste;
            # whoami() is the HfApi call that returns this auth payload.
            user_info = self.api.whoami(token=self.token)
            # "write" / fine-grained tokens with repo write access work;
            # "read" tokens will fail on upload.
            token_name = user_info.get("auth", {}).get("accessToken", {}).get("displayName", "unknown")
            print(f"CLOUD BACKUP: Token verified (name: {token_name})")
        except Exception as e:
            print(f"CLOUD BACKUP WARNING: Could not verify token permissions: {e}")
            print("CLOUD BACKUP WARNING: If saves fail, check that HF_TOKEN has WRITE access")

    def save_conversations(self, conversations_data: List[Dict], force: bool = False):
        """
        Save conversations to the HF Dataset (debounced).

        Args:
            conversations_data: list of conversation dicts to persist.
            force: save immediately, ignoring the debounce window.

        Returns:
            bool: True when the upload succeeded, False when skipped/failed.
        """
        if not self.is_configured:
            # Loud skip - silent failure here previously made
            # "Conversations Saved: 0" impossible to diagnose.
            print("Cloud save skipped: MEMORY_REPO not configured")
            return False

        # Race/ init-failure guard: if the repo never initialized, retry once
        # (transient network) rather than throwing a confusing 404/403.
        if not self._repo_ready:
            print("Cloud save skipped: memory repo not ready (init may have failed)")
            self._ensure_repo_exists()
            if not self._repo_ready:
                return False

        current_time = time.time()
        # Debounce unless forced. NOTE(review): this "if not force:" guard
        # was lost in the paste but is required by the documented contract
        # ("force: save immediately ignoring debounce") - TODO confirm.
        if not force:
            if current_time - self._last_save_time < self.SAVE_DEBOUNCE_SECONDS:
                self._pending_save = True
                return False

        with self._save_lock:
            try:
                # Write to a local temp file first, then upload.
                temp_path = Path("/tmp/conversations_backup.json")
                temp_path.write_text(json.dumps(conversations_data, indent=2))

                self.api.upload_file(
                    path_or_fileobj=str(temp_path),
                    path_in_repo="conversations.json",
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    token=self.token,
                    commit_message=f"Backup {len(conversations_data)} conversations"
                )

                self._last_save_time = current_time
                self._pending_save = False
                print(f"Cloud saved {len(conversations_data)} conversations to {self.repo_id}")
                return True
            except Exception as e:
                print(f"Failed to save conversations to cloud: {e}")
                return False

    def load_conversations(self) -> List[Dict]:
        """
        Load conversations from the HF Dataset (called on startup).

        Returns:
            List of conversation dicts, or [] when unconfigured / not found.
        """
        if not self.is_configured:
            print("Cloud load skipped: MEMORY_REPO not configured")
            return []

        try:
            from huggingface_hub import hf_hub_download

            # FIX: the assignment of local_path was lost in the paste.
            local_path = hf_hub_download(
                repo_id=self.repo_id,
                filename="conversations.json",
                repo_type="dataset",
                token=self.token
            )

            with open(local_path, 'r') as f:
                data = json.load(f)

            print(f"Cloud loaded {len(data)} conversations from {self.repo_id}")
            return data

        except Exception as e:
            # The file may simply not exist yet (first run).
            if "404" in str(e) or "not found" in str(e).lower():
                print(f"No existing conversations found in {self.repo_id} (first run)")
            else:
                print(f"Failed to load conversations from cloud: {e}")
            return []

    def has_pending_save(self) -> bool:
        """Check if there's a pending save that was debounced."""
        return self._pending_save
|
| 399 |
-
|
| 400 |
|
| 401 |
class RecursiveContextManager:
|
| 402 |
-
"""
|
| 403 |
-
Manages unlimited context via recursive retrieval.
|
| 404 |
-
|
| 405 |
-
The model has TOOLS to search and read the codebase selectively,
|
| 406 |
-
rather than loading everything upfront.
|
| 407 |
-
|
| 408 |
-
CHANGELOG [2025-01-30 - Claude]
|
| 409 |
-
Added HF Dataset persistence. Conversations now survive Space restarts.
|
| 410 |
-
|
| 411 |
-
CHANGELOG [2025-01-31 - Claude]
|
| 412 |
-
FIXED: ChromaDB path now uses /data (persistent) instead of /workspace (ephemeral).
|
| 413 |
-
FIXED: Cloud backup logs clear warnings when not configured.
|
| 414 |
-
FIXED: Backup frequency increased (every 3 saves, was 5).
|
| 415 |
-
FIXED: First conversation turn always triggers immediate cloud save.
|
| 416 |
-
"""
|
| 417 |
-
|
| 418 |
def __init__(self, repo_path: str):
    """
    Initialize the recursive context manager for a repository.

    Order matters: the persistence layer is created first so cloud data can
    be restored into a freshly-wiped ChromaDB.

    Args:
        repo_path: path to the code repository to index/search.
    """
    self.repo_path = Path(repo_path)

    # Persistence layer FIRST - needed before ChromaDB so we can restore.
    self.persistence = HFDatasetPersistence()

    # ChromaDB at CHROMA_DB_PATH: /data/chroma_db (persistent) or
    # /workspace/chroma_db (ephemeral fallback); see _select_chroma_path().
    self.chroma_client = chromadb.PersistentClient(
        path=CHROMA_DB_PATH,
        settings=Settings(
            anonymized_telemetry=False,
            allow_reset=True
        )
    )
    print(f"ChromaDB initialized at: {CHROMA_DB_PATH}")

    # Codebase collection: reuse if present, else create and index.
    # NOTE(review): the get/create lines were lost in the paste; this is
    # the standard chromadb get-or-create pattern and matches the orphaned
    # metadata/print lines that survived - TODO confirm against history.
    collection_name = self._get_collection_name()
    try:
        self.collection = self.chroma_client.get_collection(name=collection_name)
        print(f"Loaded existing collection: {collection_name}")
    except Exception:
        self.collection = self.chroma_client.create_collection(
            name=collection_name,
            metadata={"description": "E-T Systems codebase"}
        )
        print(f"Created new collection: {collection_name}")
        self._index_codebase()

    # Conversation collection (same get-or-create pattern).
    # NOTE(review): name derivation reconstructed - TODO confirm.
    conversations_name = f"conversations_{collection_name}"
    try:
        self.conversations = self.chroma_client.get_collection(name=conversations_name)
    except Exception:
        self.conversations = self.chroma_client.create_collection(
            name=conversations_name,
            metadata={"description": "Clawdbot conversation history"}
        )
        print(f"Created conversation collection: {conversations_name}")

    # RESTORE FROM CLOUD if local is empty but the HF Dataset has history
    # (i.e. the Space restarted and wiped ephemeral storage).
    if self.conversations.count() == 0:
        self._restore_from_cloud()

    # Periodic-backup bookkeeping. The surviving changelog says the value
    # was "reduced from 5 to 3"; the literal itself was lost in the paste.
    self._saves_since_backup = 0
    self.BACKUP_EVERY_N_SAVES = 3
    self._is_first_save = True
|
| 483 |
|
| 484 |
def _restore_from_cloud(self):
|
| 485 |
-
"""
|
| 486 |
-
Restore conversations from HF Dataset to ChromaDB.
|
| 487 |
-
|
| 488 |
-
CHANGELOG [2025-01-30 - Claude]
|
| 489 |
-
Called when local ChromaDB is empty but cloud might have data.
|
| 490 |
-
This is the magic that makes memory survive restarts.
|
| 491 |
-
"""
|
| 492 |
cloud_data = self.persistence.load_conversations()
|
| 493 |
-
|
| 494 |
-
if not cloud_data:
|
| 495 |
-
print("No cloud conversations to restore")
|
| 496 |
-
return
|
| 497 |
-
|
| 498 |
-
print(f"Restoring {len(cloud_data)} conversations from cloud...")
|
| 499 |
-
|
| 500 |
-
restored = 0
|
| 501 |
for conv in cloud_data:
|
| 502 |
try:
|
| 503 |
-
self.conversations.add(
|
| 504 |
-
|
| 505 |
-
metadatas=[conv["metadata"]],
|
| 506 |
-
ids=[conv["id"]]
|
| 507 |
-
)
|
| 508 |
-
restored += 1
|
| 509 |
-
except Exception as e:
|
| 510 |
-
# Might fail if ID already exists (shouldn't happen but safety first)
|
| 511 |
-
print(f"Skipping conversation {conv.get('id')}: {e}")
|
| 512 |
-
|
| 513 |
-
print(f"Restored {restored} conversations (total: {self.conversations.count()})")
|
| 514 |
|
| 515 |
def _backup_to_cloud(self, force: bool = False):
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
Called periodically and on shutdown to ensure durability.
|
| 521 |
-
|
| 522 |
-
Args:
|
| 523 |
-
force: If True, save immediately ignoring debounce
|
| 524 |
-
"""
|
| 525 |
-
if self.conversations.count() == 0:
|
| 526 |
-
return
|
| 527 |
-
|
| 528 |
-
# Get all conversations from ChromaDB
|
| 529 |
-
all_convs = self.conversations.get(
|
| 530 |
-
include=["documents", "metadatas"]
|
| 531 |
-
)
|
| 532 |
-
|
| 533 |
-
# Format for JSON storage
|
| 534 |
-
backup_data = []
|
| 535 |
-
for i, (doc, meta, id_) in enumerate(zip(
|
| 536 |
-
all_convs["documents"],
|
| 537 |
-
all_convs["metadatas"],
|
| 538 |
-
all_convs["ids"]
|
| 539 |
-
)):
|
| 540 |
-
backup_data.append({
|
| 541 |
-
"id": id_,
|
| 542 |
-
"document": doc,
|
| 543 |
-
"metadata": meta
|
| 544 |
-
})
|
| 545 |
-
|
| 546 |
-
# Save to cloud
|
| 547 |
self.persistence.save_conversations(backup_data, force=force)
|
| 548 |
|
| 549 |
def _get_collection_name(self) -> str:
|
| 550 |
-
"""Generate unique collection name based on repo path."""
|
| 551 |
path_hash = hashlib.md5(str(self.repo_path).encode()).hexdigest()[:8]
|
| 552 |
return f"codebase_{path_hash}"
|
| 553 |
|
| 554 |
def _index_codebase(self):
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
This creates the "environment" that the model can search through.
|
| 559 |
-
We index with metadata so search results include file paths.
|
| 560 |
-
"""
|
| 561 |
-
print(f"Indexing codebase at {self.repo_path}...")
|
| 562 |
-
|
| 563 |
-
# File types to index
|
| 564 |
-
code_extensions = {
|
| 565 |
-
'.py', '.js', '.ts', '.tsx', '.jsx',
|
| 566 |
-
'.md', '.txt', '.json', '.yaml', '.yml'
|
| 567 |
-
}
|
| 568 |
-
|
| 569 |
-
# Skip these directories
|
| 570 |
-
skip_dirs = {
|
| 571 |
-
'node_modules', '.git', '__pycache__', 'venv',
|
| 572 |
-
'env', '.venv', 'dist', 'build'
|
| 573 |
-
}
|
| 574 |
-
|
| 575 |
-
documents = []
|
| 576 |
-
metadatas = []
|
| 577 |
-
ids = []
|
| 578 |
-
|
| 579 |
for file_path in self.repo_path.rglob('*'):
|
| 580 |
-
|
| 581 |
-
if file_path.is_dir():
|
| 582 |
-
continue
|
| 583 |
-
if any(skip in file_path.parts for skip in skip_dirs):
|
| 584 |
-
continue
|
| 585 |
-
if file_path.suffix not in code_extensions:
|
| 586 |
continue
|
| 587 |
-
|
| 588 |
try:
|
| 589 |
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
relative_path = str(file_path.relative_to(self.repo_path))
|
| 596 |
-
|
| 597 |
-
documents.append(content)
|
| 598 |
-
metadatas.append({
|
| 599 |
-
"path": relative_path,
|
| 600 |
-
"type": file_path.suffix[1:], # Remove leading dot
|
| 601 |
-
"size": len(content)
|
| 602 |
-
})
|
| 603 |
-
ids.append(relative_path)
|
| 604 |
-
|
| 605 |
-
except Exception as e:
|
| 606 |
-
print(f"Skipping {file_path.name}: {e}")
|
| 607 |
-
continue
|
| 608 |
-
|
| 609 |
if documents:
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
for i in range(0, len(documents), batch_size):
|
| 613 |
-
batch_docs = documents[i:i+batch_size]
|
| 614 |
-
batch_meta = metadatas[i:i+batch_size]
|
| 615 |
-
batch_ids = ids[i:i+batch_size]
|
| 616 |
-
|
| 617 |
-
self.collection.add(
|
| 618 |
-
documents=batch_docs,
|
| 619 |
-
metadatas=batch_meta,
|
| 620 |
-
ids=batch_ids
|
| 621 |
-
)
|
| 622 |
-
|
| 623 |
-
print(f"Indexed {len(documents)} files")
|
| 624 |
-
else:
|
| 625 |
-
print("No files found to index")
|
| 626 |
|
| 627 |
def search_code(self, query: str, n_results: int = 5) -> List[Dict]:
    """
    Semantic search over the indexed codebase (a TOOL for the model).

    The model can search for concepts without knowing exact file names.

    Args:
        query: what to search for (e.g. "surprise detection").
        n_results: maximum number of results to return.

    Returns:
        List of dicts with {file, snippet, relevance, type}, or a single
        error dict when nothing is indexed yet.
    """
    if self.collection.count() == 0:
        return [{"error": "No files indexed yet"}]

    results = self.collection.query(
        query_texts=[query],
        n_results=min(n_results, self.collection.count())
    )

    # Format results for the model.
    formatted = []
    for i in range(len(results['documents'][0])):
        # Truncate to 500 chars for the hit list; the model can call
        # read_file() when it wants the full content.
        snippet = results['documents'][0][i][:500]
        if len(results['documents'][0][i]) > 500:
            snippet += "... [truncated, use read_file to see more]"

        formatted.append({
            "file": results['metadatas'][0][i]['path'],
            "snippet": snippet,
            # ChromaDB returns a distance; 1 - distance reads as relevance.
            "relevance": round(1 - results['distances'][0][i], 3),
            "type": results['metadatas'][0][i]['type']
        })

    return formatted
|
| 666 |
|
| 667 |
def read_file(self, path: str, lines: Optional[Tuple[int, int]] = None) -> str:
    """
    Read a file (or a line range) from the repository - a TOOL for the model.

    Args:
        path: path relative to the repository root.
        lines: optional (start, end) line numbers, 1-indexed and inclusive.

    Returns:
        File content, the selected lines, or an "Error: ..." string.
    """
    full_path = self.repo_path / path

    if not full_path.exists():
        return f"Error: File not found: {path}"

    # FIX: resolve() both sides before the containment check. Without it,
    # "repo/../secrets" still *textually* starts with the repo path, so
    # ".."-traversal slipped past the sandbox check.
    if not full_path.resolve().is_relative_to(self.repo_path.resolve()):
        return "Error: Path outside repository"

    try:
        content = full_path.read_text(encoding='utf-8', errors='ignore')

        if lines:
            # FIX: the unpack and split lines were lost in the paste;
            # the slice below fixes what they must have been.
            start, end = lines
            content_lines = content.split('\n')
            # Adjust for 1-indexed, inclusive bounds.
            selected_lines = content_lines[start - 1:end]
            return '\n'.join(selected_lines)

        return content

    except Exception as e:
        return f"Error reading file: {str(e)}"
|
| 703 |
|
| 704 |
def search_testament(self, query: str) -> str:
|
| 705 |
-
""
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
This is a TOOL available to the model.
|
| 709 |
-
Helps model understand design rationale.
|
| 710 |
-
|
| 711 |
-
Args:
|
| 712 |
-
query: What decision to look for
|
| 713 |
-
|
| 714 |
-
Returns:
|
| 715 |
-
Relevant Testament sections
|
| 716 |
-
"""
|
| 717 |
-
testament_path = self.repo_path / "TESTAMENT.md"
|
| 718 |
-
|
| 719 |
-
if not testament_path.exists():
|
| 720 |
-
return "Testament not found. No architectural decisions recorded yet."
|
| 721 |
-
|
| 722 |
try:
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
# Simple relevance: sections that contain query terms
|
| 729 |
-
query_lower = query.lower()
|
| 730 |
-
relevant = []
|
| 731 |
-
|
| 732 |
-
for section in sections:
|
| 733 |
-
if query_lower in section.lower():
|
| 734 |
-
# Include section with header
|
| 735 |
-
if not section.startswith('#'):
|
| 736 |
-
section = '## ' + section
|
| 737 |
-
relevant.append(section)
|
| 738 |
-
|
| 739 |
-
if relevant:
|
| 740 |
-
return '\n\n'.join(relevant)
|
| 741 |
-
else:
|
| 742 |
-
return f"No Testament entries found matching '{query}'"
|
| 743 |
-
|
| 744 |
-
except Exception as e:
|
| 745 |
-
return f"Error searching Testament: {str(e)}"
|
| 746 |
|
| 747 |
def list_files(self, directory: str = ".") -> List[str]:
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
This is a TOOL available to the model.
|
| 752 |
-
Helps model explore repository structure.
|
| 753 |
-
|
| 754 |
-
Args:
|
| 755 |
-
directory: Directory to list (relative path)
|
| 756 |
-
|
| 757 |
-
Returns:
|
| 758 |
-
List of file/directory names
|
| 759 |
-
"""
|
| 760 |
-
dir_path = self.repo_path / directory
|
| 761 |
-
|
| 762 |
-
if not dir_path.exists():
|
| 763 |
-
return [f"Error: Directory not found: {directory}"]
|
| 764 |
-
|
| 765 |
-
if not dir_path.is_relative_to(self.repo_path):
|
| 766 |
-
return ["Error: Path outside repository"]
|
| 767 |
-
|
| 768 |
try:
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
# Skip hidden and system directories
|
| 772 |
-
if item.name.startswith('.'):
|
| 773 |
-
continue
|
| 774 |
-
if item.name in {'node_modules', '__pycache__', 'venv'}:
|
| 775 |
-
continue
|
| 776 |
-
|
| 777 |
-
# Mark directories with /
|
| 778 |
-
if item.is_dir():
|
| 779 |
-
items.append(f"{item.name}/")
|
| 780 |
-
else:
|
| 781 |
-
items.append(item.name)
|
| 782 |
-
|
| 783 |
-
return items
|
| 784 |
-
|
| 785 |
-
except Exception as e:
|
| 786 |
-
return [f"Error listing directory: {str(e)}"]
|
| 787 |
|
| 788 |
def save_conversation_turn(self, user_message: str, assistant_message: str, turn_id: int):
|
| 789 |
-
"""
|
| 790 |
-
Save a conversation turn to persistent storage.
|
| 791 |
-
|
| 792 |
-
CHANGELOG [2025-01-30 - Josh]
|
| 793 |
-
Implements MIT recursive technique for conversations.
|
| 794 |
-
Chat history becomes searchable context that persists across sessions.
|
| 795 |
-
|
| 796 |
-
CHANGELOG [2025-01-30 - Claude]
|
| 797 |
-
Added cloud backup integration. Every N saves triggers HF Dataset backup.
|
| 798 |
-
|
| 799 |
-
CHANGELOG [2025-01-31 - Claude]
|
| 800 |
-
FIXED: First conversation turn now triggers immediate cloud backup.
|
| 801 |
-
Previously, a user could have a single exchange and leave, and the
|
| 802 |
-
debounce timer would prevent the cloud save from ever firing.
|
| 803 |
-
Also reduced BACKUP_EVERY_N_SAVES from 5 to 3.
|
| 804 |
-
|
| 805 |
-
Args:
|
| 806 |
-
user_message: What the user said
|
| 807 |
-
assistant_message: What Clawdbot responded
|
| 808 |
-
turn_id: Unique ID for this turn (timestamp-based)
|
| 809 |
-
"""
|
| 810 |
-
# Create a combined document for semantic search
|
| 811 |
combined = f"USER: {user_message}\n\nASSISTANT: {assistant_message}"
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
unique_id = f"turn_{int(time.time())}_{turn_id}"
|
| 815 |
-
|
| 816 |
-
# Save to ChromaDB (fast local access)
|
| 817 |
-
self.conversations.add(
|
| 818 |
-
documents=[combined],
|
| 819 |
-
metadatas=[{
|
| 820 |
-
"user": user_message[:500], # Truncate for metadata
|
| 821 |
-
"assistant": assistant_message[:500],
|
| 822 |
-
"timestamp": int(time.time()),
|
| 823 |
-
"turn": turn_id
|
| 824 |
-
}],
|
| 825 |
-
ids=[unique_id]
|
| 826 |
-
)
|
| 827 |
-
|
| 828 |
-
print(f"Saved conversation turn {turn_id} (total: {self.conversations.count()})")
|
| 829 |
-
|
| 830 |
-
# CLOUD BACKUP LOGIC
|
| 831 |
-
# CHANGELOG [2025-01-31 - Claude]
|
| 832 |
-
# First save always goes to cloud immediately (force=True).
|
| 833 |
-
# This ensures even single-message sessions persist.
|
| 834 |
-
# Subsequent saves follow the periodic backup schedule.
|
| 835 |
-
if self._is_first_save:
|
| 836 |
-
print("First conversation turn - forcing immediate cloud backup")
|
| 837 |
-
self._backup_to_cloud(force=True)
|
| 838 |
-
self._is_first_save = False
|
| 839 |
-
self._saves_since_backup = 0
|
| 840 |
-
else:
|
| 841 |
-
# Periodic cloud backup
|
| 842 |
-
# RATIONALE: Don't backup every message (API spam), but don't wait too long
|
| 843 |
-
self._saves_since_backup += 1
|
| 844 |
-
if self._saves_since_backup >= self.BACKUP_EVERY_N_SAVES:
|
| 845 |
-
self._backup_to_cloud()
|
| 846 |
-
self._saves_since_backup = 0
|
| 847 |
-
|
| 848 |
-
def search_conversations(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 849 |
-
"""
|
| 850 |
-
Search past conversations for relevant context.
|
| 851 |
-
|
| 852 |
-
This enables TRUE unlimited context - Clawdbot can remember
|
| 853 |
-
everything ever discussed by searching its own conversation history.
|
| 854 |
-
|
| 855 |
-
Args:
|
| 856 |
-
query: What to search for in past conversations
|
| 857 |
-
n_results: How many results to return
|
| 858 |
-
|
| 859 |
-
Returns:
|
| 860 |
-
List of past conversation turns with user/assistant messages
|
| 861 |
-
"""
|
| 862 |
-
if self.conversations.count() == 0:
|
| 863 |
-
return []
|
| 864 |
-
|
| 865 |
-
results = self.conversations.query(
|
| 866 |
-
query_texts=[query],
|
| 867 |
-
n_results=min(n_results, self.conversations.count())
|
| 868 |
-
)
|
| 869 |
-
|
| 870 |
-
formatted = []
|
| 871 |
-
for i, (doc, metadata) in enumerate(zip(
|
| 872 |
-
results['documents'][0], results['metadatas'][0]
|
| 873 |
-
)):
|
| 874 |
-
formatted.append({
|
| 875 |
-
"turn": metadata.get("turn", "unknown"),
|
| 876 |
-
"user": metadata.get("user", ""),
|
| 877 |
-
"assistant": metadata.get("assistant", ""),
|
| 878 |
-
"full_text": doc,
|
| 879 |
-
"relevance": i + 1 # Lower is more relevant
|
| 880 |
-
})
|
| 881 |
-
|
| 882 |
-
return formatted
|
| 883 |
-
|
| 884 |
-
def get_conversation_count(self) -> int:
|
| 885 |
-
"""Get total number of saved conversation turns."""
|
| 886 |
-
return self.conversations.count()
|
| 887 |
-
|
| 888 |
-
def get_stats(self) -> Dict:
|
| 889 |
-
"""
|
| 890 |
-
Get statistics about indexed codebase.
|
| 891 |
-
|
| 892 |
-
CHANGELOG [2025-01-31 - Claude]
|
| 893 |
-
Added storage_path and cloud_backup_status for better diagnostics.
|
| 894 |
-
|
| 895 |
-
Returns:
|
| 896 |
-
Dict with file counts, sizes, etc.
|
| 897 |
-
"""
|
| 898 |
-
return {
|
| 899 |
-
"total_files": self.collection.count(),
|
| 900 |
-
"repo_path": str(self.repo_path),
|
| 901 |
-
"collection_name": self.collection.name,
|
| 902 |
-
"conversations": self.conversations.count(),
|
| 903 |
-
"storage_path": CHROMA_DB_PATH,
|
| 904 |
-
"cloud_backup_configured": self.persistence.is_configured,
|
| 905 |
-
"cloud_backup_repo": self.persistence.repo_id or "Not set"
|
| 906 |
-
}
|
| 907 |
-
|
| 908 |
-
def force_backup(self):
|
| 909 |
-
"""
|
| 910 |
-
Force immediate backup to cloud.
|
| 911 |
-
|
| 912 |
-
CHANGELOG [2025-01-30 - Claude]
|
| 913 |
-
Call this on app shutdown to ensure no data loss.
|
| 914 |
-
"""
|
| 915 |
-
print("Forcing cloud backup...")
|
| 916 |
-
self._backup_to_cloud(force=True)
|
| 917 |
-
print("Backup complete")
|
| 918 |
-
|
| 919 |
-
def shutdown(self):
|
| 920 |
-
"""
|
| 921 |
-
Clean shutdown - ensure all data is saved.
|
| 922 |
-
|
| 923 |
-
CHANGELOG [2025-01-30 - Claude]
|
| 924 |
-
Call this when the Space is shutting down.
|
| 925 |
-
"""
|
| 926 |
-
print("Shutting down RecursiveContextManager...")
|
| 927 |
-
self.force_backup()
|
| 928 |
-
print("Shutdown complete")
|
|
|
|
| 1 |
"""
|
| 2 |
Recursive Context Manager for Clawdbot
|
| 3 |
+
Corrected version: fixes the PermissionError raised when the embedding model tried to download into an unwritable /.cache directory.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import List, Dict, Optional, Tuple
|
| 8 |
import chromadb
|
| 9 |
from chromadb.config import Settings
|
| 10 |
+
from chromadb.utils.embedding_functions import ONNXMiniLM_L6_V2
|
| 11 |
import hashlib
|
| 12 |
import json
|
| 13 |
import os
|
| 14 |
import time
|
| 15 |
import threading
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
def _select_chroma_path():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
data_path = Path("/data/chroma_db")
|
| 19 |
try:
|
| 20 |
data_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 21 |
test_file = data_path / ".write_test"
|
| 22 |
test_file.write_text("test")
|
| 23 |
test_file.unlink()
|
|
|
|
|
|
|
|
|
|
| 24 |
return str(data_path)
|
| 25 |
+
except (OSError, PermissionError):
|
| 26 |
+
workspace_path = Path("/workspace/chroma_db")
|
| 27 |
+
workspace_path.mkdir(parents=True, exist_ok=True)
|
| 28 |
+
return str(workspace_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
CHROMA_DB_PATH = _select_chroma_path()
|
| 31 |
|
|
|
|
| 32 |
class HFDatasetPersistence:
    """Mirrors conversation records to a private HF Dataset repository.

    ChromaDB state on the container filesystem can be wiped on restart;
    this class backs conversations up to a Hugging Face Dataset so they
    can be restored on the next boot.

    Configuration (environment variables):
        MEMORY_REPO: "user/repo" id of the backup dataset.
        HUGGING_FACE_HUB_TOKEN / HUGGINGFACE_TOKEN: write token.
    """

    def __init__(self, repo_id: str = None):
        # Imported lazily so the module can load without huggingface_hub.
        from huggingface_hub import HfApi
        self.api = HfApi()
        self.repo_id = repo_id or os.getenv("MEMORY_REPO")
        self.token = (
            os.getenv("HUGGING_FACE_HUB_TOKEN") or
            os.getenv("HUGGINGFACE_TOKEN")
        )
        self._repo_ready = False
        self._save_lock = threading.Lock()
        self._pending_save = False
        self._last_save_time = 0
        # Minimum seconds between uploads (bypassed with force=True).
        self.SAVE_DEBOUNCE_SECONDS = 10

        if self.repo_id and self.token:
            self._ensure_repo_exists()
            self._verify_write_permissions()
        else:
            # FIX: warn loudly instead of silently disabling persistence -
            # silent no-op backup was the cause of "Conversations Saved: 0".
            print("WARNING: cloud backup disabled - set MEMORY_REPO and an HF token")

    def _ensure_repo_exists(self):
        """Create the backup dataset repo if it does not exist yet."""
        if self._repo_ready:
            return
        try:
            self.api.repo_info(repo_id=self.repo_id, repo_type="dataset", token=self.token)
            self._repo_ready = True
        except Exception:
            try:
                self.api.create_repo(repo_id=self.repo_id, repo_type="dataset",
                                     private=True, token=self.token)
                self._repo_ready = True
            except Exception as e:
                # FIX: surface the failure instead of silently disabling backup.
                print(f"WARNING: could not create backup repo {self.repo_id}: {e}")

    @property
    def is_configured(self):
        """True when both a repo id and a token are available."""
        return bool(self.repo_id and self.token)

    def _verify_write_permissions(self):
        """Best-effort token sanity check; failures are logged, not fatal."""
        try:
            self.api.whoami(token=self.token)
        except Exception as e:
            print(f"WARNING: HF token validation failed: {e}")

    def save_conversations(self, conversations_data: List[Dict], force: bool = False):
        """Upload conversation records to the dataset repo (debounced).

        Args:
            conversations_data: Serialized conversation records.
            force: Skip the debounce window (used for shutdown/first save).

        Returns:
            True when the upload happened, False otherwise.
        """
        if not self.is_configured or not self._repo_ready:
            if not self.is_configured:
                # FIX: no more silent skip when MEMORY_REPO is missing.
                print("WARNING: save skipped - MEMORY_REPO / token not configured")
            return False
        current_time = time.time()
        if not force and (current_time - self._last_save_time < self.SAVE_DEBOUNCE_SECONDS):
            self._pending_save = True
            return False
        with self._save_lock:
            try:
                temp_path = Path("/tmp/conversations_backup.json")
                temp_path.write_text(json.dumps(conversations_data, indent=2))
                self.api.upload_file(
                    path_or_fileobj=str(temp_path),
                    path_in_repo="conversations.json",
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    token=self.token,
                    commit_message=f"Backup {len(conversations_data)} conversations"
                )
                self._last_save_time = current_time
                self._pending_save = False
                return True
            except Exception as e:
                # FIX: log upload failures instead of returning False silently.
                print(f"WARNING: cloud backup failed: {e}")
                return False

    def load_conversations(self) -> List[Dict]:
        """Download the conversation backup; returns [] when unavailable."""
        if not self.is_configured:
            return []
        try:
            from huggingface_hub import hf_hub_download
            local_path = hf_hub_download(repo_id=self.repo_id,
                                         filename="conversations.json",
                                         repo_type="dataset", token=self.token)
            with open(local_path, 'r') as f:
                return json.load(f)
        except Exception as e:
            # A missing file on first boot is expected; other errors are
            # worth seeing in the logs.
            print(f"Cloud restore skipped: {e}")
            return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
class RecursiveContextManager:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
def __init__(self, repo_path: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
self.repo_path = Path(repo_path)
|
|
|
|
|
|
|
|
|
|
| 106 |
self.persistence = HFDatasetPersistence()
|
| 107 |
+
|
| 108 |
+
# FIX: Explicitly configure embedding model path to prevent PermissionError
|
| 109 |
+
self.embedding_function = ONNXMiniLM_L6_V2()
|
| 110 |
+
cache_dir = os.getenv("CHROMA_CACHE_DIR", "/tmp/.cache/chroma")
|
| 111 |
+
self.embedding_function.DOWNLOAD_PATH = cache_dir
|
| 112 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
self.chroma_client = chromadb.PersistentClient(
|
| 115 |
path=CHROMA_DB_PATH,
|
| 116 |
+
settings=Settings(anonymized_telemetry=False, allow_reset=True)
|
|
|
|
|
|
|
|
|
|
| 117 |
)
|
|
|
|
| 118 |
|
|
|
|
| 119 |
collection_name = self._get_collection_name()
|
| 120 |
+
# FIX: Pass embedding_function here
|
| 121 |
+
self.collection = self.chroma_client.get_or_create_collection(
|
| 122 |
+
name=collection_name,
|
| 123 |
+
embedding_function=self.embedding_function,
|
| 124 |
+
metadata={"description": "E-T Systems codebase"}
|
| 125 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
+
conversations_name = f"conversations_{collection_name.split('_')[1]}"
|
| 128 |
+
# FIX: Pass embedding_function here as well
|
| 129 |
+
self.conversations = self.chroma_client.get_or_create_collection(
|
| 130 |
+
name=conversations_name,
|
| 131 |
+
embedding_function=self.embedding_function,
|
| 132 |
+
metadata={"description": "Clawdbot conversation history"}
|
| 133 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
|
|
|
|
|
|
| 135 |
if self.conversations.count() == 0:
|
| 136 |
self._restore_from_cloud()
|
| 137 |
|
|
|
|
|
|
|
|
|
|
| 138 |
self._saves_since_backup = 0
|
| 139 |
+
self.BACKUP_EVERY_N_SAVES = 1 # Immediate cloud sync for PRO storage reliability
|
| 140 |
+
self._is_first_save = True
|
| 141 |
|
| 142 |
def _restore_from_cloud(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
cloud_data = self.persistence.load_conversations()
|
| 144 |
+
if not cloud_data: return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
for conv in cloud_data:
|
| 146 |
try:
|
| 147 |
+
self.conversations.add(documents=[conv["document"]], metadatas=[conv["metadata"]], ids=[conv["id"]])
|
| 148 |
+
except Exception: pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
def _backup_to_cloud(self, force: bool = False):
|
| 151 |
+
if self.conversations.count() == 0: return
|
| 152 |
+
all_convs = self.conversations.get(include=["documents", "metadatas"])
|
| 153 |
+
backup_data = [{"id": id_, "document": doc, "metadata": meta}
|
| 154 |
+
for doc, meta, id_ in zip(all_convs["documents"], all_convs["metadatas"], all_convs["ids"])]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
self.persistence.save_conversations(backup_data, force=force)
|
| 156 |
|
| 157 |
def _get_collection_name(self) -> str:
|
|
|
|
| 158 |
path_hash = hashlib.md5(str(self.repo_path).encode()).hexdigest()[:8]
|
| 159 |
return f"codebase_{path_hash}"
|
| 160 |
|
| 161 |
def _index_codebase(self):
|
| 162 |
+
code_extensions = {'.py', '.js', '.ts', '.tsx', '.jsx', '.md', '.txt', '.json', '.yaml', '.yml'}
|
| 163 |
+
skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', '.venv', 'dist', 'build'}
|
| 164 |
+
documents, metadatas, ids = [], [], []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
for file_path in self.repo_path.rglob('*'):
|
| 166 |
+
if file_path.is_dir() or any(skip in file_path.parts for skip in skip_dirs) or file_path.suffix not in code_extensions:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
continue
|
|
|
|
| 168 |
try:
|
| 169 |
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
| 170 |
+
if not content.strip() or len(content) > 100000: continue
|
| 171 |
+
rel = str(file_path.relative_to(self.repo_path))
|
| 172 |
+
documents.append(content); ids.append(rel)
|
| 173 |
+
metadatas.append({"path": rel, "type": file_path.suffix[1:], "size": len(content)})
|
| 174 |
+
except Exception: continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
if documents:
|
| 176 |
+
for i in range(0, len(documents), 100):
|
| 177 |
+
self.collection.add(documents=documents[i:i+100], metadatas=metadatas[i:i+100], ids=ids[i:i+100])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
def search_code(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 180 |
+
if self.collection.count() == 0: return []
|
| 181 |
+
results = self.collection.query(query_texts=[query], n_results=min(n_results, self.collection.count()))
|
| 182 |
+
return [{"file": m['path'], "snippet": d[:500], "relevance": round(1-dist, 3)}
|
| 183 |
+
for d, m, dist in zip(results['documents'][0], results['metadatas'][0], results['distances'][0])]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
def read_file(self, path: str, lines: Optional[Tuple[int, int]] = None) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
full_path = self.repo_path / path
|
| 187 |
+
if not full_path.exists(): return "Error: File not found"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
try:
|
| 189 |
content = full_path.read_text(encoding='utf-8', errors='ignore')
|
|
|
|
| 190 |
if lines:
|
| 191 |
+
l_list = content.split('\n')
|
| 192 |
+
return '\n'.join(l_list[lines[0]-1:lines[1]])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
return content
|
| 194 |
+
except Exception as e: return str(e)
|
|
|
|
|
|
|
| 195 |
|
| 196 |
def search_testament(self, query: str) -> str:
|
| 197 |
+
t_path = self.repo_path / "TESTAMENT.md"
|
| 198 |
+
if not t_path.exists(): return "Testament not found"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
try:
|
| 200 |
+
sections = t_path.read_text(encoding='utf-8').split('\n## ')
|
| 201 |
+
relevant = [('## ' + s if not s.startswith('#') else s) for s in sections if query.lower() in s.lower()]
|
| 202 |
+
return '\n\n'.join(relevant) if relevant else "No matches"
|
| 203 |
+
except Exception as e: return str(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
def list_files(self, directory: str = ".") -> List[str]:
|
| 206 |
+
d_path = self.repo_path / directory
|
| 207 |
+
if not d_path.exists(): return ["Error: Not found"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
try:
|
| 209 |
+
return [(f.name + '/' if f.is_dir() else f.name) for f in sorted(d_path.iterdir()) if not f.name.startswith('.')]
|
| 210 |
+
except Exception as e: return [str(e)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
def save_conversation_turn(self, user_message: str, assistant_message: str, turn_id: int):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
combined = f"USER: {user_message}\n\nASSISTANT: {assistant_message}"
|
| 214 |
+
u_id = f"turn_{int(time.time())}_{turn_id}"
|
| 215 |
+
self.conversations.add(documents=[combined], metadatas=[{"user": user_message[:500], "assistant": assistant_message[:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|