# CapStoneRAG10/recover_chroma_advanced.py
"""Advanced ChromaDB recovery - rebuild index from collection folder metadata."""
import os
import sys
import sqlite3
import pickle
import json
import shutil
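
# NOTE: assumed on-disk layout (a ChromaDB-internal detail that varies by
# version - verify against your installation): chroma.sqlite3 holds the
# collection registry, and each UUID-named folder holds one collection's
# HNSW segment files (e.g. header.bin, index_metadata.pickle).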


def rebuild_chroma_index_advanced():
    """Inspect ChromaDB collection folders and report a recovery strategy.

    This script:
    1. Scans the persist directory for UUID-named collection folders
    2. Reads collection metadata from JSON and pickle files
    3. Inspects the SQLite index and recommends how to re-register collections
    """
print("\n" + "=" * 80)
print("🔧 Advanced ChromaDB Recovery - Direct SQLite Rebuild")
print("=" * 80)
chroma_path = "./chroma_db"
sqlite_path = os.path.join(chroma_path, "chroma.sqlite3")
# Step 1: Find all collection folders
print("\n📁 Step 1: Scanning collection folders...")
print("-" * 80)
collections_found = []
    try:
        for item in os.listdir(chroma_path):
            item_path = os.path.join(chroma_path, item)
            # ChromaDB names each collection's segment folder with the
            # collection UUID: 36 characters, 4 hyphens
            if os.path.isdir(item_path) and len(item) == 36 and item.count('-') == 4:
                # Check for metadata files alongside the index
                metadata_file = os.path.join(item_path, "metadata.json")
                index_metadata_file = os.path.join(item_path, "index_metadata.pickle")
                collection_info = {
                    "uuid": item,
                    "path": item_path,
                    "has_metadata": os.path.exists(metadata_file),
                    "has_index_metadata": os.path.exists(index_metadata_file),
                    "files": os.listdir(item_path),
                }
                collections_found.append(collection_info)
                print(f"✅ {item}")
    except Exception as e:
        print(f"❌ Error scanning: {e}")
        return False
print(f"\n✅ Found {len(collections_found)} collection folder(s)")
if len(collections_found) == 0:
print("⚠️ No collections to recover")
return True

    # Step 2: Back up the existing SQLite index
    print("\n💾 Step 2: Backing up SQLite...")
    print("-" * 80)

    if os.path.exists(sqlite_path):
        try:
            shutil.copy2(sqlite_path, sqlite_path + ".backup")
            print(f"✅ Backup created: {sqlite_path}.backup")
        except Exception as e:
            print(f"⚠️ Could not backup: {e}")

    # Step 3: Attempt to read collection metadata
    print("\n📖 Step 3: Reading collection metadata...")
    print("-" * 80)

    for collection_info in collections_found:
        uuid = collection_info["uuid"]
        coll_path = collection_info["path"]
        print(f"\n📂 Collection: {uuid}")

        # Try to read metadata.json
        metadata_file = os.path.join(coll_path, "metadata.json")
        metadata = None
        if os.path.exists(metadata_file):
            try:
                with open(metadata_file, 'r') as f:
                    metadata = json.load(f)
                print("   ✅ Found metadata.json")
                if 'name' in metadata:
                    print(f"      Name: {metadata['name']}")
                if 'metadata' in metadata:
                    print(f"      Metadata: {metadata['metadata']}")
            except Exception as e:
                print(f"   ⚠️ Could not read metadata.json: {e}")

        # Try to read index_metadata.pickle (HNSW index bookkeeping)
        index_file = os.path.join(coll_path, "index_metadata.pickle")
        if os.path.exists(index_file):
            try:
                with open(index_file, 'rb') as f:
                    index_data = pickle.load(f)
                print("   ✅ Found index_metadata.pickle")
                if isinstance(index_data, dict):
                    for key in list(index_data.keys())[:3]:
                        print(f"      Key: {key}")
            except Exception as e:
                print(f"   ⚠️ Could not read index_metadata.pickle: {e}")

        # Store metadata for the recovery report
        collection_info["metadata"] = metadata

    # Step 4: Check whether SQLite entries need to be created manually
    print("\n🔄 Step 4: SQLite recovery strategy...")
    print("-" * 80)

    if os.path.exists(sqlite_path):
        print("ℹ️ SQLite exists - attempting to inspect and repair...")
        try:
            conn = sqlite3.connect(sqlite_path)
            cursor = conn.cursor()

            # Check what tables exist
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = cursor.fetchall()
            print(f"✅ Tables in SQLite: {[t[0] for t in tables]}")

            # Check the collections table
            try:
                cursor.execute("SELECT COUNT(*) FROM collections")
                count = cursor.fetchone()[0]
                print(f"ℹ️ Collections in SQLite: {count}")

                if count == 0 and collections_found:
                    print("\n💡 Collections table is empty - attempting to populate...")
                    # Inserting rows here would require replicating ChromaDB's
                    # internal schema, which is version-specific and not part of
                    # its public API - safer to re-register through the client.
                    print("⚠️ Cannot directly modify SQLite without knowing the exact schema")
                    print("   Use the ChromaDB API for proper registration")
            except Exception as e:
                print(f"⚠️ Could not query collections: {e}")

            conn.close()
        except Exception as e:
            print(f"⚠️ Error accessing SQLite: {e}")

    # Step 5: Recommend using the ChromaDB API
    print("\n💡 Step 5: Recommended recovery approach...")
    print("-" * 80)
    print("\nThe collection data is intact in:")
    for coll in collections_found:
        print(f"  • {coll['uuid']}")
    print("""
✅ NEXT STEPS - Use the ChromaDB API to register collections:

  1. Delete chroma.sqlite3 to force a fresh rebuild
  2. In Streamlit, use client.get_or_create_collection()
     to re-register each collection by name
  3. Or use the Python console:

         from chromadb import PersistentClient
         client = PersistentClient(path='./chroma_db')
         collection = client.create_collection(
             name='<original_name>',
             metadata={...},
         )

  4. The collection data will be automatically re-indexed

See docs/CHROMADB_RECOVERY.md for complete instructions
""")

    return True
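

# A minimal sketch of the API-based recovery recommended above, assuming
# chromadb >= 0.4 is installed (it provides PersistentClient) and that the
# caller still knows the original collection names; names stored only in the
# lost SQLite registry cannot be recovered from the segment folders alone.
# This helper is illustrative and is not called by this script.
def reregister_collections(names, persist_path="./chroma_db"):
    """Re-register collections by name via the ChromaDB API (sketch)."""
    from chromadb import PersistentClient

    client = PersistentClient(path=persist_path)
    for name in names:
        # get_or_create_collection is idempotent: it returns the existing
        # collection if one with this name is already registered.
        collection = client.get_or_create_collection(name=name)
        print(f"Registered: {collection.name} (count={collection.count()})")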


def main():
    """Main entry point."""
    try:
        success = rebuild_chroma_index_advanced()

        print("\n" + "=" * 80)
        if success:
            print("✅ ANALYSIS COMPLETE")
            print("\n📝 Action required:")
            print("   1. Follow the recommended recovery approach above")
            print("   2. Or manually delete chroma.sqlite3 and restart Streamlit")
            exit_code = 0
        else:
            print("❌ RECOVERY FAILED")
            exit_code = 1
        print("=" * 80 + "\n")
        sys.exit(exit_code)
    except Exception as e:
        print(f"\n❌ FATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
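

# Hypothetical helper (not called above): print the live schema so it is
# clear why hand-written INSERTs into chroma.sqlite3 are fragile - the table
# layout is a ChromaDB implementation detail that changes across versions.
def dump_sqlite_schema(sqlite_path="./chroma_db/chroma.sqlite3"):
    """Print the CREATE statement of every table in the index (sketch)."""
    conn = sqlite3.connect(sqlite_path)
    try:
        for name, sql in conn.execute(
            "SELECT name, sql FROM sqlite_master WHERE type='table'"
        ):
            print(f"-- {name}\n{sql}\n")
    finally:
        conn.close()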
if __name__ == "__main__":
main()