# CapStoneRAG10/recover_chroma_advanced.py
"""Advanced ChromaDB recovery - rebuild index from collection folder metadata."""
import os
import sys
import sqlite3
import pickle
import json
import shutil
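
# NOTE: assumed on-disk layout (a ChromaDB-internal detail that varies by
# version - verify against your installation): chroma.sqlite3 holds the
# collection registry, and each UUID-named folder holds one collection's
# HNSW segment files (e.g. header.bin, index_metadata.pickle).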


def rebuild_chroma_index_advanced():
    """Inspect ChromaDB collection folders and report a recovery strategy.

    This script:
    1. Scans the persist directory for UUID-named collection folders
    2. Reads collection metadata from JSON and pickle files
    3. Inspects the SQLite index and recommends how to re-register collections
    """
print("\n" + "=" * 80)
print("🔧 Advanced ChromaDB Recovery - Direct SQLite Rebuild")
print("=" * 80)
chroma_path = "./chroma_db"
sqlite_path = os.path.join(chroma_path, "chroma.sqlite3")
# Step 1: Find all collection folders
print("\n📁 Step 1: Scanning collection folders...")
print("-" * 80)
collections_found = []
    try:
        for item in os.listdir(chroma_path):
            item_path = os.path.join(chroma_path, item)
            # ChromaDB names each collection's segment folder with the
            # collection UUID: 36 characters, 4 hyphens
            if os.path.isdir(item_path) and len(item) == 36 and item.count('-') == 4:
                # Check for metadata files alongside the index
                metadata_file = os.path.join(item_path, "metadata.json")
                index_metadata_file = os.path.join(item_path, "index_metadata.pickle")
                collection_info = {
                    "uuid": item,
                    "path": item_path,
                    "has_metadata": os.path.exists(metadata_file),
                    "has_index_metadata": os.path.exists(index_metadata_file),
                    "files": os.listdir(item_path),
                }
                collections_found.append(collection_info)
                print(f"✅ {item}")
    except Exception as e:
        print(f"❌ Error scanning: {e}")
        return False
print(f"\n✅ Found {len(collections_found)} collection folder(s)")
if len(collections_found) == 0:
print("⚠️ No collections to recover")
return True

    # Step 2: Back up the existing SQLite index
    print("\n💾 Step 2: Backing up SQLite...")
    print("-" * 80)

    if os.path.exists(sqlite_path):
        try:
            shutil.copy2(sqlite_path, sqlite_path + ".backup")
            print(f"✅ Backup created: {sqlite_path}.backup")
        except Exception as e:
            print(f"⚠️ Could not backup: {e}")

    # Step 3: Attempt to read collection metadata
    print("\n📖 Step 3: Reading collection metadata...")
    print("-" * 80)

    for collection_info in collections_found:
        uuid = collection_info["uuid"]
        coll_path = collection_info["path"]
        print(f"\n📂 Collection: {uuid}")

        # Try to read metadata.json
        metadata_file = os.path.join(coll_path, "metadata.json")
        metadata = None
        if os.path.exists(metadata_file):
            try:
                with open(metadata_file, 'r') as f:
                    metadata = json.load(f)
                print("   ✅ Found metadata.json")
                if 'name' in metadata:
                    print(f"      Name: {metadata['name']}")
                if 'metadata' in metadata:
                    print(f"      Metadata: {metadata['metadata']}")
            except Exception as e:
                print(f"   ⚠️ Could not read metadata.json: {e}")

        # Try to read index_metadata.pickle (HNSW index bookkeeping)
        index_file = os.path.join(coll_path, "index_metadata.pickle")
        if os.path.exists(index_file):
            try:
                with open(index_file, 'rb') as f:
                    index_data = pickle.load(f)
                print("   ✅ Found index_metadata.pickle")
                if isinstance(index_data, dict):
                    for key in list(index_data.keys())[:3]:
                        print(f"      Key: {key}")
            except Exception as e:
                print(f"   ⚠️ Could not read index_metadata.pickle: {e}")

        # Store metadata for the recovery report
        collection_info["metadata"] = metadata

    # Step 4: Check whether SQLite entries need to be created manually
    print("\n🔄 Step 4: SQLite recovery strategy...")
    print("-" * 80)

    if os.path.exists(sqlite_path):
        print("ℹ️ SQLite exists - attempting to inspect and repair...")
        try:
            conn = sqlite3.connect(sqlite_path)
            cursor = conn.cursor()

            # Check what tables exist
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = cursor.fetchall()
            print(f"✅ Tables in SQLite: {[t[0] for t in tables]}")

            # Check the collections table
            try:
                cursor.execute("SELECT COUNT(*) FROM collections")
                count = cursor.fetchone()[0]
                print(f"ℹ️ Collections in SQLite: {count}")

                if count == 0 and collections_found:
                    print("\n💡 Collections table is empty - attempting to populate...")
                    # Inserting rows here would require replicating ChromaDB's
                    # internal schema, which is version-specific and not part of
                    # its public API - safer to re-register through the client.
                    print("⚠️ Cannot directly modify SQLite without knowing the exact schema")
                    print("   Use the ChromaDB API for proper registration")
            except Exception as e:
                print(f"⚠️ Could not query collections: {e}")

            conn.close()
        except Exception as e:
            print(f"⚠️ Error accessing SQLite: {e}")

    # Step 5: Recommend using the ChromaDB API
    print("\n💡 Step 5: Recommended recovery approach...")
    print("-" * 80)
    print("\nThe collection data is intact in:")
    for coll in collections_found:
        print(f"  • {coll['uuid']}")
    print("""
✅ NEXT STEPS - Use the ChromaDB API to register collections:

  1. Delete chroma.sqlite3 to force a fresh rebuild
  2. In Streamlit, use client.get_or_create_collection()
     to re-register each collection by name
  3. Or use the Python console:

         from chromadb import PersistentClient
         client = PersistentClient(path='./chroma_db')
         collection = client.create_collection(
             name='<original_name>',
             metadata={...},
         )

  4. The collection data will be automatically re-indexed

See docs/CHROMADB_RECOVERY.md for complete instructions
""")

    return True
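

# A minimal sketch of the API-based recovery recommended above, assuming
# chromadb >= 0.4 is installed (it provides PersistentClient) and that the
# caller still knows the original collection names; names stored only in the
# lost SQLite registry cannot be recovered from the segment folders alone.
# This helper is illustrative and is not called by this script.
def reregister_collections(names, persist_path="./chroma_db"):
    """Re-register collections by name via the ChromaDB API (sketch)."""
    from chromadb import PersistentClient

    client = PersistentClient(path=persist_path)
    for name in names:
        # get_or_create_collection is idempotent: it returns the existing
        # collection if one with this name is already registered.
        collection = client.get_or_create_collection(name=name)
        print(f"Registered: {collection.name} (count={collection.count()})")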


def main():
    """Main entry point."""
    try:
        success = rebuild_chroma_index_advanced()

        print("\n" + "=" * 80)
        if success:
            print("✅ ANALYSIS COMPLETE")
            print("\n📝 Action required:")
            print("   1. Follow the recommended recovery approach above")
            print("   2. Or manually delete chroma.sqlite3 and restart Streamlit")
            exit_code = 0
        else:
            print("❌ RECOVERY FAILED")
            exit_code = 1
        print("=" * 80 + "\n")
        sys.exit(exit_code)
    except Exception as e:
        print(f"\n❌ FATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
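

# Hypothetical helper (not called above): print the live schema so it is
# clear why hand-written INSERTs into chroma.sqlite3 are fragile - the table
# layout is a ChromaDB implementation detail that changes across versions.
def dump_sqlite_schema(sqlite_path="./chroma_db/chroma.sqlite3"):
    """Print the CREATE statement of every table in the index (sketch)."""
    conn = sqlite3.connect(sqlite_path)
    try:
        for name, sql in conn.execute(
            "SELECT name, sql FROM sqlite_master WHERE type='table'"
        ):
            print(f"-- {name}\n{sql}\n")
    finally:
        conn.close()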
if __name__ == "__main__":
main()