# CapStoneRAG10 / reset_sqlite_index.py
# Developer
# Initial commit for HuggingFace Spaces - RAG Capstone Project with Qdrant Cloud
# 1d10b0a
"""Reset ChromaDB SQLite index while preserving all collection data."""
import os
import sys
import uuid

import chromadb
from chromadb.config import Settings
def _is_uuid_dirname(name):
    """Return True if *name* is a canonically formatted UUID string."""
    try:
        # uuid.UUID() also accepts braces, "urn:uuid:" prefixes and un-hyphenated
        # hex, so compare against the canonical 8-4-4-4-12 rendering.
        return str(uuid.UUID(name)) == name.lower()
    except ValueError:
        return False


def _find_collection_folders(chroma_path):
    """Return the sorted names of UUID-named subdirectories of *chroma_path*."""
    return sorted(
        entry
        for entry in os.listdir(chroma_path)
        if _is_uuid_dirname(entry)
        and os.path.isdir(os.path.join(chroma_path, entry))
    )


def reset_sqlite_index(chroma_path: str = "./chroma_db") -> bool:
    """Inspect a ChromaDB persistence directory and report its index status.

    The function prints a four-step report to stdout:

    1. Lists the UUID-named subdirectories of ``chroma_path``.
    2. Reports whether ``chroma.sqlite3`` exists and how large it is.
    3. Opens a ``chromadb.PersistentClient`` on ``chroma_path``.
    4. Lists the collections the client knows about, with their counts.

    NOTE(review): the step-3 banner says the SQLite index is rebuilt, but this
    function only opens a client; whether ChromaDB re-registers folders that
    are missing from the index is not established here - confirm.

    NOTE(review): the leading glyphs in the status strings look like
    mis-encoded emoji. They are kept verbatim; confirm the file's encoding.

    Args:
        chroma_path: Path of the ChromaDB persistence directory.

    Returns:
        True when the client lists at least one collection, or when it lists
        none and no collection folders were found on disk. False when the
        directory is missing or unreadable, the client cannot be created,
        the collections cannot be read, or folders exist on disk while the
        client lists no collections.
    """
    rule = "-" * 70

    print("\n" + "=" * 70)
    print("πŸ”§ ChromaDB SQLite Index Reset Tool")
    print("=" * 70)

    # Step 1: Verify collection folders exist
    print("\nπŸ“ Step 1: Scanning for collection folders...")
    print(rule)
    # isdir (not exists) so a regular file at this path is reported as missing.
    if not os.path.isdir(chroma_path):
        print(f"❌ ERROR: {chroma_path} directory not found!")
        return False

    try:
        uuid_folders = _find_collection_folders(chroma_path)
    except OSError as e:
        print(f"❌ Error scanning directory: {e}")
        return False

    print(f"βœ… Found {len(uuid_folders)} collection folder(s)")
    if uuid_folders:
        for i, folder in enumerate(uuid_folders, 1):
            print(f" {i}. {folder}")
            # One unreadable folder should be reported, not abort the run.
            try:
                files = os.listdir(os.path.join(chroma_path, folder))
            except OSError as e:
                print(f" Files: <unreadable: {e}>")
            else:
                print(f" Files: {', '.join(files)}")
    else:
        print("⚠️ WARNING: No collection folders found!")

    # Step 2: Check SQLite status
    print("\nπŸ“Š Step 2: Checking SQLite status...")
    print(rule)
    sqlite_path = os.path.join(chroma_path, "chroma.sqlite3")
    if os.path.exists(sqlite_path):
        sqlite_size = os.path.getsize(sqlite_path)
        print(f"βœ… chroma.sqlite3 exists (size: {sqlite_size:,} bytes)")
        if sqlite_size < 100000:  # Heuristic: under ~100 KB is likely empty
            print("⚠️ SQLite file is very small (likely empty/corrupted)")
    else:
        print("βœ… chroma.sqlite3 does not exist (will be created)")

    # Step 3: Open a client on the directory
    print("\nπŸ”„ Step 3: Rebuilding SQLite index...")
    print(rule)
    try:
        print("Creating fresh ChromaDB PersistentClient...")
        client = chromadb.PersistentClient(
            path=chroma_path,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True,
            ),
        )
        print("βœ… Client created successfully")
    except Exception as e:  # CLI boundary: report any client failure
        print(f"❌ ERROR creating client: {e}")
        return False

    # Step 4: Verify collections are indexed
    print("\nπŸ” Step 4: Verifying collection index...")
    print(rule)
    try:
        collections = client.list_collections()
        print(f"βœ… ChromaDB found {len(collections)} collection(s)")

        if collections:
            print("\nIndexed Collections:")
            for i, collection in enumerate(collections, 1):
                # Some ChromaDB releases return collection names rather than
                # Collection objects; resolve names before reading details.
                if isinstance(collection, str):
                    collection = client.get_collection(collection)
                doc_count = collection.count()
                metadata = collection.metadata
                print(f"\n {i}. {collection.name}")
                print(f" Document count: {doc_count}")
                print(f" Metadata: {metadata}")
            print("\nβœ… SUCCESS! All collections are properly indexed!")
            return True

        if uuid_folders:
            # Collection folders exist on disk but the client lists none.
            print("⚠️ WARNING: Collection folders exist but not indexed in SQLite")
            print("\nThis can happen if:")
            print(" - SQLite file was deleted and recreated without scanning folders")
            print(" - Collection data is corrupted")
            print(" - Permission issues prevent reading collection folders")
            print("\nπŸ“‹ Troubleshooting steps:")
            print(" 1. Check file permissions in chroma_db directory")
            print(" 2. Try deleting chroma.sqlite3 and restarting application")
            print(" 3. See docs/CHROMADB_RECOVERY.md for more options")
            return False

        print("βœ… No collections currently indexed (database is clean)")
        return True
    except Exception as e:  # CLI boundary: report any read failure
        print(f"❌ ERROR reading collections: {e}")
        return False
def main():
    """Run the reset tool, print a closing summary and exit.

    Exits with status 0 when ``reset_sqlite_index()`` returns a truthy value
    and with status 1 otherwise, including on Ctrl-C or any unexpected error
    (for which the traceback is printed first).
    """
    banner = "=" * 70
    try:
        if reset_sqlite_index():
            status = 0
            summary = (
                "βœ… RESET COMPLETE - Collections are properly indexed!",
                "\nπŸ“ Next steps:",
                " 1. Start Streamlit: streamlit run streamlit_app.py",
                " 2. Check 'Existing Collections' dropdown",
                " 3. Load a collection and verify it works",
            )
        else:
            status = 1
            summary = (
                "❌ RESET INCOMPLETE - See messages above for details",
                "\nπŸ“ Next steps:",
                " 1. Review error messages above",
                " 2. Check docs/CHROMADB_RECOVERY.md for solutions",
                " 3. Contact support if issues persist",
            )

        print("\n" + banner)
        for line in summary:
            print(line)
        print(banner + "\n")
        # SystemExit is not an Exception subclass, so the handlers below
        # do not intercept this exit.
        sys.exit(status)
    except KeyboardInterrupt:
        print("\n\n⚠️ Script interrupted by user")
        sys.exit(1)
    except Exception as err:
        print(f"\n❌ FATAL ERROR: {err}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()