# File size: 5,930 Bytes
# 1d10b0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""Reset ChromaDB SQLite index while preserving all collection data."""
import chromadb
from chromadb.config import Settings
import os
import sys


def reset_sqlite_index(chroma_path="./chroma_db"):
    """Inspect a ChromaDB persistence directory and report its index status.

    The function performs four steps, printing progress for each:

    1. Lists sub-directories of ``chroma_path`` whose names look like UUIDs
       (36 characters containing exactly four hyphens).
    2. Reports whether ``chroma.sqlite3`` exists and how large it is.
    3. Opens a ``chromadb.PersistentClient`` on the directory.
    4. Lists the collections the client reports, with document counts and
       metadata.

    NOTE(review): nothing in this function deletes or rewrites
    ``chroma.sqlite3`` itself; whether any index is actually "rebuilt"
    depends on what ``PersistentClient`` does when it opens the directory.
    Confirm against the ChromaDB documentation before relying on it as a
    repair tool.

    Args:
        chroma_path: Path of the ChromaDB persistence directory. Defaults to
            ``"./chroma_db"``, which was previously hard-coded.

    Returns:
        bool: ``True`` when the client lists at least one collection, or when
        it lists none and no UUID-named folders exist either. ``False`` when
        the directory is missing or unreadable, the client cannot be created,
        the collections cannot be read, or UUID-named folders exist but the
        client lists no collections.
    """
    rule = "-" * 70

    print("\n" + "=" * 70)
    print("πŸ”§ ChromaDB SQLite Index Reset Tool")
    print("=" * 70)

    # Step 1: Verify collection folders exist
    print("\nπŸ“ Step 1: Scanning for collection folders...")
    print(rule)

    if not os.path.exists(chroma_path):
        print(f"❌ ERROR: {chroma_path} directory not found!")
        return False

    # Find all UUID-looking folders. The cheap name checks run before the
    # isdir() stat call; sorting makes the report order deterministic.
    try:
        uuid_folders = sorted(
            item
            for item in os.listdir(chroma_path)
            if len(item) == 36
            and item.count('-') == 4
            and os.path.isdir(os.path.join(chroma_path, item))
        )
    except OSError as e:
        print(f"❌ Error scanning directory: {e}")
        return False

    print(f"βœ… Found {len(uuid_folders)} collection folder(s)")

    if uuid_folders:
        for i, folder in enumerate(uuid_folders, 1):
            print(f"  {i}. {folder}")
            # A single unreadable folder should not abort the whole report.
            try:
                files = os.listdir(os.path.join(chroma_path, folder))
            except OSError as e:
                print(f"     Files: <unreadable: {e}>")
                continue
            print(f"     Files: {', '.join(files)}")
    else:
        print("⚠️  WARNING: No collection folders found!")

    # Step 2: Check SQLite status
    print("\nπŸ“Š Step 2: Checking SQLite status...")
    print(rule)

    sqlite_path = os.path.join(chroma_path, "chroma.sqlite3")
    if os.path.exists(sqlite_path):
        sqlite_size = os.path.getsize(sqlite_path)
        print(f"βœ… chroma.sqlite3 exists (size: {sqlite_size:,} bytes)")

        if sqlite_size < 100000:  # Less than 100KB is likely empty
            print("⚠️  SQLite file is very small (likely empty/corrupted)")
    else:
        print("βœ… chroma.sqlite3 does not exist (will be created)")

    # Step 3: Reset by creating new client
    print("\nπŸ”„ Step 3: Rebuilding SQLite index...")
    print(rule)

    # Broad catch is deliberate: this is a reporting boundary and the
    # exception types raised by the client library are not visible here.
    try:
        print("Creating fresh ChromaDB PersistentClient...")
        client = chromadb.PersistentClient(
            path=chroma_path,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )
        print("βœ… Client created successfully")
    except Exception as e:
        print(f"❌ ERROR creating client: {e}")
        return False

    # Step 4: Verify collections are indexed
    print("\nπŸ” Step 4: Verifying collection index...")
    print(rule)

    try:
        collections = client.list_collections()
        print(f"βœ… ChromaDB found {len(collections)} collection(s)")

        if collections:
            print("\nIndexed Collections:")
            for i, entry in enumerate(collections, 1):
                # Some ChromaDB releases return bare collection names from
                # list_collections() instead of Collection objects; resolve
                # names so count()/metadata are available either way.
                if isinstance(entry, str):
                    collection = client.get_collection(entry)
                else:
                    collection = entry
                doc_count = collection.count()
                metadata = collection.metadata
                print(f"\n  {i}. {collection.name}")
                print(f"     Document count: {doc_count}")
                print(f"     Metadata: {metadata}")

            print("\nβœ… SUCCESS! All collections are properly indexed!")
            return True

        if uuid_folders:
            # Collection folders exist on disk but the client lists nothing.
            print("⚠️  WARNING: Collection folders exist but not indexed in SQLite")
            print("\nThis can happen if:")
            print("  - SQLite file was deleted and recreated without scanning folders")
            print("  - Collection data is corrupted")
            print("  - Permission issues prevent reading collection folders")

            print("\nπŸ“‹ Troubleshooting steps:")
            print("  1. Check file permissions in chroma_db directory")
            print("  2. Try deleting chroma.sqlite3 and restarting application")
            print("  3. See docs/CHROMADB_RECOVERY.md for more options")

            return False

        print("βœ… No collections currently indexed (database is clean)")
        return True

    except Exception as e:
        print(f"❌ ERROR reading collections: {e}")
        return False


def main():
    """Run the reset tool, print a closing summary, and exit.

    Exits with status 0 when ``reset_sqlite_index()`` returns a truthy
    value and with status 1 otherwise, including when the run is
    interrupted from the keyboard or fails with an unexpected exception.
    """
    try:
        succeeded = reset_sqlite_index()
        banner = "=" * 70

        print("\n" + banner)
        if not succeeded:
            status = 1
            summary = (
                "❌ RESET INCOMPLETE - See messages above for details",
                "\nπŸ“ Next steps:",
                "  1. Review error messages above",
                "  2. Check docs/CHROMADB_RECOVERY.md for solutions",
                "  3. Contact support if issues persist",
            )
        else:
            status = 0
            summary = (
                "βœ… RESET COMPLETE - Collections are properly indexed!",
                "\nπŸ“ Next steps:",
                "  1. Start Streamlit: streamlit run streamlit_app.py",
                "  2. Check 'Existing Collections' dropdown",
                "  3. Load a collection and verify it works",
            )

        for line in summary:
            print(line)

        print(banner + "\n")
        # SystemExit is not an Exception subclass, so the handlers below
        # do not intercept this normal exit.
        sys.exit(status)

    except KeyboardInterrupt:
        print("\n\n⚠️  Script interrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"\n❌ FATAL ERROR: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()