# CapStoneRAG10 / archived_scripts / audit_collection_names.py
# Developer
# Initial commit for HuggingFace Spaces - RAG Capstone Project with Qdrant Cloud
# 1d10b0a
"""Query SQLite to find existing collection names."""
import sqlite3
from contextlib import closing
from pathlib import Path
DEFAULT_DB_PATH = 'chroma_db/chroma.sqlite3'
RULE = "=" * 80

# NOTE(review): the table and column names queried below (``sm.id``,
# ``sm.metadata``, ``config_json_str``, ``coll_id``) are kept exactly as the
# script had them; confirm they match the installed ChromaDB schema. A
# mismatch is reported as an "Error: ..." line for that section, not raised.


def _audit_segments_schema(cursor):
    """Print the name and declared type of the first five ``segments`` columns."""
    print("\n📋 Segments Table Schema:")
    try:
        cursor.execute('PRAGMA table_info(segments)')
        # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
        for col in cursor.fetchall()[:5]:  # First 5 columns
            print(f" {col[1]} ({col[2]})")
    except Exception as e:  # best-effort section: report and carry on
        print(f" Error: {e}")


def _audit_segment_metadata(cursor):
    """Print up to four segments joined with their metadata."""
    print("\n📖 Segment Metadata (sample):")
    try:
        cursor.execute("""
            SELECT DISTINCT s.id, s.collection_id, sm.metadata
            FROM segments s
            LEFT JOIN segment_metadata sm ON s.id = sm.id
            LIMIT 4
        """)
        rows = cursor.fetchall()
        for i, (seg_id, coll_id, metadata) in enumerate(rows, 1):
            print(f"\n {i}. Segment ID: {seg_id[:20]}...")
            if coll_id:
                print(f" Collection: {coll_id}")
            if metadata:
                print(f" Metadata: {metadata[:80]}...")
    except Exception as e:  # best-effort section: report and carry on
        print(f" Error: {e}")


def _audit_collections(cursor):
    """Print the id, name and truncated config of every ``collections`` row."""
    print("\n📚 Collections Table:")
    try:
        cursor.execute('SELECT id, name, config_json_str FROM collections')
        rows = cursor.fetchall()
        for i, (coll_id, name, config) in enumerate(rows, 1):
            print(f"\n {i}. Name: {name}")
            print(f" ID: {coll_id}")
            if config:
                print(f" Config: {config[:100]}...")
    except Exception as e:  # best-effort section: report and carry on
        print(f" Error: {e}")


def _audit_embedding_counts(cursor):
    """Print how many ``embeddings`` rows exist per ``coll_id``."""
    # Try to find real names by checking what was previously stored
    print("\n🔍 Searching for original collection names...")
    try:
        cursor.execute("""
            SELECT DISTINCT coll_id, COUNT(*) as doc_count
            FROM embeddings
            GROUP BY coll_id
        """)
        rows = cursor.fetchall()
        print("\n Collections with embeddings:")
        for coll_id, count in rows:
            print(f" {coll_id}: {count} embeddings")
    except Exception as e:  # best-effort section: report and carry on
        print(f" Could not query embeddings: {e}")


def main(db_path=DEFAULT_DB_PATH):
    """Print an audit report of the ChromaDB SQLite file at *db_path*.

    Each section (segments schema, segment metadata, collections, embedding
    counts) is queried independently; a failure in one is printed and does
    not stop the others.

    Args:
        db_path: Path to the SQLite database file to inspect.

    Returns:
        0 when the report was produced, 1 when *db_path* is not an existing
        file.
    """
    # sqlite3.connect() creates the file when it does not exist; an audit
    # must not leave an empty database behind, so check first.
    if not Path(db_path).is_file():
        print(f"Error: database file not found: {db_path}")
        return 1

    # A sqlite3 connection used directly as a context manager only manages
    # the transaction; closing() guarantees the connection itself is closed.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()

        print(RULE)
        print("📊 ChromaDB Collection Names Audit")
        print(RULE)

        _audit_segments_schema(cursor)
        _audit_segment_metadata(cursor)
        _audit_collections(cursor)
        _audit_embedding_counts(cursor)

    print("\n" + RULE)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())