File size: 1,586 Bytes
21f8709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import sys
import os
import time

# Add backend to path
sys.path.append(os.path.join(os.getcwd(), 'backend'))

from services.vector_store import VectorStore

def debug_legacy_data():
    """Verify that session-scoped queries exclude legacy (session-less) documents.

    Inserts a document WITHOUT a ``session_id`` directly into the underlying
    collection (bypassing ``add_document_chunks``), then runs a similarity
    query scoped to a session and reports whether the legacy document leaked
    through the filter. The injected test document is always removed
    afterwards, even if the query raises.
    """
    print("🚀 Starting Debug of Legacy Data...")

    store = VectorStore()

    # 1. Add a document WITHOUT session_id (simulating legacy data).
    # We write to the collection directly so no session_id is attached
    # to the metadata.
    print("\n1️⃣ Adding legacy document (no session_id)...")
    legacy_text = "This is a legacy document about Ancient Rome."
    legacy_meta = {"source": "history.pdf"}  # No session_id

    # NOTE(review): the "passage: " prefix looks like the E5-style embedding
    # convention — confirm it matches what VectorStore uses internally.
    embedding = store.embedding_model.encode(
        [f"passage: {legacy_text}"], normalize_embeddings=True
    )

    store.collection.add(
        embeddings=embedding.tolist(),
        metadatas=[legacy_meta],
        documents=[legacy_text],
        ids=["legacy_doc_1"],
    )

    try:
        # 2. Query WITH a session_id — the legacy doc must be filtered out.
        session_id = "current_user_session"
        print(f"\n2️⃣ Querying with session_id='{session_id}'...")

        results = store.find_similar_chunks("Rome", session_id=session_id, n_results=5)

        print(f"   Results: {results['documents']}")

        if results['documents'] and results['documents'][0]:
            print("   ❌ FAIL: Legacy document was returned despite filter!")
        else:
            print("   ✅ PASS: Legacy document was NOT returned.")
    finally:
        # Always clean up the injected legacy document, even on failure,
        # so repeated runs don't accumulate test data in the collection.
        store.collection.delete(ids=["legacy_doc_1"])

if __name__ == "__main__":
    # Run the legacy-data filter check when executed as a script.
    debug_legacy_data()