Spaces:
Running
Running
| import sys | |
| import os | |
| import time | |
| # Add backend to path | |
| sys.path.append(os.path.join(os.getcwd(), 'backend')) | |
| from services.vector_store import VectorStore | |
def debug_legacy_data():
    """Check that a document stored without a session_id is filtered out.

    Inserts one record whose metadata has no ``session_id`` key directly
    into ``store.collection`` (bypassing ``add_document_chunks``), then
    queries through ``find_similar_chunks`` with a session_id and prints
    PASS when no documents come back, FAIL otherwise.

    The temporary record is always deleted again, even when the query or
    the reporting raises, so a failed run does not leave ``legacy_doc_1``
    behind to pollute later runs.
    """
    print("🚀 Starting Debug of Legacy Data...")
    store = VectorStore()

    # Single source of truth for the temporary record's ID (add + delete).
    legacy_id = "legacy_doc_1"

    # 1. Add a document WITHOUT session_id (simulating legacy data).
    # The collection is written to directly so that no session_id gets
    # attached to the record's metadata.
    print("\n1️⃣ Adding legacy document (no session_id)...")
    legacy_text = "This is a legacy document about Ancient Rome."
    legacy_meta = {"source": "history.pdf"}  # No session_id

    # NOTE(review): the "passage: " prefix presumably mirrors what
    # VectorStore uses when it embeds chunks itself -- confirm there.
    embedding = store.embedding_model.encode(
        [f"passage: {legacy_text}"],
        normalize_embeddings=True,
    )
    store.collection.add(
        embeddings=embedding.tolist(),
        metadatas=[legacy_meta],
        documents=[legacy_text],
        ids=[legacy_id],
    )

    # From here on the record exists, so cleanup must run no matter what.
    try:
        # 2. Query WITH a session_id
        session_id = "current_user_session"
        print(f"\n2️⃣ Querying with session_id='{session_id}'...")
        results = store.find_similar_chunks("Rome", session_id=session_id, n_results=5)
        print(f" Results: {results['documents']}")

        # NOTE(review): assumes 'documents' is a list holding one inner
        # list per query -- verify against find_similar_chunks. Any hit at
        # all is reported as FAIL, so this expects no other data stored
        # under this session_id.
        if results['documents'] and results['documents'][0]:
            print(" ❌ FAIL: Legacy document was returned despite filter!")
        else:
            print(" ✅ PASS: Legacy document was NOT returned.")
    finally:
        # Clean up
        store.collection.delete(ids=[legacy_id])
# Script entry point: run the legacy-data check only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    debug_legacy_data()