Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Script to update Qdrant vectors that are missing 'content' in payload. | |
| """ | |
| import asyncio | |
| import os | |
| import sys | |
| from typing import Any | |
| sys.path.append(os.path.join(os.path.dirname(__file__), '..')) | |
| from app.db.qdrant import QdrantDB | |
| from app.ingestion.reader import extract_text_from_markdown | |
| from app.ingestion.chunker import chunk_text | |
| import logging | |
# Show INFO-level progress messages when the script runs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Directory containing the markdown sources. The path is relative, so it is
# resolved against the current working directory, not this file's location.
SOURCE_DIR = "../website/docs/modules" # correct path
# Qdrant collection whose points are inspected and patched.
# NOTE(review): "clustor" looks like a misspelling of "cluster" - confirm it
# matches the real collection name before changing it.
COLLECTION_NAME = "test-clustor"
async def update_missing_content() -> None:
    """Backfill the ``content`` payload field on Qdrant points that lack it.

    Scrolls through every point in ``COLLECTION_NAME`` (following the
    pagination offset until the collection is exhausted). For each point whose
    payload has no non-blank ``content``, the markdown file named by the
    payload's ``source_file`` is read from ``SOURCE_DIR``, converted to text
    and re-chunked, and the chunk at the payload's ``chunk_index`` is written
    back as ``content``.

    Points are skipped (with a warning) when ``source_file``/``chunk_index``
    metadata is missing, when the source file cannot be found, or when
    ``chunk_index`` is not a valid index into the regenerated chunks.

    NOTE(review): the chunking parameters (chunk_size=400, overlap=50) are
    assumed to match the ones used at ingestion time; if they differ, the
    regenerated chunks will not line up with the stored ``chunk_index``.
    """
    qdrant_db = QdrantDB()
    client = await qdrant_db._get_client()

    logger.info("Fetching all points from Qdrant...")
    points = []
    next_offset = None
    while True:
        # scroll() returns one page plus the offset of the next page; a single
        # call would silently ignore everything beyond the first `limit` points.
        page, next_offset = await client.scroll(
            collection_name=COLLECTION_NAME,
            limit=10000,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,
        )
        points.extend(page)
        if next_offset is None or not page:
            break
    logger.info(f"Found {len(points)} points in collection")

    # Chunks per cleaned source path, so each file is read and chunked once
    # no matter how many points reference it.
    chunks_by_source = {}
    updated_count = 0

    for point in points:
        payload = point.payload or {}
        existing = payload.get("content")
        if existing and existing.strip():
            continue  # content already present, nothing to fix

        source_file = payload.get("source_file")
        chunk_index = payload.get("chunk_index")
        if not source_file or chunk_index is None:
            logger.warning(f"Skipping point {point.id} - missing metadata")
            continue

        # Remove a leading "modules\" or "modules/" because SOURCE_DIR already
        # points at the modules directory.
        clean_source = source_file
        for prefix in ("modules\\", "modules/"):
            if clean_source.startswith(prefix):
                clean_source = clean_source[len(prefix):]
                break

        # Stored paths may use Windows separators; normalise for this OS.
        full_path = os.path.join(SOURCE_DIR, clean_source.replace("\\", os.path.sep))
        if not os.path.exists(full_path):
            logger.warning(f"File not found even after cleaning: {full_path} (original: {source_file})")
            continue

        logger.info(f"Fixing point {point.id} from {clean_source} chunk {chunk_index}")

        chunks = chunks_by_source.get(clean_source)
        if chunks is None:
            with open(full_path, "r", encoding="utf-8") as f:
                raw_content = f.read()
            chunks = chunk_text(
                text=extract_text_from_markdown(raw_content),
                source_file=clean_source,
                chunk_size=400,
                overlap=50,
            )
            chunks_by_source[clean_source] = chunks

        # Reject negative or non-integer indices too: a negative index would
        # otherwise silently select a chunk counted from the end of the list.
        if not isinstance(chunk_index, int) or not 0 <= chunk_index < len(chunks):
            logger.warning(f"Chunk index {chunk_index} out of range")
            continue

        # set_payload only sets the given keys on the point, so sending just
        # "content" is enough and avoids rewriting the other fields from a
        # possibly stale snapshot.
        await client.set_payload(
            collection_name=COLLECTION_NAME,
            points=[point.id],
            payload={"content": chunks[chunk_index]["content"]},
        )
        updated_count += 1
        logger.info(f"Updated point {point.id} with content")

    logger.info(f"Update complete! Fixed {updated_count} points with missing content.")
    logger.info("Now book questions will give perfect answers with sources!")
# Run the backfill only when this file is executed as a script (not on
# import); asyncio.run drives the coroutine to completion.
if __name__ == "__main__":
    asyncio.run(update_missing_content())