#!/usr/bin/env python3
"""
Script to update Qdrant vectors that are missing 'content' in payload.

For every point in COLLECTION_NAME whose payload has no usable "content",
the script re-reads the markdown file named by the payload's "source_file",
re-chunks it with the same parameters used here (chunk_size=400, overlap=50)
and writes the chunk selected by the payload's "chunk_index" back into the
point's payload.
"""

import asyncio
import logging
import os
import sys
from typing import Any

# Make the parent directory importable so the `app` package resolves when this
# script is executed directly.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

from app.db.qdrant import QdrantDB
from app.ingestion.reader import extract_text_from_markdown
from app.ingestion.chunker import chunk_text

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Correct path to the markdown sources. It is a relative path, so it is
# resolved against the current working directory at run time.
SOURCE_DIR = "../website/docs/modules"
COLLECTION_NAME = "test-clustor"

# Number of points requested per scroll page.
_SCROLL_PAGE_SIZE = 10000


async def _fetch_all_points(client: Any) -> list:
    """Return every point of COLLECTION_NAME, following scroll pagination.

    A single scroll call returns at most `limit` points together with the
    offset of the next page; the loop keeps requesting pages until no
    further offset is reported.
    """
    points: list = []
    offset = None
    while True:
        batch, offset = await client.scroll(
            collection_name=COLLECTION_NAME,
            limit=_SCROLL_PAGE_SIZE,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        points.extend(batch)
        # Stop on the last page; the empty-batch check guards against looping
        # forever should a server ever return an offset with no data.
        if offset is None or not batch:
            break
    return points


def _strip_modules_prefix(source_file: str) -> str:
    """Remove one leading "modules\\" or "modules/" prefix, if present."""
    for prefix in ("modules\\", "modules/"):
        if source_file.startswith(prefix):
            return source_file[len(prefix):]
    return source_file


def _load_chunks(full_path: str, clean_source: str) -> Any:
    """Read the markdown file at *full_path* and return its chunks."""
    with open(full_path, "r", encoding="utf-8") as f:
        raw_content = f.read()

    text_content = extract_text_from_markdown(raw_content)
    return chunk_text(
        text=text_content,
        source_file=clean_source,
        chunk_size=400,
        overlap=50,
    )


async def update_missing_content() -> None:
    """Fill in the "content" payload field of points where it is missing.

    A point is repaired when its "content" is absent, empty or whitespace
    only. Points are skipped (with a warning) when they lack "source_file"
    or "chunk_index", when the source file cannot be found, when the chunk
    index does not address an existing chunk, or when "content" holds a
    non-string value.
    """
    qdrant_db = QdrantDB()
    client = await qdrant_db._get_client()

    logger.info("Fetching all points from Qdrant...")
    points = await _fetch_all_points(client)
    logger.info(f"Found {len(points)} points in collection")

    # Chunks per cleaned source path, so a file shared by many points is
    # read and chunked only once.
    chunk_cache: dict = {}
    updated_count = 0

    for point in points:
        payload = point.payload or {}
        content = payload.get("content")

        if content:
            if not isinstance(content, str):
                # Unexpected payload shape: do not crash and do not overwrite.
                logger.warning(
                    f"Skipping point {point.id} - content is not a string"
                )
                continue
            if content.strip():
                continue  # Content already present; nothing to repair.

        source_file = payload.get("source_file")
        chunk_index = payload.get("chunk_index")

        if not source_file or chunk_index is None:
            logger.warning(f"Skipping point {point.id} - missing metadata")
            continue

        # SOURCE_DIR already points at the modules directory, so drop the
        # leading "modules" component stored in the payload.
        clean_source = _strip_modules_prefix(source_file)
        full_path = os.path.join(
            SOURCE_DIR, clean_source.replace("\\", os.path.sep)
        )

        # isfile (not exists) so that a directory path is reported instead of
        # failing later inside open().
        if not os.path.isfile(full_path):
            logger.warning(f"File not found even after cleaning: {full_path} (original: {source_file})")
            continue

        logger.info(f"Fixing point {point.id} from {clean_source} chunk {chunk_index}")

        if clean_source not in chunk_cache:
            chunk_cache[clean_source] = _load_chunks(full_path, clean_source)
        chunks = chunk_cache[clean_source]

        # Reject negative and non-integer indices: a negative value would
        # silently select a chunk counted from the end of the list.
        if not (isinstance(chunk_index, int) and 0 <= chunk_index < len(chunks)):
            logger.warning(f"Chunk index {chunk_index} out of range")
            continue

        new_payload = payload.copy()
        new_payload["content"] = chunks[chunk_index]["content"]

        await client.set_payload(
            collection_name=COLLECTION_NAME,
            points=[point.id],
            payload=new_payload,
        )

        updated_count += 1
        logger.info(f"Updated point {point.id} with content")

    logger.info(f"Update complete! Fixed {updated_count} points with missing content.")
    logger.info("Now book questions will give perfect answers with sources!")


if __name__ == "__main__":
    asyncio.run(update_missing_content())