Spaces:

sk3078
/

Rag_chatbot

Running

File size: 3,552 Bytes

5391a0a

#!/usr/bin/env python3
"""

Script to update Qdrant vectors that are missing 'content' in payload.

"""

import asyncio
import os
import sys
from typing import Any
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

from app.db.qdrant import QdrantDB
from app.ingestion.reader import extract_text_from_markdown
from app.ingestion.chunker import chunk_text
import logging

# Emit INFO-level progress messages while the script runs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory containing the markdown sources. The path is relative, so it is
# resolved against the current working directory, not this file's location.
SOURCE_DIR = "../website/docs/modules"  # correct path
# Name of the Qdrant collection whose points are inspected and repaired.
# NOTE(review): "clustor" looks like a typo of "cluster" - confirm it matches
# the real collection name before changing it.
COLLECTION_NAME = "test-clustor"

def _load_chunks(full_path, clean_source):
    """Read one markdown file and return the chunks ``chunk_text`` produces."""
    with open(full_path, "r", encoding="utf-8") as f:
        raw_content = f.read()

    text_content = extract_text_from_markdown(raw_content)

    return chunk_text(
        text=text_content,
        source_file=clean_source,
        chunk_size=400,
        overlap=50,
    )


async def update_missing_content():
    """Backfill the ``content`` payload field on Qdrant points that lack it.

    All points of ``COLLECTION_NAME`` are fetched page by page. For every
    point whose ``content`` is missing, blank, or not a string, the text is
    regenerated: the markdown file named by the point's ``source_file``
    payload is read from ``SOURCE_DIR``, converted with
    ``extract_text_from_markdown``, re-chunked with ``chunk_text``
    (``chunk_size=400``, ``overlap=50``), and the chunk at the point's
    ``chunk_index`` is written back with ``set_payload``.

    Points without ``source_file``/``chunk_index``, points whose file cannot
    be found, and points whose ``chunk_index`` is not a valid position in the
    regenerated chunk list are logged and skipped.

    NOTE(review): the repair is only correct if the chunking parameters used
    here match the ones used at ingestion time - confirm.
    """
    qdrant_db = QdrantDB()
    client = await qdrant_db._get_client()

    logger.info("Fetching all points from Qdrant...")

    # scroll() returns (points, next_page_offset); keep requesting pages until
    # the offset comes back as None, otherwise only the first page is repaired.
    page_size = 10000
    points = []
    next_offset = None
    while True:
        page, next_offset = await client.scroll(
            collection_name=COLLECTION_NAME,
            limit=page_size,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,
        )
        points.extend(page)
        # The empty-page check guards against looping forever on an
        # unexpected non-None offset.
        if next_offset is None or not page:
            break

    logger.info(f"Found {len(points)} points in collection")

    updated_count = 0
    # Chunks per cleaned source path (None = file not found), so each file is
    # read and chunked once instead of once per point.
    # Assumes chunk_text is deterministic for identical input - TODO confirm.
    chunk_cache = {}

    for point in points:
        payload = point.payload or {}

        content = payload.get("content")
        if isinstance(content, str) and content.strip():
            # Already has usable text; nothing to repair.
            continue

        source_file = payload.get("source_file")
        chunk_index = payload.get("chunk_index")

        if not source_file or chunk_index is None:
            logger.warning(f"Skipping point {point.id} - missing metadata")
            continue

        # Strip a leading "modules" directory component (either separator),
        # since SOURCE_DIR already points at the modules directory.
        clean_source = source_file
        for prefix in ("modules\\", "modules/"):
            if clean_source.startswith(prefix):
                clean_source = clean_source[len(prefix):]
                break

        full_path = os.path.join(SOURCE_DIR, clean_source.replace("\\", os.path.sep))

        if clean_source not in chunk_cache:
            if os.path.exists(full_path):
                chunk_cache[clean_source] = _load_chunks(full_path, clean_source)
            else:
                chunk_cache[clean_source] = None
        chunks = chunk_cache[clean_source]

        if chunks is None:
            logger.warning(f"File not found even after cleaning: {full_path} (original: {source_file})")
            continue

        logger.info(f"Fixing point {point.id} from {clean_source} chunk {chunk_index}")

        # Reject non-integer and negative indices: a negative value would
        # otherwise silently select a chunk counted from the end of the list.
        if not isinstance(chunk_index, int) or not 0 <= chunk_index < len(chunks):
            logger.warning(f"Chunk index {chunk_index} out of range")
            continue

        missing_content = chunks[chunk_index]["content"]

        # set_payload merges the given keys into the stored payload, so only
        # the repaired field needs to be sent.
        await client.set_payload(
            collection_name=COLLECTION_NAME,
            points=[point.id],
            payload={"content": missing_content},
        )

        updated_count += 1
        logger.info(f"Updated point {point.id} with content")

    logger.info(f"Update complete! Fixed {updated_count} points with missing content.")
    logger.info("Now book questions will give perfect answers with sources!")

# Run the repair only when executed as a script, not when imported.
if __name__ == "__main__":
    # asyncio.run creates an event loop and drives the coroutine to completion.
    asyncio.run(update_missing_content())