Rag_chatbot / scripts /update_missing_content.py
suhail
new changes
5391a0a
#!/usr/bin/env python3
"""
Script to update Qdrant vectors that are missing 'content' in payload.
"""
import asyncio
import os
import sys
from typing import Any
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from app.db.qdrant import QdrantDB
from app.ingestion.reader import extract_text_from_markdown
from app.ingestion.chunker import chunk_text
import logging
# Configure the root logger so INFO-level progress messages from this script are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Directory joined with each point's source-file path to locate the file on disk.
# It is a relative path, so it is resolved against the current working directory.
SOURCE_DIR = "../website/docs/modules"  # correct path
# Name of the Qdrant collection that is scrolled and patched.
# NOTE(review): the spelling "clustor" must match the real collection name - confirm.
COLLECTION_NAME = "test-clustor"
def _is_blank(value: Any) -> bool:
    """Return True when *value* is falsy or a whitespace-only string."""
    if not value:
        return True
    # Non-string truthy values have no .strip(); treat them as present.
    return isinstance(value, str) and not value.strip()


def _strip_modules_prefix(source_file: str) -> str:
    """Remove one leading 'modules' path segment (either separator style)."""
    for prefix in ("modules\\", "modules/"):
        if source_file.startswith(prefix):
            return source_file[len(prefix):]
    return source_file


async def _scroll_all_points(client: Any, page_size: int = 1000) -> list:
    """Collect every point of COLLECTION_NAME by following scroll pagination."""
    collected = []
    offset = None
    while True:
        # Assumes the qdrant-client async API, where scroll() returns
        # (points, next_page_offset) and the offset is None on the last page.
        page, offset = await client.scroll(
            collection_name=COLLECTION_NAME,
            limit=page_size,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        collected.extend(page)
        # An empty page with a non-None offset should not happen; stop anyway
        # so a misbehaving server cannot cause an endless loop.
        if offset is None or not page:
            return collected


def _load_chunks(full_path: str, clean_source: str) -> Any:
    """Read the file at *full_path* and return its text split into chunks."""
    with open(full_path, "r", encoding="utf-8") as f:
        raw_content = f.read()
    text_content = extract_text_from_markdown(raw_content)
    # NOTE(review): chunk_size/overlap presumably have to match the values used
    # at ingestion time so chunk indexes line up - confirm.
    return chunk_text(
        text=text_content,
        source_file=clean_source,
        chunk_size=400,
        overlap=50,
    )


async def update_missing_content():
    """Fill in the 'content' payload field for points where it is missing.

    Scrolls through every point of COLLECTION_NAME, and for each point whose
    payload has no (or only whitespace) 'content', re-reads the file named by
    the payload's 'source_file' under SOURCE_DIR, re-chunks it, and stores the
    chunk at the payload's 'chunk_index' as the point's 'content'.

    Points are skipped, with a warning, when the metadata is missing, the
    source file cannot be found or read, or the chunk index is not a valid
    position in the regenerated chunk list.

    Returns:
        None. Progress and the number of updated points are logged.
    """
    qdrant_db = QdrantDB()
    client = await qdrant_db._get_client()

    logger.info("Fetching all points from Qdrant...")
    points = await _scroll_all_points(client)
    logger.info(f"Found {len(points)} points in collection")

    # clean_source -> chunk list (or None when the file could not be read), so
    # each file is read and chunked once no matter how many points reference it.
    chunk_cache = {}
    updated_count = 0

    for point in points:
        payload = point.payload or {}
        if not _is_blank(payload.get("content")):
            continue

        source_file = payload.get("source_file")
        chunk_index = payload.get("chunk_index")
        if not source_file or not isinstance(source_file, str) or chunk_index is None:
            logger.warning(f"Skipping point {point.id} - missing metadata")
            continue

        clean_source = _strip_modules_prefix(source_file)
        # Stored paths may use Windows separators; normalise for this OS.
        full_path = os.path.join(SOURCE_DIR, clean_source.replace("\\", os.path.sep))
        if not os.path.isfile(full_path):
            logger.warning(f"File not found even after cleaning: {full_path} (original: {source_file})")
            continue

        logger.info(f"Fixing point {point.id} from {clean_source} chunk {chunk_index}")

        if clean_source not in chunk_cache:
            try:
                chunk_cache[clean_source] = _load_chunks(full_path, clean_source)
            except (OSError, UnicodeDecodeError) as exc:
                # One unreadable file should not abort the whole repair run.
                logger.warning(f"Could not read {full_path}: {exc}")
                chunk_cache[clean_source] = None
        chunks = chunk_cache[clean_source]
        if chunks is None:
            continue

        # Reject negative indexes too: they would silently select a chunk
        # counted from the end of the list.
        if not isinstance(chunk_index, int) or not 0 <= chunk_index < len(chunks):
            logger.warning(f"Chunk index {chunk_index} out of range")
            continue

        missing_content = chunks[chunk_index]["content"]
        # set_payload only touches the keys it is given, so sending just
        # 'content' leaves the rest of the stored payload as it is.
        await client.set_payload(
            collection_name=COLLECTION_NAME,
            points=[point.id],
            payload={"content": missing_content},
        )
        updated_count += 1
        logger.info(f"Updated point {point.id} with content")

    logger.info(f"Update complete! Fixed {updated_count} points with missing content.")
    logger.info("Now book questions will give perfect answers with sources!")
if __name__ == "__main__":
    # Script entry point: build the coroutine, then drive it to completion
    # on a fresh event loop.
    repair_job = update_missing_content()
    asyncio.run(repair_job)