import os
import sys
import glob
from typing import List, Generator
from bs4 import BeautifulSoup
import google.generativeai as genai
from qdrant_client import QdrantClient, models
# Add the project root to the Python path to allow importing from `src`
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from src.core.config import settings
# --- Configuration ---
# Gemini embedding model identifier passed to genai.embed_content.
EMBEDDING_MODEL = 'models/gemini-embedding-001'
# Qdrant collection that receives the embedded chunks.
COLLECTION_NAME = "textbook_content"
# Markdown sources live in the sibling `frontend/docs` tree, three levels up
# from this file.
DOCS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'frontend', 'docs'))
# Vector size must match the model's output dimension exactly, or Qdrant
# rejects the upserts.
EMBEDDING_DIMENSION = 3072 # For Gemini gemini-embedding-001
def get_documents(path: str, patterns: tuple = ("*.md", "*.mdx")) -> List[str]:
    """Recursively find documentation files under *path*.

    Args:
        path: Root directory to search.
        patterns: Glob filename patterns to match. Defaults to markdown
            and MDX files, preserving the original behavior; callers may
            pass other patterns to ingest additional formats.

    Returns:
        List of matching file paths, grouped by pattern in the order the
        patterns are given (all ``*.md`` hits before ``*.mdx`` by default).
    """
    print(f"Searching for markdown files in: {path}")
    files: List[str] = []
    for pattern in patterns:
        # `**` with recursive=True also matches files directly under `path`.
        files.extend(glob.glob(f"{path}/**/{pattern}", recursive=True))
    print(f"Found {len(files)} documents.")
    return files
def get_text_chunks(file_path: str, chunk_size: int = 2000, overlap: int = 200) -> Generator[str, None, None]:
    """Read a file, strip HTML tags, and yield overlapping text chunks.

    Args:
        file_path: Path of the markdown/MDX file to read (UTF-8).
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks; must be
            smaller than ``chunk_size`` so each step advances.

    Yields:
        Windows of at most ``chunk_size`` characters, each starting
        ``chunk_size - overlap`` characters after the previous one.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``
            (the loop would otherwise never terminate).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # BeautifulSoup strips inline HTML tags; plain markdown syntax
        # passes through get_text() unchanged.
        text = BeautifulSoup(content, 'html.parser').get_text()
        if not text:
            return
        start = 0
        while start < len(text):
            end = start + chunk_size
            yield text[start:end]
            if end >= len(text):
                # BUGFIX: the last chunk reached end-of-text. Without this
                # break, a text whose final chunk ends exactly at len(text)
                # would emit one more overlap-only chunk that is entirely
                # contained in the chunk just yielded (duplicate data).
                break
            start += chunk_size - overlap
    except Exception as e:
        # Best-effort ingestion: log and skip unreadable/unparsable files
        # rather than aborting the whole run.
        print(f"Error processing file {file_path}: {e}")
        return
def main():
    """Run the full data-ingestion pipeline.

    Steps:
      1. Configure the Gemini client and connect to Qdrant.
      2. (Re)create the target vector collection.
      3. Chunk every markdown document, embed the chunks in batches with
         Gemini, and upsert the vectors (with text/source payloads) into
         Qdrant, using the global chunk index as the point ID.

    Setup failures abort the run; per-batch embedding/upsert failures are
    logged and skipped so one bad batch does not stop ingestion.
    """
    print("--- Starting Data Ingestion ---")
    # --- Initialize Clients ---
    try:
        genai.configure(api_key=settings.GEMINI_API_KEY)
        qdrant_client = QdrantClient(url=settings.QDRANT_URL, api_key=settings.QDRANT_API_KEY)
        print("Successfully initialized Gemini and Qdrant clients.")
    except Exception as e:
        print(f"Error initializing clients: {e}")
        return
    # --- Setup Qdrant Collection ---
    print(f"Setting up Qdrant collection: '{COLLECTION_NAME}'")
    try:
        # recreate_collection drops any existing data, so repeated runs are
        # idempotent rather than accumulating stale points.
        qdrant_client.recreate_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=models.VectorParams(size=EMBEDDING_DIMENSION, distance=models.Distance.COSINE),
        )
        print(f"Collection '{COLLECTION_NAME}' created/recreated successfully.")
    except Exception as e:
        print(f"Error creating Qdrant collection: {e}")
        return
    # --- Process and Upload Documents ---
    documents = get_documents(DOCS_PATH)
    all_chunks = []
    chunk_metadata = []
    for doc_path in documents:
        print(f"\nProcessing document: {doc_path}")
        for chunk in get_text_chunks(doc_path):
            all_chunks.append(chunk)
            chunk_metadata.append({"text": chunk, "source": os.path.basename(doc_path)})
    # Embed and upsert in batches; the global chunk index doubles as the
    # Qdrant point ID, so IDs stay stable across batches.
    batch_size = 100
    # BUGFIX: count only points actually written to Qdrant. The previous
    # version reported the number of chunks *prepared*, which overstated
    # the total whenever an embedding/upsert batch failed.
    upserted_count = 0
    for i in range(0, len(all_chunks), batch_size):
        batch_chunks = all_chunks[i:i + batch_size]
        batch_metadata = chunk_metadata[i:i + batch_size]
        batch_ids = list(range(i, i + len(batch_chunks)))
        try:
            # Generate embeddings for the whole batch in one API call.
            response = genai.embed_content(
                model=EMBEDDING_MODEL,
                content=batch_chunks,
                task_type="retrieval_document"
            )
            embeddings = response['embedding']
            points_to_upsert = [
                models.PointStruct(
                    id=batch_ids[j],
                    vector=embedding,
                    payload=batch_metadata[j],
                )
                for j, embedding in enumerate(embeddings)
            ]
            # wait=True blocks until Qdrant has persisted the batch, so the
            # success log (and counter) reflect durable writes.
            qdrant_client.upsert(collection_name=COLLECTION_NAME, points=points_to_upsert, wait=True)
            upserted_count += len(points_to_upsert)
            print(f"Upserted a batch of {len(points_to_upsert)} points (IDs {batch_ids[0]} - {batch_ids[-1]}).")
        except Exception as e:
            print(f"Error processing batch {i} to {i + len(batch_chunks)}: {e}")
    print(f"\n--- Data Ingestion Complete ---")
    print(f"Total points added to collection '{COLLECTION_NAME}': {upserted_count}")
# Allow this module to be executed directly as an ingestion script.
if __name__ == "__main__":
    main()