# ready to deploy backend (commit 39af4d2)
import os
import sys
import glob
from typing import List, Generator
from bs4 import BeautifulSoup
import google.generativeai as genai
from qdrant_client import QdrantClient, models
# Add the project root to the Python path to allow importing from `src`
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from src.core.config import settings
# --- Configuration ---
# Gemini embedding model used to vectorize document chunks.
EMBEDDING_MODEL = 'models/gemini-embedding-001'
# Qdrant collection that receives the embedded textbook chunks.
COLLECTION_NAME = "textbook_content"
# Markdown docs live in the sibling frontend project; resolved relative to this file.
DOCS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'frontend', 'docs'))
# Vector size of gemini-embedding-001 output; must match the Qdrant collection's
# VectorParams below or upserts will be rejected.
EMBEDDING_DIMENSION = 3072 # For Gemini gemini-embedding-001
def get_documents(path: str) -> List[str]:
    """Collect every Markdown file (.md and .mdx) under *path*, recursively.

    Returns the matching file paths as strings, .md matches first,
    in glob's traversal order.
    """
    print(f"Searching for markdown files in: {path}")
    matches: List[str] = []
    for ext in ("md", "mdx"):
        matches.extend(glob.glob(f"{path}/**/*.{ext}", recursive=True))
    print(f"Found {len(matches)} documents.")
    return matches
def get_text_chunks(file_path: str, chunk_size: int = 2000, overlap: int = 200) -> Generator[str, None, None]:
    """Read a file, strip HTML/Markdown tags, and yield overlapping text chunks.

    Args:
        file_path: Path of the document to read (expected UTF-8).
        chunk_size: Maximum characters per chunk; must be greater than ``overlap``.
        overlap: Characters shared between consecutive chunks (must be >= 0).

    Yields:
        Successive slices of the cleaned text, each at most ``chunk_size`` long.

    Raises:
        ValueError: If ``overlap`` is negative or ``chunk_size <= overlap``
            (the original loop's step would be <= 0, yielding the same
            chunk forever).
    """
    # Validate outside the try-block so a bad configuration surfaces as an
    # error instead of being swallowed by the best-effort except below.
    if overlap < 0:
        raise ValueError("overlap must be non-negative")
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # BeautifulSoup strips any embedded HTML tags; plain Markdown text
        # passes through mostly unchanged.
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text()
        if not text:
            return
        step = chunk_size - overlap  # guaranteed positive by the guards above
        start = 0
        while start < len(text):
            yield text[start:start + chunk_size]
            start += step
    except Exception as e:
        # Best-effort ingestion: an unreadable/unparseable file is logged
        # and skipped rather than aborting the whole run.
        print(f"Error processing file {file_path}: {e}")
        return
def main():
    """
    Main function to run the data ingestion process.
    """
    print("--- Starting Data Ingestion ---")
    # --- Initialize Clients ---
    try:
        genai.configure(api_key=settings.GEMINI_API_KEY)
        qdrant_client = QdrantClient(url=settings.QDRANT_URL, api_key=settings.QDRANT_API_KEY)
        print("Successfully initialized Gemini and Qdrant clients.")
    except Exception as e:
        print(f"Error initializing clients: {e}")
        return
    # --- Setup Qdrant Collection ---
    # NOTE(review): recreate_collection drops any existing collection data on
    # every run — confirm full re-ingestion is intended. It is also deprecated
    # in newer qdrant-client releases in favor of delete + create; verify
    # against the pinned client version.
    print(f"Setting up Qdrant collection: '{COLLECTION_NAME}'")
    try:
        qdrant_client.recreate_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=models.VectorParams(size=EMBEDDING_DIMENSION, distance=models.Distance.COSINE),
        )
        print(f"Collection '{COLLECTION_NAME}' created/recreated successfully.")
    except Exception as e:
        print(f"Error creating Qdrant collection: {e}")
        return
    # --- Process and Upload Documents ---
    # First pass: chunk every document into memory so embeddings can be batched.
    documents = get_documents(DOCS_PATH)
    all_chunks = []
    chunk_metadata = []  # one payload dict per chunk: {"text": ..., "source": ...}
    point_id_counter = 0  # total chunk count; equals len(all_chunks) when the loop ends
    for doc_path in documents:
        print(f"\nProcessing document: {doc_path}")
        for chunk in get_text_chunks(doc_path):
            all_chunks.append(chunk)
            chunk_metadata.append({"text": chunk, "source": os.path.basename(doc_path)})
            point_id_counter += 1
    # Batch embeddings for Gemini
    batch_size = 100
    for i in range(0, len(all_chunks), batch_size):
        batch_chunks = all_chunks[i:i + batch_size]
        batch_metadata = chunk_metadata[i:i + batch_size]
        # Point IDs are global chunk indices, so they stay unique across batches.
        batch_ids = list(range(i, i + len(batch_chunks)))
        try:
            # Generate embeddings using Gemini
            response = genai.embed_content(
                model=EMBEDDING_MODEL,
                content=batch_chunks,
                task_type="retrieval_document"
            )
            # assumes a list input yields a list of vectors under 'embedding' — TODO confirm
            embeddings = response['embedding']
            # Prepare points for Qdrant
            points_to_upsert = []
            for j, embedding in enumerate(embeddings):
                points_to_upsert.append(
                    models.PointStruct(
                        id=batch_ids[j],
                        vector=embedding,
                        payload=batch_metadata[j],
                    )
                )
            # Upsert in batches to Qdrant
            # wait=True blocks until Qdrant has persisted this batch.
            qdrant_client.upsert(collection_name=COLLECTION_NAME, points=points_to_upsert, wait=True)
            print(f"Upserted a batch of {len(points_to_upsert)} points (IDs {batch_ids[0]} - {batch_ids[-1]}).")
        except Exception as e:
            # A failed batch is logged and skipped; remaining batches still run,
            # so the final count below may exceed what was actually upserted.
            print(f"Error processing batch {i} to {i + len(batch_chunks)}: {e}")
    print(f"\n--- Data Ingestion Complete ---")
    print(f"Total points added to collection '{COLLECTION_NAME}': {point_id_counter}")
# Allow running this module directly as a one-shot ingestion script.
if __name__ == "__main__":
    main()