"""Ingest script: embed Quran ayahs from CSV and upload them to Qdrant.

Loads the embedding model, reads `app/data/ayas.csv` (must contain
'answers' and 'arabic' columns), and — if the target collection does not
already exist — encodes every unique ayah and upserts the vectors with
their text payloads into Qdrant.
"""

import pandas as pd
import torch
from pympler import asizeof
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer

from app.qdrant_client import client

COLLECTION_NAME = "ayahs_collection"

print("Loading model and data...")

# --- Setup device ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# --- Load model ---
model = SentenceTransformer("MossaabDev/Quran_embed_V2.2", device=device)
print("Model size:", asizeof.asizeof(model))

# --- Load ayahs from ayas.csv ---
df = pd.read_csv("app/data/ayas.csv", encoding="utf-8")

# Ensure expected columns
if not {'answers', 'arabic'}.issubset(df.columns):
    raise ValueError("❌ 'ayas.csv' must contain 'answers' and 'arabic' columns.")

# Remove duplicates and NaN (dedup keyed on the answer text only)
df = df.dropna(subset=['answers', 'arabic']).drop_duplicates(subset=['answers'])
ayat = df['answers'].tolist()
# Hoist the arabic column once instead of a per-row df.iloc lookup in the
# point-building loop below; both lists share the same positional order.
arabic_texts = df['arabic'].tolist()
print(f"Total unique ayat loaded: {len(ayat)}")
print("✅ Model and data ready.")

# --- Check if collection exists ---
collections = [c.name for c in client.get_collections().collections]
if COLLECTION_NAME not in collections:
    print("Creating Qdrant collection and uploading embeddings...")
    embeddings = model.encode(
        ayat, convert_to_tensor=False, show_progress_bar=True
    ).tolist()

    # We just verified the collection is absent, so the non-destructive
    # create_collection is correct here (recreate_collection is deprecated
    # and would silently drop an existing collection).
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=len(embeddings[0]),
            distance=models.Distance.COSINE,
        ),
    )

    points = [
        models.PointStruct(
            id=idx,
            vector=emb,
            payload={
                "text": ayah,
                "arabic": ar,
            },
        )
        for idx, (emb, ayah, ar) in enumerate(zip(embeddings, ayat, arabic_texts))
    ]

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    print("✅ Embeddings uploaded to Qdrant.")
else:
    print("✅ Collection already exists, skipping upload.")