File size: 1,958 Bytes
38bac22
 
 
 
 
 
7a1f92c
38bac22
 
7a1f92c
38bac22
 
 
7a1f92c
eebabd3
7a1f92c
 
 
 
38bac22
7a1f92c
 
 
38bac22
7a1f92c
 
 
 
38bac22
7a1f92c
38bac22
 
 
 
 
7a1f92c
 
 
38bac22
 
 
 
 
 
 
 
 
 
 
 
7a1f92c
 
 
 
38bac22
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from app.qdrant_client import client
from qdrant_client.http import models
from pympler import asizeof

print("Loading model and data...")

# --- Choose where inference runs: the GPU when CUDA is usable, else the CPU ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# --- Instantiate the sentence-embedding model on the chosen device ---
model = SentenceTransformer(
    "MossaabDev/Quran_embed_V2.2",
    device=device,
)
# Diagnostic only: the size of the model object as reported by pympler's asizeof.
print("Model size:", asizeof.asizeof(model))

# --- Read the ayah table from ayas.csv ---
df = pd.read_csv("app/data/ayas.csv", encoding="utf-8")

# Fail fast when either of the two columns used below is absent
if not {'answers', 'arabic'} <= set(df.columns):
    raise ValueError("❌ 'ayas.csv' must contain 'answers' and 'arabic' columns.")

# Discard rows with a missing value in either column, then keep a single row
# per distinct 'answers' text
df = df.dropna(subset=['answers', 'arabic'])
df = df.drop_duplicates(subset=['answers'])
ayat = df['answers'].tolist()
print(f"Total unique ayat loaded: {len(ayat)}")

print("✅ Model and data ready.")

# --- Create and populate the Qdrant collection on first run ---
# The collection is built only when it is absent; an existing collection is
# left untouched and the (slow) embedding step is skipped entirely.
collections = [c.name for c in client.get_collections().collections]
if "ayahs_collection" not in collections:
    print("Creating Qdrant collection and uploading embeddings...")

    # With no rows, embeddings[0] below would raise an opaque IndexError;
    # fail with an actionable message instead.
    if not ayat:
        raise ValueError("❌ No ayat to embed: 'ayas.csv' has no usable rows.")

    # Encode before creating the collection so an encoding failure leaves
    # nothing behind in Qdrant.
    embeddings = model.encode(ayat, convert_to_tensor=False, show_progress_bar=True).tolist()

    # Read the column once instead of building a row Series per point.
    # Positions line up with `ayat`: both come from the same filtered frame.
    arabic_texts = df['arabic'].tolist()

    # The check above established that the collection is absent, so a plain
    # create is enough (recreate_collection is deprecated in recent
    # qdrant-client releases).
    client.create_collection(
        collection_name="ayahs_collection",
        vectors_config=models.VectorParams(
            size=len(embeddings[0]),
            distance=models.Distance.COSINE,
        ),
    )

    # Point ids are the 0-based positions in the de-duplicated list.
    points = [
        models.PointStruct(
            id=idx,
            vector=emb,
            payload={
                "text": ayah,
                "arabic": arabic,
            },
        )
        for idx, (emb, ayah, arabic) in enumerate(zip(embeddings, ayat, arabic_texts))
    ]

    # Upload in bounded batches so that a single request does not have to
    # carry every vector at once.
    upsert_batch_size = 256
    try:
        for start in range(0, len(points), upsert_batch_size):
            client.upsert(
                collection_name="ayahs_collection",
                points=points[start:start + upsert_batch_size],
            )
    except Exception:
        # A half-filled collection would pass the existence check on the next
        # start and never be completed, so remove it before propagating.
        client.delete_collection(collection_name="ayahs_collection")
        raise
    print("✅ Embeddings uploaded to Qdrant.")
else:
    print("✅ Collection already exists, skipping upload.")