Spaces:
Runtime error
Runtime error
Update app/data_loader.py
Browse files
- app/data_loader.py +22 -11
app/data_loader.py
CHANGED
|
@@ -1,34 +1,43 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
from sentence_transformers import SentenceTransformer
|
| 3 |
import torch
|
| 4 |
-
from app.utils import remove_numbers
|
| 5 |
from app.qdrant_client import client
|
| 6 |
from qdrant_client.http import models
|
| 7 |
from pympler import asizeof
|
|
|
|
| 8 |
print("Loading model and data...")
|
| 9 |
|
|
|
|
| 10 |
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 11 |
print(f"Using device: {device}")
|
| 12 |
|
|
|
|
| 13 |
model = SentenceTransformer("app/my_finetuned_modelV2", device=device)
|
| 14 |
-
print("
|
| 15 |
-
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
| 22 |
|
|
|
|
| 23 |
|
| 24 |
# --- Check if collection exists ---
|
| 25 |
collections = [c.name for c in client.get_collections().collections]
|
| 26 |
if "ayahs_collection" not in collections:
|
| 27 |
print("Creating Qdrant collection and uploading embeddings...")
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
client.recreate_collection(
|
| 30 |
collection_name="ayahs_collection",
|
| 31 |
-
|
| 32 |
vectors_config=models.VectorParams(
|
| 33 |
size=len(embeddings[0]),
|
| 34 |
distance=models.Distance.COSINE
|
|
@@ -39,7 +48,10 @@ if "ayahs_collection" not in collections:
|
|
| 39 |
models.PointStruct(
|
| 40 |
id=idx,
|
| 41 |
vector=emb,
|
| 42 |
-
payload={
|
|
|
|
|
|
|
|
|
|
| 43 |
)
|
| 44 |
for idx, (emb, ayah) in enumerate(zip(embeddings, ayat))
|
| 45 |
]
|
|
@@ -48,4 +60,3 @@ if "ayahs_collection" not in collections:
|
|
| 48 |
print("✅ Embeddings uploaded to Qdrant.")
|
| 49 |
else:
|
| 50 |
print("✅ Collection already exists, skipping upload.")
|
| 51 |
-
# Load embeddings from Qdrant
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
from sentence_transformers import SentenceTransformer
|
| 3 |
import torch
|
|
|
|
| 4 |
from app.qdrant_client import client
|
| 5 |
from qdrant_client.http import models
|
| 6 |
from pympler import asizeof
|
| 7 |
+
|
| 8 |
print("Loading model and data...")
|
| 9 |
|
| 10 |
+
# --- Setup device ---
|
| 11 |
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 12 |
print(f"Using device: {device}")
|
| 13 |
|
| 14 |
+
# --- Load model ---
|
| 15 |
model = SentenceTransformer("app/my_finetuned_modelV2", device=device)
|
| 16 |
+
print("Model size:", asizeof.asizeof(model))
|
| 17 |
+
|
| 18 |
+
# --- Load ayahs from ayas.csv ---
|
| 19 |
+
df = pd.read_csv("app/data/ayas.csv", encoding="utf-8")
|
| 20 |
|
| 21 |
+
# Ensure expected columns
|
| 22 |
+
if not {'answers', 'arabic'}.issubset(df.columns):
|
| 23 |
+
raise ValueError("❌ 'ayas.csv' must contain 'answers' and 'arabic' columns.")
|
| 24 |
|
| 25 |
+
# Remove duplicates and NaN
|
| 26 |
+
df = df.dropna(subset=['answers', 'arabic']).drop_duplicates(subset=['answers'])
|
| 27 |
+
ayat = df['answers'].tolist()
|
| 28 |
+
print(f"Total unique ayat loaded: {len(ayat)}")
|
| 29 |
|
| 30 |
+
print("✅ Model and data ready.")
|
| 31 |
|
| 32 |
# --- Check if collection exists ---
|
| 33 |
collections = [c.name for c in client.get_collections().collections]
|
| 34 |
if "ayahs_collection" not in collections:
|
| 35 |
print("Creating Qdrant collection and uploading embeddings...")
|
| 36 |
+
|
| 37 |
+
embeddings = model.encode(ayat, convert_to_tensor=False, show_progress_bar=True).tolist()
|
| 38 |
+
|
| 39 |
client.recreate_collection(
|
| 40 |
collection_name="ayahs_collection",
|
|
|
|
| 41 |
vectors_config=models.VectorParams(
|
| 42 |
size=len(embeddings[0]),
|
| 43 |
distance=models.Distance.COSINE
|
|
|
|
| 48 |
models.PointStruct(
|
| 49 |
id=idx,
|
| 50 |
vector=emb,
|
| 51 |
+
payload={
|
| 52 |
+
"text": ayah,
|
| 53 |
+
"arabic": df.iloc[idx]['arabic']
|
| 54 |
+
}
|
| 55 |
)
|
| 56 |
for idx, (emb, ayah) in enumerate(zip(embeddings, ayat))
|
| 57 |
]
|
|
|
|
| 60 |
print("✅ Embeddings uploaded to Qdrant.")
|
| 61 |
else:
|
| 62 |
print("✅ Collection already exists, skipping upload.")
|
|
|