MossaabDev committed on
Commit
7a1f92c
·
verified ·
1 Parent(s): 64ce75f

Update app/data_loader.py

Browse files
Files changed (1) hide show
  1. app/data_loader.py +22 -11
app/data_loader.py CHANGED
@@ -1,34 +1,43 @@
1
  import pandas as pd
2
  from sentence_transformers import SentenceTransformer
3
  import torch
4
- from app.utils import remove_numbers
5
  from app.qdrant_client import client
6
  from qdrant_client.http import models
7
  from pympler import asizeof
 
8
  print("Loading model and data...")
9
 
 
10
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
11
  print(f"Using device: {device}")
12
 
 
13
  model = SentenceTransformer("app/my_finetuned_modelV2", device=device)
14
- print("model size : ", asizeof.asizeof(model))
15
- df = pd.read_csv("app/data/cleaned_fileV2.csv")
 
 
16
 
17
- df['answer'] = df['answer'].apply(remove_numbers)
18
- ayat = list(set(df['answer']))
19
- print(f"Total unique ayat loaded: {asizeof.asizeof(ayat)}")
20
 
21
- print("✅ Model and embeddings ready.")
 
 
 
22
 
 
23
 
24
  # --- Check if collection exists ---
25
  collections = [c.name for c in client.get_collections().collections]
26
  if "ayahs_collection" not in collections:
27
  print("Creating Qdrant collection and uploading embeddings...")
28
- embeddings = model.encode(ayat, convert_to_tensor=False).tolist()
 
 
29
  client.recreate_collection(
30
  collection_name="ayahs_collection",
31
-
32
  vectors_config=models.VectorParams(
33
  size=len(embeddings[0]),
34
  distance=models.Distance.COSINE
@@ -39,7 +48,10 @@ if "ayahs_collection" not in collections:
39
  models.PointStruct(
40
  id=idx,
41
  vector=emb,
42
- payload={"text": ayah}
 
 
 
43
  )
44
  for idx, (emb, ayah) in enumerate(zip(embeddings, ayat))
45
  ]
@@ -48,4 +60,3 @@ if "ayahs_collection" not in collections:
48
  print("✅ Embeddings uploaded to Qdrant.")
49
  else:
50
  print("✅ Collection already exists, skipping upload.")
51
- # Load embeddings from Qdrant
 
1
  import pandas as pd
2
  from sentence_transformers import SentenceTransformer
3
  import torch
 
4
  from app.qdrant_client import client
5
  from qdrant_client.http import models
6
  from pympler import asizeof
7
+
8
  print("Loading model and data...")
9
 
10
+ # --- Setup device ---
11
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
12
  print(f"Using device: {device}")
13
 
14
+ # --- Load model ---
15
  model = SentenceTransformer("app/my_finetuned_modelV2", device=device)
16
+ print("Model size:", asizeof.asizeof(model))
17
+
18
+ # --- Load ayahs from ayas.csv ---
19
+ df = pd.read_csv("app/data/ayas.csv", encoding="utf-8")
20
 
21
+ # Ensure expected columns
22
+ if not {'answers', 'arabic'}.issubset(df.columns):
23
+ raise ValueError(" 'ayas.csv' must contain 'answers' and 'arabic' columns.")
24
 
25
+ # Remove duplicates and NaN
26
+ df = df.dropna(subset=['answers', 'arabic']).drop_duplicates(subset=['answers'])
27
+ ayat = df['answers'].tolist()
28
+ print(f"Total unique ayat loaded: {len(ayat)}")
29
 
30
+ print("✅ Model and data ready.")
31
 
32
  # --- Check if collection exists ---
33
  collections = [c.name for c in client.get_collections().collections]
34
  if "ayahs_collection" not in collections:
35
  print("Creating Qdrant collection and uploading embeddings...")
36
+
37
+ embeddings = model.encode(ayat, convert_to_tensor=False, show_progress_bar=True).tolist()
38
+
39
  client.recreate_collection(
40
  collection_name="ayahs_collection",
 
41
  vectors_config=models.VectorParams(
42
  size=len(embeddings[0]),
43
  distance=models.Distance.COSINE
 
48
  models.PointStruct(
49
  id=idx,
50
  vector=emb,
51
+ payload={
52
+ "text": ayah,
53
+ "arabic": df.iloc[idx]['arabic']
54
+ }
55
  )
56
  for idx, (emb, ayah) in enumerate(zip(embeddings, ayat))
57
  ]
 
60
  print("✅ Embeddings uploaded to Qdrant.")
61
  else:
62
  print("✅ Collection already exists, skipping upload.")