MossaabDev committed on
Commit
38bac22
·
verified ·
1 Parent(s): 022e20f

Update app/data_loader.py

Browse files
Files changed (1) hide show
  1. app/data_loader.py +51 -51
app/data_loader.py CHANGED
@@ -1,51 +1,51 @@
1
- import pandas as pd
2
- from sentence_transformers import SentenceTransformer
3
- import torch
4
- from app.utils import remove_numbers
5
- from app.qdrant_client import client
6
- from qdrant_client.http import models
7
- from pympler import asizeof
8
- print("Loading model and data...")
9
-
10
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
11
- print(f"Using device: {device}")
12
-
13
- model = SentenceTransformer("app/fine-tuned-sentence-transformer-my-dataset", device=device)
14
- print("model size : ", asizeof.asizeof(model))
15
- df = pd.read_csv("app/data/cleaned_file.csv")
16
-
17
- df['answer'] = df['answer'].apply(remove_numbers)
18
- ayat = list(set(df['answer']))
19
- print(f"Total unique ayat loaded: {asizeof.asizeof(ayat)}")
20
-
21
- print("✅ Model and embeddings ready.")
22
-
23
-
24
- # --- Check if collection exists ---
25
- collections = [c.name for c in client.get_collections().collections]
26
- if "ayahs_collection" not in collections:
27
- print("Creating Qdrant collection and uploading embeddings...")
28
- embeddings = model.encode(ayat, convert_to_tensor=False).tolist()
29
- client.recreate_collection(
30
- collection_name="ayahs_collection",
31
-
32
- vectors_config=models.VectorParams(
33
- size=len(embeddings[0]),
34
- distance=models.Distance.COSINE
35
- ),
36
- )
37
-
38
- points = [
39
- models.PointStruct(
40
- id=idx,
41
- vector=emb,
42
- payload={"text": ayah}
43
- )
44
- for idx, (emb, ayah) in enumerate(zip(embeddings, ayat))
45
- ]
46
-
47
- client.upsert(collection_name="ayahs_collection", points=points)
48
- print("✅ Embeddings uploaded to Qdrant.")
49
- else:
50
- print("✅ Collection already exists, skipping upload.")
51
- # Load embeddings from Qdrant
 
1
+ import pandas as pd
2
+ from sentence_transformers import SentenceTransformer
3
+ import torch
4
+ from app.utils import remove_numbers
5
+ from app.qdrant_client import client
6
+ from qdrant_client.http import models
7
+ from pympler import asizeof
8
+ print("Loading model and data...")
9
+
10
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
11
+ print(f"Using device: {device}")
12
+
13
+ model = SentenceTransformer("MossaabDev/quran_embed_V2", device=device)
14
+ print("model size : ", asizeof.asizeof(model))
15
+ df = pd.read_csv("app/data/cleaned_file.csv")
16
+
17
+ df['answer'] = df['answer'].apply(remove_numbers)
18
+ ayat = list(set(df['answer']))
19
+ print(f"Total unique ayat loaded: {asizeof.asizeof(ayat)}")
20
+
21
+ print("✅ Model and embeddings ready.")
22
+
23
+
24
+ # --- Check if collection exists ---
25
+ collections = [c.name for c in client.get_collections().collections]
26
+ if "ayahs_collection" not in collections:
27
+ print("Creating Qdrant collection and uploading embeddings...")
28
+ embeddings = model.encode(ayat, convert_to_tensor=False).tolist()
29
+ client.recreate_collection(
30
+ collection_name="ayahs_collection",
31
+
32
+ vectors_config=models.VectorParams(
33
+ size=len(embeddings[0]),
34
+ distance=models.Distance.COSINE
35
+ ),
36
+ )
37
+
38
+ points = [
39
+ models.PointStruct(
40
+ id=idx,
41
+ vector=emb,
42
+ payload={"text": ayah}
43
+ )
44
+ for idx, (emb, ayah) in enumerate(zip(embeddings, ayat))
45
+ ]
46
+
47
+ client.upsert(collection_name="ayahs_collection", points=points)
48
+ print("✅ Embeddings uploaded to Qdrant.")
49
+ else:
50
+ print("✅ Collection already exists, skipping upload.")
51
+ # Load embeddings from Qdrant