haajidheere committed on
Commit
38e7930
·
verified ·
1 Parent(s): 76eb344

Add build_embeddings.py

Browse files
Files changed (1) hide show
  1. build_embeddings.py +56 -0
build_embeddings.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build sentence embeddings for the abbreviation search index.

Reads the cleaned abbreviation table from ``DATA_PATH``, derives a coarse
``domain`` label for each row when the table has no ``domain`` column,
embeds a lower-cased concatenation of the Somali / English / Italian text
plus the domain, and writes the embedding matrix and the augmented table
into ``OUTPUT_DIR``.
"""

import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

DATA_PATH = "data/cleaned/abbreviations.csv"
OUTPUT_DIR = "ai_model"

# Free-text columns that feed both domain inference and the search text.
TEXT_COLUMNS = ("somali", "english", "italian")

# (domain, keywords) pairs, checked in order: the first domain with a keyword
# that occurs as a substring of the row text wins. Every keyword MUST be
# lower-case because it is compared against lower-cased text.
DOMAIN_KEYWORDS = (
    ("Medical", (
        'medicine', 'medical', 'disease', 'health', 'doctor', 'hospital',
        'clinic', 'treatment', 'patient', 'diagnosis', 'therapy', 'pharma',
        'drug', 'medic', 'caafimaad', 'daktari', 'bukaan',
    )),
    ("Legal", (
        'law', 'legal', 'court', 'judge', 'legislation', 'statute',
        'contract', 'rights', 'crime', 'offense', 'prosecution', 'defense',
        'lawyer', 'sharciga', 'qodob', 'xeer',
    )),
    ("Science", (
        'biology', 'botany', 'physics', 'chemistry', 'science', 'astronomy',
        'zoology', 'meteorology', 'agriculture', 'technology', 'math',
        'computer', 'environment',
    )),
    ("Religious", (
        'religion', 'god', 'islam', 'christian', 'church', 'prayer', 'faith',
        'diin', 'iimaan', 'masjid',
    )),
)


def infer_domain(row):
    """Return a coarse domain label for one table row.

    Args:
        row: Any mapping-like object with a ``get`` method (a pandas row or
            a plain dict). Missing text columns are treated as empty.

    Returns:
        ``'Medical'``, ``'Legal'``, ``'Science'`` or ``'Religious'`` for the
        first domain (in that order) with a keyword contained in the row's
        lower-cased text, otherwise ``'General'``.
    """
    text = " ".join(str(row.get(col, '')) for col in TEXT_COLUMNS).lower()
    for domain, keywords in DOMAIN_KEYWORDS:
        if any(kw in text for kw in keywords):
            return domain
    return 'General'


os.makedirs(OUTPUT_DIR, exist_ok=True)

df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df)} entries")

# Fail early with a clear message; otherwise the run only breaks after the
# model has been loaded, inside the normalisation step.
if df.empty:
    raise ValueError(f"No rows found in {DATA_PATH}; nothing to embed")

# Clean the text columns BEFORE inferring domains so that missing cells are
# empty strings rather than the literal text "nan", and so that `.str`
# accessors below work even if pandas parsed a column as numeric.
for col in TEXT_COLUMNS:
    if col not in df.columns:
        # infer_domain already tolerates a missing column; keep the rest of
        # the pipeline consistent instead of raising KeyError further down.
        df[col] = ''
    df[col] = df[col].fillna('').astype(str)

if 'domain' not in df.columns:
    df['domain'] = df.apply(infer_domain, axis=1)
df['domain'] = df['domain'].fillna('').astype(str)

df["search_text"] = (
    df["somali"].str.lower() + " " +
    df["english"].str.lower() + " " +
    df["italian"].str.lower() + " " +
    df["domain"].str.lower()
).str.strip()

print("Loading multilingual model...")
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

print("Generating embeddings...")
embeddings = model.encode(df["search_text"].tolist(), normalize_embeddings=True)
# NOTE(review): encode() was already asked for unit-length rows; this second
# L2 pass should be a no-op and is kept from the original as a safeguard.
embeddings = normalize(embeddings, axis=1, norm='l2')

np.save(os.path.join(OUTPUT_DIR, "embeddings.npy"), embeddings)
df.to_csv(os.path.join(OUTPUT_DIR, "search_data.csv"), index=False)

print(f"Embeddings created: {embeddings.shape}")
print(f"Saved to {OUTPUT_DIR}/")