haajidheere committed on
Commit
a3a45aa
·
verified ·
1 Parent(s): 38e7930

Add search.py

Browse files
Files changed (1) hide show
  1. search.py +44 -0
search.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sentence_transformers import SentenceTransformer, util
3
+ import pandas as pd
4
+ import os
5
+
6
# Directory that holds the search artifacts loaded below.
MODEL_DIR = "ai_model"

# Sentence encoder used by search() to embed each incoming query.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Result table and embedding matrix. search() looks rows of `df` up by the
# positions it selects from `embeddings`, so row i of one is expected to
# correspond to row i of the other.
df = pd.read_csv(MODEL_DIR + "/search_data.csv")
embeddings = np.load(MODEL_DIR + "/embeddings.npy")
11
+
12
def get_confidence_label(score, high=0.7, medium=0.5):
    """Map a similarity score to a coarse confidence label.

    Args:
        score: Similarity score to classify.
        high: Inclusive lower bound for the "high" label. Defaults to 0.7.
        medium: Inclusive lower bound for the "medium" label. Defaults
            to 0.5.

    Returns:
        "high" when ``score >= high``, "medium" when ``score >= medium``,
        otherwise "low".
    """
    if score >= high:
        return "high"
    if score >= medium:
        return "medium"
    return "low"
18
+
19
def search(query, top_k=5, threshold=0.3):
    """Run a semantic search and return the best matches with confidence.

    The query is lower-cased, encoded with the module-level ``model`` and
    compared by cosine similarity against the module-level ``embeddings``.
    Rows scoring below ``threshold`` are discarded; the highest-scoring
    ``top_k`` of the remaining rows are returned, best match first.

    Args:
        query: Search text.
        top_k: Maximum number of rows to return. Zero or a negative value
            yields an empty result.
        threshold: Inclusive minimum cosine similarity a row must reach.

    Returns:
        A ``pandas.DataFrame`` with the columns ``rank``, ``somali``,
        ``english``, ``italian``, ``domain``, ``similarity_score`` (rounded
        to 4 decimals) and ``confidence_label``. The frame is empty, with
        the same columns, when nothing qualifies.
    """
    source_columns = ["somali", "english", "italian", "domain"]
    result_columns = ["rank", *source_columns, "similarity_score", "confidence_label"]

    # A non-positive top_k must select nothing. Slicing with ``[-top_k:]``
    # would turn top_k=0 into ``[-0:]`` and return every match instead.
    if top_k <= 0:
        return pd.DataFrame(columns=result_columns)

    q_emb = model.encode([query.lower()], normalize_embeddings=True)[0]
    scores = util.cos_sim(q_emb, embeddings)[0].numpy()

    valid_idx = np.flatnonzero(scores >= threshold)
    if valid_idx.size == 0:
        return pd.DataFrame(columns=result_columns)

    # Stable descending sort: equal scores keep their original row order,
    # so repeated calls rank ties the same way.
    order = np.argsort(-scores[valid_idx], kind="stable")[:top_k]
    top_idx = valid_idx[order]
    top_scores = scores[top_idx]

    results = df.iloc[top_idx][source_columns].copy()
    results["similarity_score"] = np.round(top_scores, 4)
    # Labels are derived from the unrounded scores.
    results["confidence_label"] = [get_confidence_label(s) for s in top_scores]
    results.insert(0, "rank", range(1, len(results) + 1))
    return results.reset_index(drop=True)
38
+
39
if __name__ == "__main__":
    import sys

    # Use the first command-line argument as the query, or fall back to
    # the built-in sample query when none is given.
    cli_args = sys.argv[1:]
    if cli_args:
        query = cli_args[0]
    else:
        query = "dhaqaale"

    print(f"Searching for: {query}\n")
    results = search(query)
    # Print the result table without the DataFrame index column.
    print(results.to_string(index=False))