# ErayNet-nirig / search.py
# haajidheere's picture
# Add search.py
# a3a45aa verified
import numpy as np
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import os
# Directory that holds the precomputed search artifacts read below.
MODEL_DIR = "ai_model"

# Sentence encoder used to embed incoming queries; loaded once at import time.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Searchable rows and their precomputed embeddings.
# NOTE(review): search() selects rows of ``df`` using positions computed from
# ``embeddings``, so both presumably share the same row order -- confirm.
# Paths are joined with os.path.join rather than a hard-coded "/" separator.
df = pd.read_csv(os.path.join(MODEL_DIR, "search_data.csv"))
embeddings = np.load(os.path.join(MODEL_DIR, "embeddings.npy"))
def get_confidence_label(score):
    """Return the confidence bucket for a similarity score.

    Args:
        score: Numeric similarity score.

    Returns:
        ``"high"`` when ``score >= 0.7``, ``"medium"`` when ``score >= 0.5``,
        otherwise ``"low"``.
    """
    # Cutoffs are checked from the strictest down; the first one met wins.
    for cutoff, label in ((0.7, "high"), (0.5, "medium")):
        if score >= cutoff:
            return label
    return "low"
def search(query, top_k=5, threshold=0.3):
    """Semantic search with confidence scores.

    Lower-cases ``query``, encodes it with the module-level ``model``, scores
    it against the module-level ``embeddings`` by cosine similarity, and
    returns the best-scoring rows of the module-level ``df``.

    Args:
        query: Search text; must provide ``.lower()``.
        top_k: Maximum number of rows to return. Values ``<= 0`` produce an
            empty result.
        threshold: Minimum similarity score a row must reach to be returned.

    Returns:
        A ``pandas.DataFrame`` with the columns ``rank``, ``somali``,
        ``english``, ``italian``, ``domain``, ``similarity_score`` (rounded
        to 4 decimals) and ``confidence_label``, ordered best match first.
        The frame is empty (same columns) when no row reaches ``threshold``
        or when ``top_k <= 0``.
    """
    columns = [
        "rank",
        "somali",
        "english",
        "italian",
        "domain",
        "similarity_score",
        "confidence_label",
    ]

    if top_k <= 0:
        # Without this guard the slice ``[-top_k:]`` below becomes ``[-0:]``
        # for top_k == 0 (selecting *every* candidate) and ``[n:]`` for a
        # negative top_k (dropping the best candidates).
        return pd.DataFrame(columns=columns)

    q_emb = model.encode([query.lower()], normalize_embeddings=True)[0]
    scores = util.cos_sim(q_emb, embeddings)[0].numpy()

    # Positions of all rows that meet the similarity threshold.
    valid_idx = np.where(scores >= threshold)[0]
    if len(valid_idx) == 0:
        return pd.DataFrame(columns=columns)

    # argsort is ascending: keep the last ``top_k`` entries, then reverse so
    # the highest score comes first.
    order = np.argsort(scores[valid_idx])[-top_k:][::-1]
    top_idx = valid_idx[order]
    top_scores = scores[top_idx]

    # Assumes row i of ``embeddings`` describes row i of ``df`` -- TODO confirm.
    results = df.iloc[top_idx][["somali", "english", "italian", "domain"]].copy()
    results["similarity_score"] = np.round(top_scores, 4)
    # Labels are derived from the unrounded scores.
    results["confidence_label"] = [get_confidence_label(s) for s in top_scores]
    results.insert(0, "rank", range(1, len(results) + 1))
    return results.reset_index(drop=True)
if __name__ == "__main__":
    # Command-line entry point: search for the first argument, or for a
    # built-in sample query when no argument is given.
    import sys

    cli_args = sys.argv[1:]
    query = cli_args[0] if cli_args else "dhaqaale"
    print(f"Searching for: {query}\n")
    results = search(query)
    # Print the result table without the DataFrame index column.
    print(results.to_string(index=False))