"""Semantic search over a precomputed embedding matrix.

At import time this module loads a multilingual sentence-transformer model,
a CSV of searchable rows and a ``.npy`` matrix of embeddings, then exposes
:func:`search` to rank rows by cosine similarity to a free-text query.
"""

import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util

MODEL_DIR = "ai_model"

# Columns copied from the CSV into every result frame.
_TEXT_COLUMNS = ["somali", "english", "italian", "domain"]
# Full column layout of the frame returned by ``search``.
_RESULT_COLUMNS = ["rank", *_TEXT_COLUMNS, "similarity_score", "confidence_label"]

# Loaded once at import; paths are resolved relative to the current
# working directory.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
df = pd.read_csv(f"{MODEL_DIR}/search_data.csv")
# NOTE(review): presumably row i of ``embeddings`` is the embedding of row i
# of ``df`` (``search`` indexes both with the same positions) -- confirm
# against the script that generated these files.
embeddings = np.load(f"{MODEL_DIR}/embeddings.npy")


def get_confidence_label(score):
    """Map a similarity score to a coarse confidence label.

    Args:
        score: Similarity score to classify.

    Returns:
        ``"high"`` for scores >= 0.7, ``"medium"`` for scores >= 0.5,
        otherwise ``"low"``.
    """
    if score >= 0.7:
        return "high"
    elif score >= 0.5:
        return "medium"
    return "low"


def _empty_results():
    """Return an empty frame with the same columns as a populated result."""
    return pd.DataFrame(columns=_RESULT_COLUMNS)


def search(query, top_k=5, threshold=0.3):
    """Semantic search with confidence scores.

    The query is lower-cased, encoded with the module-level model and
    compared against every row of ``embeddings`` by cosine similarity.

    Args:
        query: Free-text query string.
        top_k: Maximum number of rows to return. Values <= 0 yield an
            empty result.
        threshold: Minimum cosine similarity a row must reach to be
            considered a match.

    Returns:
        A ``pandas.DataFrame`` with columns ``rank``, ``somali``,
        ``english``, ``italian``, ``domain``, ``similarity_score`` (rounded
        to 4 decimals) and ``confidence_label``, ordered from best to worst
        match. The frame is empty when nothing reaches ``threshold``.
    """
    # Guard before slicing: ``arr[-0:]`` is the whole array, so top_k == 0
    # would otherwise return every match, and a negative top_k would drop
    # the lowest-scoring matches instead of limiting the count.
    if top_k <= 0:
        return _empty_results()

    q_emb = model.encode([query.lower()], normalize_embeddings=True)[0]
    scores = util.cos_sim(q_emb, embeddings)[0].numpy()

    valid_idx = np.where(scores >= threshold)[0]
    if valid_idx.size == 0:
        return _empty_results()

    # argsort is ascending: keep the last top_k positions, then reverse so
    # the best match comes first.
    order = np.argsort(scores[valid_idx])[-top_k:][::-1]
    top_idx = valid_idx[order]
    top_scores = scores[top_idx]

    results = df.iloc[top_idx][_TEXT_COLUMNS].copy()
    results["similarity_score"] = np.round(top_scores, 4)
    # Labels are derived from the unrounded scores.
    results["confidence_label"] = [get_confidence_label(s) for s in top_scores]
    results.insert(0, "rank", range(1, len(results) + 1))
    return results.reset_index(drop=True)


if __name__ == "__main__":
    import sys

    query = sys.argv[1] if len(sys.argv) > 1 else "dhaqaale"
    print(f"Searching for: {query}\n")
    results = search(query)
    print(results.to_string(index=False))