"""Semantic ayah search: query Qdrant, then map English hits back to Arabic."""

import re
from functools import lru_cache

import pandas as pd
from pympler import asizeof
from sklearn.metrics.pairwise import cosine_similarity

from app.data_loader import model
from app.qdrant_client import client

# Load the ayah table once at import time.
ayat_arabic = pd.read_csv("app/data/ayas.csv", encoding="utf-8")
ayat_arabic_list = ayat_arabic['arabic'].tolist()
# The English text is read from the 'answers' column of the CSV.
ayat_english_list = ayat_arabic['answers'].tolist()

# Compiled once: everything except ASCII letters, digits and spaces.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9 ]')


def normalize(s: str) -> str:
    """Return *s* reduced to a comparison key.

    Removes every character that is not an ASCII letter, digit or space,
    lower-cases the remainder and strips leading/trailing whitespace.
    """
    return _NON_ALNUM_RE.sub('', s).lower().strip()


@lru_cache(maxsize=1)
def _ayah_lookup() -> dict:
    """Build (once, on first use) a map of normalized English -> (arabic, link).

    Only the first row for each normalized key is kept, which is the same
    row a DataFrame filter followed by ``.iloc[0]`` would select.
    """
    normalized = ayat_arabic['answers'].apply(normalize)
    lookup = {}
    rows = zip(normalized, ayat_arabic['arabic'], ayat_arabic['link'])
    for key, arabic, link in rows:
        lookup.setdefault(key, (arabic, link))
    return lookup


def find_top_5_ayahs_qdrant(question: str) -> list:
    """Return the 5 nearest ayahs to *question* from the Qdrant collection.

    The question is embedded with ``model.encode`` and searched against
    ``ayahs_collection``. Each hit's ``payload['text']`` is matched against
    the normalized 'answers' column of the CSV to recover the Arabic text
    and link; placeholder values are used when no row matches.

    Args:
        question: The user's query text.

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``arabic`` and
        ``link``, one per search hit, in the order Qdrant returned them.
    """
    q_emb = model.encode(question).tolist()
    search_result = client.search(
        collection_name="ayahs_collection",
        query_vector=q_emb,
        limit=5,
    )

    # Built once and cached, instead of copying and re-normalizing the whole
    # DataFrame for every hit.
    lookup = _ayah_lookup()

    results = []
    for point in search_result:
        english_ayah = point.payload['text']
        # Call strip(); formatting the bare attribute printed the method repr.
        print(english_ayah.strip())

        # Try to find the Arabic equivalent by matching the English verse.
        arabic_ayah, link = lookup.get(
            normalize(english_ayah),
            ("❌ Not found", "https://quran.com"),
        )

        results.append({
            "question": question,
            "answer": english_ayah,
            "arabic": arabic_ayah,
            "link": link,
        })

    print("results size:", asizeof.asizeof(results))
    return results