shifaa_api / app /rag.py
MossaabDev's picture
Update app/rag.py
9823c9b verified
from app.data_loader import model
from sklearn.metrics.pairwise import cosine_similarity
from app.qdrant_client import client
import pandas as pd
from pympler import asizeof
import re
# Load the ayah table once, at import time. The path is relative, so it is
# resolved against the process's current working directory.
# Columns read elsewhere in this file: 'arabic', 'answers' and 'link'.
ayat_arabic = pd.read_csv("app/data/ayas.csv", encoding="utf-8")
ayat_arabic_list = ayat_arabic['arabic'].tolist()
ayat_english_list = ayat_arabic['answers'].tolist() # the English text is read from the 'answers' column, not an 'english' column
def normalize(s: str):
    """Return a loose comparison key for *s*.

    Every character other than an ASCII letter, an ASCII digit or a plain
    space is dropped (so punctuation, newlines and non-Latin text vanish),
    the remainder is lower-cased, and leading/trailing spaces are trimmed.
    Runs of interior spaces are left as they are.
    """
    ascii_only = re.sub(r'[^a-zA-Z0-9 ]', '', s)
    lowered = ascii_only.lower()
    return lowered.strip()
def find_top_5_ayahs_qdrant(question: str):
    """Return up to five ayahs from Qdrant that best match *question*.

    The question is embedded with ``model.encode`` and used as the query
    vector against the ``"ayahs_collection"`` collection. For every hit, the
    English text stored in the point payload is matched (after
    ``normalize``) against the ``'answers'`` column of the module-level
    ``ayat_arabic`` table to recover the Arabic text and the link.

    Args:
        question: Free-text user question to embed and search for.

    Returns:
        A list of dicts, one per search hit, in the order Qdrant returned
        them. Each dict has the keys ``"question"`` (the input, unchanged),
        ``"answer"`` (payload text), ``"arabic"`` and ``"link"``. When no
        table row matches, ``"arabic"`` is ``"❌ Not found"`` and ``"link"``
        is ``"https://quran.com"``.

    Raises:
        KeyError: If a returned point's payload has no ``'text'`` entry.
    """
    q_emb = model.encode(question).tolist()
    search_result = client.search(
        collection_name="ayahs_collection",
        query_vector=q_emb,
        limit=5,
    )

    # The normalized English column does not depend on the individual hit,
    # so build it once per call instead of copying and re-normalizing the
    # whole table for every search result.
    normalized_answers = ayat_arabic['answers'].apply(normalize)

    results = []
    for point in search_result:
        english_ayah = point.payload['text']
        # Debug output: call strip() so the verse text is printed rather than
        # the repr of the bound method.
        print(f"{english_ayah.strip()}")

        # Try to find the Arabic equivalent by matching the English verse.
        match = ayat_arabic[normalized_answers == normalize(english_ayah)]
        if match.empty:
            arabic_ayah = "❌ Not found"
            link = "https://quran.com"
        else:
            # Several rows may normalize to the same key; keep the first.
            first_row = match.iloc[0]
            arabic_ayah = first_row['arabic']
            link = first_row['link']

        results.append({
            "question": question,
            "answer": english_ayah,
            "arabic": arabic_ayah,
            "link": link,
        })

    print("results size:", asizeof.asizeof(results))
    return results