"""Look up the stored question/answer pair most similar to a query."""

from langchain_core.documents import Document  # Added import for Document
from langchain_text_splitters import CharacterTextSplitter
import pandas as pd
from sentence_transformers import SentenceTransformer, util


class rag_text_chooser:
    """Match a query against a spreadsheet of question/answer pairs.

    The spreadsheet is read with ``pandas.read_excel``. Its first column is
    used as the questions (the searchable corpus) and its second column as
    the corresponding answers; columns are selected by position, not by
    header name.
    """

    DEFAULT_MODEL_NAME = (
        'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    )
    DEFAULT_THRESHOLD = .7

    def __init__(self, data_rag, model_name=DEFAULT_MODEL_NAME):
        """Load the spreadsheet and embed every stored question.

        Args:
            data_rag: Anything accepted by ``pandas.read_excel``.
            model_name: Name passed to ``SentenceTransformer``. Defaults to
                the multilingual MiniLM paraphrase model.
        """
        self.data_rag = pd.read_excel(data_rag)
        self.corpus, self.answers = self.get_questions_Answers()
        self.model = SentenceTransformer(model_name)
        # Embed the corpus once so each query only has to encode itself.
        self.corpus_embeddings = self.model.encode(self.corpus)

    def get_questions_Answers(self):
        """Return ``(questions, answers)`` as two parallel lists.

        Questions come from the first column of ``self.data_rag`` and
        answers from the second, in row order.
        """
        questions = self.data_rag.iloc[:, 0].tolist()
        answers = self.data_rag.iloc[:, 1].tolist()
        return questions, answers

    def get_relevant_question(self, query, threshold=DEFAULT_THRESHOLD):
        """Return a formatted best match for ``query``, or ``None``.

        Args:
            query: Text to compare against the stored questions.
            threshold: Minimum cosine similarity the best match must reach.
                Defaults to 0.7.

        Returns:
            A string holding the matched question, its answer and the
            similarity score, or ``None`` when the corpus is empty or the
            best score is below ``threshold``.
        """
        # Nothing to compare against; report "no match" instead of passing
        # an empty embedding set on to the similarity computation.
        if not self.corpus:
            return None

        query_embedding = self.model.encode(query)
        cos_scores = util.cos_sim(query_embedding, self.corpus_embeddings)[0]

        # Only the highest score can decide the outcome: if it is below the
        # threshold, every other score is too, so no full ranking is needed.
        best_idx = cos_scores.argmax()
        best_score = cos_scores[best_idx]
        if best_score < threshold:
            # The closest stored question is still too dissimilar.
            return None

        return (f"Questions {self.corpus[best_idx]} \n Answer {self.answers[best_idx]} \n (score: {best_score:.4f})")