File size: 1,518 Bytes
32a7233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d0a7da
32a7233
8b208f1
81e5d6f
32a7233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document # Added import for Document
from sentence_transformers import SentenceTransformer, util
import pandas as pd

class rag_text_chooser:
    """Pick the stored question/answer pair most similar to a user query.

    The question/answer pairs are loaded from an Excel sheet whose first
    column is treated as the question and whose second column is treated as
    the answer. Every question is embedded once at construction time; each
    lookup then embeds only the query and compares it against the cached
    question embeddings with cosine similarity.
    """

    # Sentence-embedding model loaded when the caller does not supply one.
    DEFAULT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    # Minimum cosine similarity for the best candidate to count as a match.
    DEFAULT_THRESHOLD = 0.7

    def __init__(self, data_rag, model_name=DEFAULT_MODEL_NAME):
        """Load the Q/A sheet and pre-compute the question embeddings.

        Args:
            data_rag: Anything ``pandas.read_excel`` accepts as its first
                argument; the sheet supplies the question/answer pairs.
            model_name: Name of the sentence-transformers model to load.
                Defaults to ``DEFAULT_MODEL_NAME``.
        """
        self.data_rag = pd.read_excel(data_rag)

        self.corpus, self.answers = self.get_questions_Answers()
        self.model = SentenceTransformer(model_name)
        # Embed the questions once so each lookup only has to embed the query.
        self.corpus_embeddings = self.model.encode(self.corpus)

    def get_questions_Answers(self):
        """Return the questions and answers held in ``self.data_rag``.

        Rows in which either of the first two cells is missing are skipped:
        a blank question cannot be embedded as text and a blank answer gives
        the caller nothing to show. Questions are converted to ``str`` so
        that numeric cells can still be passed to the encoder.

        Returns:
            A ``(questions, answers)`` tuple of two lists of equal length,
            where ``answers[i]`` belongs to ``questions[i]``.
        """
        question_column = self.data_rag.iloc[:, 0]
        answer_column = self.data_rag.iloc[:, 1]

        # Keep only the rows where both cells are filled in.
        complete_rows = question_column.notna() & answer_column.notna()

        questions = question_column[complete_rows].astype(str).tolist()
        answers = answer_column[complete_rows].tolist()
        return questions, answers

    def get_relevant_question(self, query, threshold=DEFAULT_THRESHOLD):
        """Return the best-matching stored Q/A pair for ``query``.

        Args:
            query: Text to look up.
            threshold: Minimum cosine similarity the best candidate must
                reach. Defaults to ``DEFAULT_THRESHOLD``.

        Returns:
            A formatted string containing the matched question, its answer
            and the similarity score, or ``None`` when the corpus is empty
            or the best score is below ``threshold``.
        """
        # Nothing to compare against, so skip the encoder entirely.
        if not self.corpus:
            return None

        query_embedding = self.model.encode(query)

        # cos_sim returns a (1, corpus_size) matrix; row 0 is this query.
        cos_scores = util.cos_sim(query_embedding, self.corpus_embeddings)[0]

        # Only the single best candidate is ever reported, so argmax is
        # sufficient; a full sort of the scores is not needed.
        best_idx = int(cos_scores.argmax())
        best_score = cos_scores[best_idx]

        # Even the closest question is too dissimilar to be relevant.
        if best_score < threshold:
            return None

        return (f"Questions {self.corpus[best_idx]} \n Answer {self.answers[best_idx]} \n (score: {best_score:.4f})")