File size: 1,982 Bytes
9ddeec6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import PyPDF2
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


class pdf_query:
    """Semantic question-answering over a PDF document.

    Pipeline: extract text with PyPDF2, split it into sentence-aligned
    chunks, embed the chunks with a SentenceTransformer, then answer a
    question by cosine similarity between the question embedding and each
    chunk embedding.

    Note: the class/method names (``pdf_query``, ``creat_model``) are kept
    as-is for backward compatibility with existing callers.
    """

    def __init__(self):
        # Single shared embedding model; creat_model() reuses this instance
        # so the pretrained weights are only loaded once per object.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # PyPDF2.PdfReader for the loaded document; set by file().
        self.read = None

    def file(self, file):
        """Load *file* (a path or a binary file object) as the active PDF."""
        self.read = PyPDF2.PdfReader(file)

    def extract_text(self):
        """Return the concatenated text of all pages, separated by newlines.

        Pages with no extractable text (e.g. image-only pages, where
        ``extract_text()`` yields None or "") are skipped.

        Raises:
            ValueError: if no PDF has been loaded via file().
        """
        if self.read is None:
            raise ValueError("No PDF loaded; call file() first")
        pieces = []
        for page in self.read.pages:
            content = page.extract_text()
            if content:
                pieces.append(content)
        # join + strip instead of quadratic += concatenation
        return "\n".join(pieces).strip()

    def split_into_chunks(self, text, chunk_size=300):
        """Split *text* into chunks of roughly *chunk_size* characters.

        Splitting happens at sentence boundaries (after ., ! or ?) so chunks
        stay readable. A single sentence longer than chunk_size becomes its
        own (oversized) chunk. Empty input yields an empty list.

        Returns:
            list[str]: non-empty, whitespace-stripped chunks.
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += sentence + " "
            else:
                # Only flush a non-empty buffer: previously an empty string
                # chunk was emitted whenever the very first sentence already
                # exceeded chunk_size.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
        # Guard with strip() so empty/whitespace-only input produces [].
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def creat_model(self, chunks):
        """Embed *chunks* and return ``(model, chunk_embeddings)``.

        Reuses ``self.model`` instead of loading a second copy of the same
        pretrained weights (same model name, identical embeddings).
        """
        chunk_embeddings = self.model.encode(chunks)
        return self.model, chunk_embeddings

    def answer_question(self, question, chunks, chunk_embeddings, model, threshold=0.6):
        """Return the chunk most similar to *question*, if similar enough.

        Args:
            question: natural-language query string.
            chunks: the text chunks previously embedded.
            chunk_embeddings: embeddings aligned index-for-index with chunks.
            model: the SAME SentenceTransformer used to embed the chunks.
            threshold: minimum cosine similarity to accept a match.

        Returns:
            str: the best-matching chunk with whitespace normalized, when
            its score is >= threshold.
            dict: ``{"answer": "Answer not found in PDF"}`` otherwise.
            NOTE(review): the success/miss return types differ (str vs dict);
            kept for backward compatibility — callers must handle both.
        """
        q_embedding = model.encode([question])
        scores = cosine_similarity(q_embedding, chunk_embeddings)
        best_score = np.max(scores)
        if best_score < threshold:
            return {"answer": "Answer not found in PDF"}
        best_chunk = chunks[np.argmax(scores)]
        # Collapse whitespace runs left over from PDF text extraction.
        return re.sub(r'\s+', ' ', best_chunk.strip())