# pdf_reader / first1.py
# deepkansara-123's picture
# Upload 6 files
# 9ddeec6 verified
import PyPDF2
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
class pdf_query:
    """Find the passage of a PDF that best matches a question.

    The class wraps a PyPDF2 reader for text extraction and a
    ``SentenceTransformer`` model for embedding text chunks and questions.
    """

    def __init__(self):
        # Embedding model, loaded once here and reused by creat_model().
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # PyPDF2 reader for the current document; populated by file().
        self.read = None

    def file(self, file):
        """Open *file* with ``PyPDF2.PdfReader`` and keep the reader."""
        self.read = PyPDF2.PdfReader(file)

    def extract_text(self):
        """Return the text of all pages joined by newlines and stripped.

        Pages whose ``extract_text()`` result is falsy (``None`` or ``""``)
        are skipped.

        Raises:
            AttributeError: if ``file()`` has not been called yet, because
                ``self.read`` is still ``None``.
        """
        page_texts = (page.extract_text() for page in self.read.pages)
        return "\n".join(content for content in page_texts if content).strip()

    def split_into_chunks(self, text, chunk_size=300):
        """Group the sentences of *text* into chunks of about *chunk_size*.

        Sentences are split after ``.``, ``!`` or ``?`` followed by
        whitespace and packed greedily. A single sentence longer than
        *chunk_size* becomes its own (oversized) chunk.

        Args:
            text: The text to split.
            chunk_size: Target maximum number of characters per chunk.

        Returns:
            A list of non-empty, stripped chunk strings; ``[]`` when *text*
            is empty or contains only whitespace.
        """
        # Split using punctuation for better sentence boundaries
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += sentence + " "
                continue
            # The pending chunk is empty when the very first sentence is
            # already over the limit; never emit an empty chunk.
            finished = current_chunk.strip()
            if finished:
                chunks.append(finished)
            current_chunk = sentence + " "
        last = current_chunk.strip()
        if last:
            chunks.append(last)
        return chunks

    def creat_model(self, chunks):
        """Embed *chunks* and return ``(model, chunk_embeddings)``.

        The model created in ``__init__`` is reused rather than loading a
        second copy of the same weights on every call.
        """
        model = self.model
        chunk_embeddings = model.encode(chunks)
        return model, chunk_embeddings

    def answer_question(self, question, chunks, chunk_embeddings, model, threshold=0.6):
        """Return the chunk most similar to *question*.

        Args:
            question: The question text.
            chunks: Chunk strings, in the same order as *chunk_embeddings*.
            chunk_embeddings: Embeddings of *chunks* produced by *model*.
            model: Object whose ``encode`` embeds the question; it should be
                the same model that produced *chunk_embeddings*.
            threshold: Minimum cosine similarity for a chunk to be accepted.

        Returns:
            The best chunk as a whitespace-normalised string when its score
            is at least *threshold*; otherwise the dict
            ``{"answer": "Answer not found in PDF"}``. The two return types
            differ and are kept that way for backward compatibility.
        """
        # With no chunks there is nothing to compare against; report
        # "not found" instead of failing inside the similarity computation.
        if len(chunks) == 0:
            return {"answer": "Answer not found in PDF"}

        q_embedding = model.encode([question])
        scores = cosine_similarity(q_embedding, chunk_embeddings)
        # scores has a single row (one question), so the flat argmax is the
        # chunk index.
        best_chunk_index = int(np.argmax(scores))
        best_score = np.ravel(scores)[best_chunk_index]
        if best_score < threshold:
            return {"answer": "Answer not found in PDF"}

        # Clean the answer
        return re.sub(r'\s+', ' ', chunks[best_chunk_index].strip())