# Spaces file view (status: Sleeping) | File size: 1,982 Bytes | 9ddeec6
import PyPDF2
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
class pdf_query:
    """Retrieve the passage of a PDF that best matches a question.

    Typical flow: ``file()`` -> ``extract_text()`` -> ``split_into_chunks()``
    -> ``creat_model()`` -> ``answer_question()``.
    """

    def __init__(self):
        # Sentence-embedding model, loaded once and reused by creat_model().
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # PyPDF2 reader for the current document; set by file().
        self.read = None

    def file(self, file):
        """Open *file* with ``PyPDF2.PdfReader`` and keep the reader on the instance."""
        self.read = PyPDF2.PdfReader(file)

    def extract_text(self):
        """Return the text of all pages joined by newlines, stripped at both ends.

        Pages for which ``extract_text()`` returns a falsy value (``None`` or
        an empty string) are skipped.

        Raises:
            AttributeError: if ``file()`` has not been called yet, because
                ``self.read`` is still ``None``.
        """
        page_texts = (page.extract_text() for page in self.read.pages)
        return "\n".join(content for content in page_texts if content).strip()

    def split_into_chunks(self, text, chunk_size=300):
        """Group the sentences of *text* into chunks of roughly *chunk_size* characters.

        Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
        A sentence is appended to the current chunk while the combined length
        stays within *chunk_size*; otherwise a new chunk is started. A single
        sentence longer than *chunk_size* becomes its own (oversized) chunk.

        Returns:
            A list of non-empty, stripped chunk strings; ``[]`` when *text*
            contains no sentences.
        """
        # Split using punctuation for better sentence boundaries
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if not sentence.strip():
                # re.split yields "" for empty text or trailing whitespace.
                continue
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += sentence + " "
            else:
                # Do not emit an empty chunk when the very first sentence
                # already exceeds chunk_size.
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        return chunks

    def creat_model(self, chunks):
        """Encode *chunks* and return ``(model, chunk_embeddings)``.

        Reuses the model loaded in ``__init__`` instead of loading a second
        copy on every call; a fresh model is only created if the instance has
        no ``model`` attribute.
        """
        model = getattr(self, "model", None)
        if model is None:
            model = SentenceTransformer("all-MiniLM-L6-v2")
        chunk_embeddings = model.encode(chunks)
        return model, chunk_embeddings

    def answer_question(self, question, chunks, chunk_embeddings, model, threshold=0.6):
        """Return the chunk most similar to *question*, if it is similar enough.

        Args:
            question: the question text to encode.
            chunks: the text chunks, in the same order as *chunk_embeddings*.
            chunk_embeddings: embeddings of *chunks* produced by *model*.
            model: object with an ``encode`` method; must be the same model
                that produced *chunk_embeddings*.
            threshold: minimum cosine similarity required to accept a chunk.

        Returns:
            The best chunk as a ``str`` with runs of whitespace collapsed to a
            single space, or the dict ``{"answer": "Answer not found in PDF"}``
            when there are no chunks or the best score is below *threshold*.
            NOTE: the two outcomes have different types; this is kept as-is
            for backward compatibility, so callers must handle both.
        """
        not_found = {"answer": "Answer not found in PDF"}
        if len(chunks) == 0:
            # Nothing to compare against; avoid argmax on an empty array.
            return not_found

        q_embedding = model.encode([question])  # same model as above
        # Single query row, so the flattened scores line up with `chunks`.
        scores = np.ravel(cosine_similarity(q_embedding, chunk_embeddings))
        best_chunk_index = int(np.argmax(scores))
        if scores[best_chunk_index] < threshold:
            return not_found

        # Clean the answer
        return re.sub(r'\s+', ' ', chunks[best_chunk_index].strip())