Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| import re | |
| class pdf_query: | |
| def __init__(self): | |
| self.model = SentenceTransformer("all-MiniLM-L6-v2") | |
| self.read = None | |
| def file(self, file): | |
| self.read = PyPDF2.PdfReader(file) | |
| def extract_text(self): | |
| text = "" | |
| for page in self.read.pages: | |
| content = page.extract_text() | |
| if content: | |
| text += content + "\n" | |
| return text.strip() | |
| def split_into_chunks(self, text, chunk_size=300): | |
| # Split using punctuation for better sentence boundaries | |
| sentences = re.split(r'(?<=[.!?])\s+', text) | |
| chunks = [] | |
| current_chunk = "" | |
| for sentence in sentences: | |
| if len(current_chunk) + len(sentence) <= chunk_size: | |
| current_chunk += sentence + " " | |
| else: | |
| chunks.append(current_chunk.strip()) | |
| current_chunk = sentence + " " | |
| if current_chunk: | |
| chunks.append(current_chunk.strip()) | |
| return chunks | |
| def creat_model(self,chunks): | |
| model = SentenceTransformer("all-MiniLM-L6-v2") | |
| chunk_embeddings = model.encode(chunks) | |
| return model,chunk_embeddings | |
| def answer_question(self,question, chunks, chunk_embeddings,model,threshold=0.6): | |
| q_embedding = model.encode([question]) # same model as above | |
| scores = cosine_similarity(q_embedding, chunk_embeddings) | |
| best_score = np.max(scores) | |
| best_chunk_index = np.argmax(scores) | |
| if best_score >= threshold: | |
| best_chunk = chunks[best_chunk_index] | |
| # Clean the answer | |
| cleaned_answer = re.sub(r'\s+', ' ', best_chunk.strip()) | |
| return cleaned_answer | |
| else: | |
| return {"answer": "Answer not found in PDF"} |