# PDF question-answering demo: upload a PDF and ask questions about its content.
import logging

import gradio as gr
import numpy as np
import PyPDF2
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
# --- Logging ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Load Models and Tokenizers (once, at startup) ---
# Sentence embedder used to retrieve the chunk most relevant to the question.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Extractive QA model fine-tuned on SQuAD, plus its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# NOTE(review): `explainer` is loaded but never referenced anywhere in this
# file — confirm before removing, since loading it is a startup side effect.
explainer = pipeline("text2text-generation", model="google/flan-t5-base")
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of an uploaded PDF.

    Args:
        pdf_file: Either a filesystem path (str) or an object exposing a
            ``.name`` attribute pointing at the file — gradio versions
            differ in which of the two ``gr.File`` hands the callback.

    Returns:
        str | None: The concatenated page text, or ``None`` when the file
        cannot be opened or parsed (the error is logged).
    """
    # Newer gradio passes a plain filepath string; older gradio passes a
    # tempfile wrapper with a .name attribute. Accept both.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = ""
    try:
        with open(path, 'rb') as pdf_obj:
            reader = PyPDF2.PdfReader(pdf_obj)
            for page in reader.pages:
                # extract_text() can return None for image-only pages;
                # guard so the concatenation never raises TypeError.
                text += page.extract_text() or ""
    except Exception as e:
        logger.error(f"Error reading PDF: {e}")
        return None
    return text
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping character chunks.

    Args:
        text: Source string; an empty string yields no chunks.
        chunk_size: Maximum characters per chunk (must be positive).
        chunk_overlap: Characters shared between consecutive chunks.
            Must be smaller than ``chunk_size``.

    Returns:
        list[str]: Consecutive slices of *text*, each at most
        ``chunk_size`` long, overlapping by ``chunk_overlap``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or
            ``chunk_overlap >= chunk_size`` — the original while-loop
            advanced by ``chunk_size - chunk_overlap`` and would spin
            forever on a non-positive step.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    step = chunk_size - chunk_overlap
    # Slicing clamps at the end of the string, so no explicit min() needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
def process_and_answer(pdf_file, question):
    """Answer *question* from the uploaded PDF via extractive QA.

    Pipeline: extract text -> chunk -> embed chunks and question ->
    select the most similar chunk -> run the BERT SQuAD model on it.

    Args:
        pdf_file: Upload from ``gr.File``, or ``None`` when nothing was uploaded.
        question: The user's question string.

    Returns:
        str: The extracted answer span, or a human-readable error message.
    """
    if pdf_file is None:
        return "Please upload a PDF file and ask a question."
    if not question or not question.strip():
        return "Please enter a question."

    extracted_text = extract_text_from_pdf(pdf_file)
    if not extracted_text:
        return "Could not extract text from the PDF."

    text_chunks = chunk_text(extracted_text)

    # Retrieve the most relevant chunk by embedding similarity.
    # Brute-force inner product is fine at single-document scale.
    embeddings = embedding_model.encode(text_chunks)
    question_embedding = embedding_model.encode(question)
    similarities = np.inner(question_embedding, embeddings)
    context = text_chunks[int(np.argmax(similarities))]

    # BUG FIX: truncation="only" is not a valid transformers truncation
    # strategy and raises ValueError; "only_second" truncates only the
    # context (second sequence), keeping the question intact.
    inputs = tokenizer(question, context, return_tensors="pt",
                       truncation="only_second", max_length=512)
    with torch.no_grad():
        outputs = qa_model(**inputs)

    # Most likely start/end token positions of the answer span
    # (end index is exclusive, hence the +1).
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer_tokens = tokenizer.convert_ids_to_tokens(
        inputs["input_ids"][0][answer_start:answer_end]
    )
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    return answer.strip() if answer.strip() else "Could not find an answer in the document."
# --- Gradio UI: file upload + question box wired to the QA pipeline ---
with gr.Blocks() as demo:
    gr.Markdown("## Ask Questions About Your Documents")
    gr.Markdown("Upload a PDF and ask specific questions about its content.")

    uploaded_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
    user_question = gr.Textbox(
        label="Your Question",
        placeholder="E.g., Who is the author of this book?",
    )
    find_answer_btn = gr.Button("Find Answer")
    answer_box = gr.Textbox(label="Answer")

    # Clicking the button runs the full extract -> retrieve -> QA pipeline.
    find_answer_btn.click(
        fn=process_and_answer,
        inputs=[uploaded_pdf, user_question],
        outputs=answer_box,
    )

demo.launch()