"""Gradio app: upload a PDF and ask extractive questions about its content.

Pipeline: PDF text extraction (PyPDF2) -> overlapping character chunks ->
sentence-transformer embedding similarity to pick the most relevant chunk ->
BERT extractive QA over that chunk.
"""

import logging

import gradio as gr
import numpy as np
import PyPDF2
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Load models and tokenizers once, at import time ---
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)
qa_model = AutoModelForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)
qa_model.eval()  # inference only; disables dropout
explainer = pipeline("text2text-generation", model="google/flan-t5-base")


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of *pdf_file*, or None on error.

    *pdf_file* is the object Gradio's File component passes in; its ``.name``
    attribute is the path of the uploaded temporary file.
    """
    text = ""
    try:
        with open(pdf_file.name, 'rb') as pdf_obj:
            reader = PyPDF2.PdfReader(pdf_obj)
            for page in reader.pages:
                # extract_text() may return None for image-only pages;
                # treat those as empty rather than crashing on `+=`.
                text += page.extract_text() or ""
    except Exception as e:
        logger.error(f"Error reading PDF: {e}")
        return None
    return text


def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into chunks of at most *chunk_size* characters.

    Consecutive chunks share *chunk_overlap* characters so an answer that
    spans a chunk boundary is still fully contained in some chunk.

    Raises:
        ValueError: if chunk_overlap >= chunk_size (the scan would never
            advance and loop forever).
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    step = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


def process_and_answer(pdf_file, question):
    """Answer *question* using the most relevant chunk of the uploaded PDF.

    Returns the extracted answer string, or a user-facing message when the
    inputs are missing, the PDF yields no text, or no answer span is found.
    """
    if pdf_file is None or not question or not question.strip():
        return "Please upload a PDF file and ask a question."

    extracted_text = extract_text_from_pdf(pdf_file)
    if not extracted_text:
        return "Could not extract text from the PDF."

    text_chunks = chunk_text(extracted_text)
    embeddings = embedding_model.encode(text_chunks)
    question_embedding = embedding_model.encode(question)

    # Simple dot-product similarity search; swap in a vector index (e.g.
    # FAISS) for large documents.
    similarities = np.inner(question_embedding, embeddings)
    context = text_chunks[int(np.argmax(similarities))]

    # BUG FIX: truncation="only" is not a valid transformers truncation
    # strategy and raises ValueError; "only_second" truncates the context
    # (second sequence) while keeping the question intact.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=512,
    )
    with torch.no_grad():
        outputs = qa_model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    # An inverted span means the model found no plausible answer.
    if answer_end <= answer_start:
        return "Could not find an answer in the document."

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(
            inputs["input_ids"][0][answer_start:answer_end]
        )
    )
    return answer.strip() or "Could not find an answer in the document."


with gr.Blocks() as demo:
    gr.Markdown("## Ask Questions About Your Documents")
    gr.Markdown("Upload a PDF and ask specific questions about its content.")

    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    question_input = gr.Textbox(
        label="Your Question",
        placeholder="E.g., Who is the author of this book?",
    )
    answer_button = gr.Button("Find Answer")
    output_answer = gr.Textbox(label="Answer")

    answer_button.click(
        fn=process_and_answer,
        inputs=[pdf_input, question_input],
        outputs=output_answer,
    )

if __name__ == "__main__":
    demo.launch()