# app.py — PDF question-answering Gradio demo (commit 63b8334)
import logging

import gradio as gr
import numpy as np
import PyPDF2
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
# Root logger configured at INFO; module-scoped logger per stdlib convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# --- Load Models and Tokenizers ---
# Sentence embedder used to rank document chunks against the question.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# BERT-large fine-tuned on SQuAD for extractive question answering;
# tokenizer and model must come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
# NOTE(review): `explainer` is loaded but never used anywhere in this file —
# confirm it can be removed to save startup time and memory.
explainer = pipeline("text2text-generation", model="google/flan-t5-base")
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page of an uploaded PDF.

    Args:
        pdf_file: Gradio file object; its ``.name`` is the path on disk.

    Returns:
        The concatenated page text (no separator between pages, matching
        the original behavior), or ``None`` if the PDF could not be read.
    """
    try:
        with open(pdf_file.name, 'rb') as pdf_handle:
            reader = PyPDF2.PdfReader(pdf_handle)
            # extract_text() may return None for image-only/empty pages;
            # coalesce to "" so the join never raises TypeError.
            pages = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Best-effort by design: callers treat None as "unreadable PDF".
        logger.error(f"Error reading PDF: {e}")
        return None
    return "".join(pages)
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping fixed-size character chunks.

    Args:
        text: String to split; an empty string yields an empty list.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        List of substrings covering all of *text* in order.

    Raises:
        ValueError: If ``chunk_overlap >= chunk_size`` — the original
            loop would never advance and spin forever.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    step = chunk_size - chunk_overlap
    # Slicing clips at the end of the string, so no min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
def process_and_answer(pdf_file, question):
    """Answer a free-text question about an uploaded PDF.

    Pipeline: extract text -> chunk -> embed chunks and question ->
    select the most similar chunk -> extractive QA with BERT.

    Args:
        pdf_file: Gradio file object for the uploaded PDF, or None.
        question: The user's question string.

    Returns:
        The extracted answer, or a user-facing error message.
    """
    if pdf_file is None or not question or not question.strip():
        return "Please upload a PDF file and ask a question."

    extracted_text = extract_text_from_pdf(pdf_file)
    if not extracted_text:
        return "Could not extract text from the PDF."

    text_chunks = chunk_text(extracted_text)

    # Rank chunks by inner product with the question embedding.
    # (A vector index such as FAISS would scale better for large documents.)
    embeddings = embedding_model.encode(text_chunks)
    question_embedding = embedding_model.encode(question)
    similarities = np.inner(question_embedding, embeddings)
    context = text_chunks[int(np.argmax(similarities))]

    # BUGFIX: "only" is not a valid truncation strategy and raises at
    # runtime; "only_second" truncates just the context so the question
    # is never cut off.
    inputs = tokenizer(question, context, return_tensors="pt",
                       truncation="only_second", max_length=512)
    with torch.no_grad():
        outputs = qa_model(**inputs)

    answer_start_index = torch.argmax(outputs.start_logits)
    answer_end_index = torch.argmax(outputs.end_logits) + 1
    # An inverted span means the model found no answer in this context.
    if answer_end_index <= answer_start_index:
        return "Could not find an answer in the document."

    answer_tokens = inputs["input_ids"][0][answer_start_index:answer_end_index]
    # decode() with skip_special_tokens avoids leaking [CLS]/[SEP] into
    # the answer, unlike convert_tokens_to_string on raw ids.
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer.strip() if answer.strip() else "Could not find an answer in the document."
# --- Gradio UI: file upload + question box wired to the QA pipeline ---
with gr.Blocks() as demo:
    gr.Markdown("## Ask Questions About Your Documents")
    gr.Markdown("Upload a PDF and ask specific questions about its content.")

    uploaded_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
    user_question = gr.Textbox(
        label="Your Question",
        placeholder="E.g., Who is the author of this book?",
    )
    find_answer_btn = gr.Button("Find Answer")
    answer_box = gr.Textbox(label="Answer")

    # Clicking the button runs the full extract/retrieve/answer pipeline.
    find_answer_btn.click(
        fn=process_and_answer,
        inputs=[uploaded_pdf, user_question],
        outputs=answer_box,
    )

demo.launch()