# Gradio demo: extractive question answering over an uploaded PDF.
import logging

import gradio as gr
import numpy as np
import PyPDF2
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# --- Load Models and Tokenizers ---
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
explainer = pipeline("text2text-generation", model="google/flan-t5-base")
def extract_text_from_pdf(pdf_file):
text = ""
try:
with open(pdf_file.name, 'rb') as pdfFileObj:
pdfReader = PyPDF2.PdfReader(pdfFileObj)
for pageNum in range(len(pdfReader.pages)):
pageObj = pdfReader.pages[pageNum]
text += pageObj.extract_text()
except Exception as e:
logger.error(f"Error reading PDF: {e}")
return None
return text
def chunk_text(text, chunk_size=500, chunk_overlap=50):
chunks = []
start = 0
while start < len(text):
end = min(start + chunk_size, len(text))
chunks.append(text[start:end])
start += chunk_size - chunk_overlap
return chunks
def process_and_answer(pdf_file, question):
if pdf_file is not None:
extracted_text = extract_text_from_pdf(pdf_file)
if not extracted_text:
return "Could not extract text from the PDF."
text_chunks = chunk_text(extracted_text)
embeddings = embedding_model.encode(text_chunks)
question_embedding = embedding_model.encode(question)
# Simple similarity search (you can use a more efficient method for larger documents)
import numpy as np
similarities = np.inner(question_embedding, embeddings)
most_relevant_chunk_index = np.argmax(similarities)
context = text_chunks[most_relevant_chunk_index]
inputs = tokenizer(question, context, return_tensors="pt", truncation="only", max_length=512)
with torch.no_grad():
outputs = qa_model(**inputs)
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits) + 1
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start_index:answer_end_index]))
return answer.strip() if answer.strip() else "Could not find an answer in the document."
else:
return "Please upload a PDF file and ask a question."
with gr.Blocks() as demo:
gr.Markdown("## Ask Questions About Your Documents")
gr.Markdown("Upload a PDF and ask specific questions about its content.")
pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
question_input = gr.Textbox(label="Your Question", placeholder="E.g., Who is the author of this book?")
answer_button = gr.Button("Find Answer")
output_answer = gr.Textbox(label="Answer")
answer_button.click(
fn=process_and_answer,
inputs=[pdf_input, question_input],
outputs=output_answer
)
demo.launch() |