"""Streamlit app that answers a question about the text of an uploaded PDF.

The PDF text is extracted with PyPDF2 and fed, together with the user's
question, to a BERT extractive question-answering model. The model picks the
most likely answer span inside the (possibly truncated) document text.
"""
import os
from typing import Dict

import PyPDF2
import streamlit as st
import torch
from transformers import BertForQuestionAnswering, BertTokenizerFast

# Checkpoint fine-tuned on SQuAD. The plain "bert-base-uncased" checkpoint has
# no trained question-answering head, so its span predictions are random.
MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

# BERT's position embeddings cap the combined question + context length.
MAX_SEQUENCE_LENGTH = 512

# Must run before any other Streamlit command, including the cached loader
# below (which may render a spinner while the model downloads).
st.set_page_config(page_title="PDF Question Answering", layout="wide")


@st.cache_resource
def _load_qa_components():
    """Load the tokenizer and QA model once and share them across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the model would be re-instantiated each time.
    """
    qa_tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    qa_model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)
    qa_model.eval()
    return qa_tokenizer, qa_model


# Load the pre-trained model and tokenizer
tokenizer, model = _load_qa_components()


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Either a filesystem path (``str``, ``bytes`` or
            ``os.PathLike``) or an already-open binary file-like object such
            as the value returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined together; pages without extractable
        text contribute an empty string.
    """

    def _read_all_pages(stream):
        reader = PyPDF2.PdfReader(stream)
        # extract_text() can yield nothing for image-only pages.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if isinstance(pdf_path, (str, bytes, os.PathLike)):
        with open(pdf_path, "rb") as file:
            return _read_all_pages(file)
    return _read_all_pages(pdf_path)


def preprocess_text(question, context, max_length=MAX_SEQUENCE_LENGTH):
    """Tokenize a question/context pair into PyTorch tensors for the model.

    Args:
        question: The user's question.
        context: The document text in which to look for the answer.
        max_length: Maximum number of tokens in the encoded pair.

    Returns:
        The tokenizer output (``input_ids``, ``token_type_ids``,
        ``attention_mask``) as ``"pt"`` tensors.
    """
    return tokenizer(
        question,
        context,
        # Only the context is cut, so the question is always kept whole.
        # NOTE: text beyond the limit is dropped, so only the beginning of a
        # long document is searched for the answer.
        truncation="only_second",
        max_length=max_length,
        return_tensors="pt",
    )


def question_answering_system(question, pdf_path):
    """Answer ``question`` using the text of the given PDF.

    Args:
        question: The user's question.
        pdf_path: Path to a PDF file, or a binary file-like object holding
            one (see ``extract_text_from_pdf``).

    Returns:
        The decoded answer span, or an empty string when the model does not
        select a valid span.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    context = extract_text_from_pdf(pdf_path)
    if not context.strip():
        raise ValueError("No extractable text was found in the PDF.")

    inputs = preprocess_text(question, context)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        outputs = model(**inputs)

    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1  # exclusive bound
    if end_index <= start_index:
        # The independently chosen end precedes the start: no usable span.
        return ""

    answer_tokens = inputs["input_ids"][0][start_index:end_index]
    # decode() merges word pieces ("##ing") and drops [CLS]/[SEP] markers.
    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()


# Set up Streamlit app
st.title("PDF Question Answering System")
st.write("Upload a PDF file and enter a question related to its content.")

pdf_file = st.file_uploader("Upload PDF File", type=["pdf"])
uploaded_file_name = pdf_file.name if pdf_file else ""

question = st.text_input("Enter your question:", key="question")

if pdf_file and question:
    try:
        # The upload lives in memory, not on disk, so pass the file object
        # itself rather than its name.
        answer = question_answering_system(question, pdf_file)
    except Exception as e:
        st.error(f"Error: {str(e)}")
    else:
        if answer:
            st.success(f"Answer: {answer}")
        else:
            st.warning("No answer could be found in the document.")

st.markdown("Made with ❤️ by [Streamlit](https://streamlit.io/) and [Hugging Face Transformers](https://huggingface.co/transformers/)")