"""Streamlit app that summarizes an uploaded PDF and answers questions about it."""
import os
import re

import pdfminer
from pdfminer.high_level import extract_pages
import streamlit as st
import tensorflow as tf
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer
def preprocess_text(element):
    """Extract and normalize the text of a horizontal text-box element.

    Parameters
    ----------
    element : pdfminer layout object
        One element from a page layout produced by ``extract_pages``.

    Returns
    -------
    str
        Lowercased text with punctuation stripped, or an empty string
        when *element* is not an ``LTTextBoxHorizontal``.
    """
    # Guard clause: anything that is not a horizontal text box yields "".
    if not isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
        return ""
    raw = element.get_text().strip()
    # Drop every character that is neither word-like nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', raw)
    return cleaned.lower()
@st.cache_resource
def _load_qa_components(model_name):
    """Load and cache the QA model and tokenizer for *model_name*.

    ``st.cache_resource`` keeps a single copy per Streamlit process, so
    the multi-gigabyte BERT checkpoint is not re-loaded on every question
    (the original re-built both objects on each call).
    """
    model = TFBertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def answer_question(text, question, max_length=512):
    """Answer *question* from *text* using a SQuAD-finetuned BERT model.

    Parameters
    ----------
    text : str
        Context passage to search for the answer.
    question : str
        Natural-language question.
    max_length : int, optional
        Token budget for question + context; longer input is truncated.

    Returns
    -------
    str
        The extracted answer span, or ``"No answer found."`` when the
        model yields an empty or inverted span.
    """
    qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    qa_model, tokenizer = _load_qa_components(qa_model_name)

    # Tokenize the (question, context) pair; truncation keeps the input
    # within BERT's positional-embedding limit.
    inputs = tokenizer(question, text, return_tensors="tf",
                       padding=True, truncation=True, max_length=max_length)

    outputs = qa_model(inputs)

    # Most likely start/end token positions; end index is exclusive.
    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0]) + 1

    # An inverted/empty span means the model found no answer in the text.
    if answer_end <= answer_start:
        return "No answer found."

    # decode(..., skip_special_tokens=True) prevents [CLS]/[SEP] markers
    # from leaking into the answer, which the original token-join allowed.
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end],
                              skip_special_tokens=True)
    return answer if answer else "No answer found."
# ---------------------------------------------------------------------------
# Streamlit app: upload a PDF, summarize it, and ask questions about it.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"


@st.cache_resource
def _load_summarizer(model_name):
    """Build and cache the summarization pipeline once per process.

    The original re-created the pipeline (re-loading the model) on every
    button press, which dominated the request latency.
    """
    return pipeline("summarization", model=model_name)


# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        # Join per-element text once instead of quadratic `+=` concatenation.
        text = "\n".join(
            preprocess_text(element)
            for page_layout in extract_pages(uploaded_file)
            for element in page_layout
        )
    # Check stripped text: the original appended "\n" per element, so the
    # plain `if text:` check was truthy even when no real text was found
    # and the error branch below was effectively unreachable.
    if text.strip():
        question = st.text_input("Ask a question about the PDF:")
        if st.button("Generate Summary"):
            with st.spinner("Summarizing..."):
                summarizer = _load_summarizer(summarization_model)
                # truncation=True keeps long PDFs within the model's input
                # limit instead of raising; max_length keeps the slider's
                # minimum from exceeding the pipeline's default maximum.
                summary_response = summarizer(
                    text,
                    min_length=min_summary_length,
                    max_length=max(min_summary_length + 50, 142),
                    truncation=True,
                )
            st.subheader("Summary")
            st.write(summary_response[0]["summary_text"])
        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
            st.subheader("Answer")
            st.write(answer)
    else:
        st.error("No text found in the PDF.")