# Page metadata captured with this file: Space status "Sleeping"; file size 1,817 bytes; refs dc32969, 2309762.
import streamlit as st
from transformers import pipeline
import pdfplumber
# Configure the browser-tab title first, then render the visible page heading.
# NOTE(review): Streamlit's docs have historically required set_page_config to
# run before any other Streamlit command — keep it ahead of st.title.
_page_settings = {"page_title": "PDF Summarizer & Theme Extractor"}
st.set_page_config(**_page_settings)

_page_heading = "π PDF Summary and Theme Explorer"
st.title(_page_heading)
# Hugging Face model loading
@st.cache_resource
def load_models():
    """Build the two Hugging Face pipelines this app relies on.

    Decorated with ``st.cache_resource`` so the pipelines are meant to be
    constructed once and reused instead of being rebuilt on every script run.

    Returns:
        A ``(summarizer, classifier)`` tuple: a ``"summarization"`` pipeline
        using ``facebook/bart-large-cnn`` and a ``"zero-shot-classification"``
        pipeline using ``facebook/bart-large-mnli``.
    """
    return (
        pipeline("summarization", model="facebook/bart-large-cnn"),
        pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
    )
summarizer, classifier = load_models()

# Character budgets: how much extracted text is shown in the preview and how
# much is handed to each model. These are character counts, not token counts.
PREVIEW_CHARS = 1500
SUMMARY_INPUT_CHARS = 1024 * 2
THEME_INPUT_CHARS = 1024

# A label is reported as a theme only when its score exceeds this value.
# NOTE(review): the classifier is called without multi_label, so scores are
# presumably normalised across all labels; a 0.3 cut-off can then admit at
# most three labels — confirm that this is the intended behaviour.
THEME_SCORE_THRESHOLD = 0.3
CANDIDATE_LABELS = [
    "finance",
    "politics",
    "health",
    "technology",
    "education",
    "environment",
    "law",
    "science",
    "culture",
]

# NOTE(review): the leading "π" / "π·οΈ" characters in the subheaders below look
# like mis-decoded emoji. They are left unchanged because the intended glyphs
# cannot all be recovered from this file — confirm and restore them.

# PDF Upload
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

if uploaded_file:
    # Extract text from PDF. Each page is extracted exactly once; pages that
    # yield no text are skipped.
    with pdfplumber.open(uploaded_file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(chunk for chunk in page_texts if chunk)

    if not text.strip():
        st.warning("No readable text found in the PDF.")
    else:
        # Only mark the preview as truncated when text was actually cut off.
        preview = text[:PREVIEW_CHARS]
        if len(text) > PREVIEW_CHARS:
            preview += "..."

        st.subheader("π Extracted Text (Preview)")
        st.text_area("Extracted Text", preview, height=200)

        with st.spinner("Summarizing..."):
            # The character slice keeps the request small, but it does not
            # bound the token count; truncation=True lets the tokenizer clip
            # the input to the model's maximum length instead of failing.
            input_text = text[:SUMMARY_INPUT_CHARS]
            summary = summarizer(
                input_text,
                max_length=150,
                min_length=50,
                do_sample=False,
                truncation=True,
            )[0]["summary_text"]

        st.subheader("π Summary")
        st.write(summary)

        with st.spinner("Extracting key themes..."):
            result = classifier(text[:THEME_INPUT_CHARS], CANDIDATE_LABELS)
            themes = [
                label
                for label, score in zip(result["labels"], result["scores"])
                if score > THEME_SCORE_THRESHOLD
            ]

        st.subheader("π·οΈ Key Themes")
        st.write(", ".join(themes) if themes else "No strong themes identified.")
|