Spaces:

nnitiwe
/

pdf-theme-explorer

Sleeping

App Files Files Community

pdf-theme-explorer / src /streamlit_app.py

nnitiwe

Update src/streamlit_app.py

2309762 verified 9 months ago

raw

history blame contribute delete

1.82 kB

	import streamlit as st
	from transformers import pipeline
	import pdfplumber

	# Page chrome: the browser-tab title is configured before the visible heading
	# is drawn. set_page_config is deliberately kept ahead of every other
	# Streamlit call (older Streamlit releases require it to come first).
	TAB_TITLE = "PDF Summarizer & Theme Extractor"
	APP_HEADING = "📄 PDF Summary and Theme Explorer"

	st.set_page_config(page_title=TAB_TITLE)
	st.title(APP_HEADING)

	# Load Hugging Face models
	@st.cache_resource
	def load_models():
	    """Build the two Hugging Face pipelines used by the app.

	    The function is wrapped in ``st.cache_resource`` so the pipelines are
	    created once and reused instead of being rebuilt on every script rerun.

	    Returns:
	        tuple: ``(summarizer, classifier)`` where ``summarizer`` is a
	        ``"summarization"`` pipeline using ``facebook/bart-large-cnn`` and
	        ``classifier`` is a ``"zero-shot-classification"`` pipeline using
	        ``facebook/bart-large-mnli``.
	    """
	    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
	    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
	    return summarizer, classifier

	summarizer, classifier = load_models()

	# Character budgets: how much extracted text is shown in the preview and how
	# much is handed to each model.
	PREVIEW_CHARS = 1500
	SUMMARY_INPUT_CHARS = 1024 * 2
	THEME_INPUT_CHARS = 1024
	# A label is reported as a theme only when its classifier score exceeds this.
	THEME_SCORE_THRESHOLD = 0.3
	CANDIDATE_LABELS = [
	    "finance",
	    "politics",
	    "health",
	    "technology",
	    "education",
	    "environment",
	    "law",
	    "science",
	    "culture",
	]

	# PDF Upload
	uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

	if uploaded_file:
	    # Extract text from PDF. extract_text() is run exactly once per page
	    # (it was previously called twice: once to filter, once for the value);
	    # pages that yield no text are dropped before joining.
	    with pdfplumber.open(uploaded_file) as pdf:
	        page_texts = [page.extract_text() for page in pdf.pages]
	    text = "\n".join(chunk for chunk in page_texts if chunk)

	    if not text.strip():
	        st.warning("No readable text found in the PDF.")
	    else:
	        st.subheader("📚 Extracted Text (Preview)")
	        # Append the ellipsis only when the preview really cuts the text
	        # short, so short documents are not shown as if truncated.
	        preview = text[:PREVIEW_CHARS]
	        if len(text) > PREVIEW_CHARS:
	            preview += "..."
	        st.text_area("Extracted Text", preview, height=200)

	        with st.spinner("Summarizing..."):
	            # The character slice keeps the input small; truncation=True
	            # additionally lets the tokenizer clip the input to the model's
	            # maximum length, so a token-dense slice cannot overflow it.
	            input_text = text[:SUMMARY_INPUT_CHARS]
	            summary = summarizer(
	                input_text,
	                max_length=150,
	                min_length=50,
	                do_sample=False,
	                truncation=True,
	            )[0]["summary_text"]

	        st.subheader("📝 Summary")
	        st.write(summary)

	        with st.spinner("Extracting key themes..."):
	            # NOTE(review): the classifier is called with its default
	            # single-label scoring; if several themes should be able to score
	            # highly at once, consider passing multi_label=True - confirm the
	            # intended behaviour before changing the threshold semantics.
	            result = classifier(text[:THEME_INPUT_CHARS], CANDIDATE_LABELS)
	            themes = [
	                label
	                for label, score in zip(result["labels"], result["scores"])
	                if score > THEME_SCORE_THRESHOLD
	            ]

	        st.subheader("🏷️ Key Themes")
	        st.write(", ".join(themes) if themes else "No strong themes identified.")