Spaces:

nnitiwe
/

pdf-theme-explorer

Sleeping

App Files Files Community

nnitiwe committed on Jun 23, 2025

Commit

2309762

verified ·

1 Parent(s): 0a4a39c

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +45 -39

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,46 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+from transformers import pipeline
+import pdfplumber
+# Set the title
+st.set_page_config(page_title="PDF Summarizer & Theme Extractor")
+st.title("📄 PDF Summary and Theme Explorer")
+# Load Hugging Face models
+@st.cache_resource
+def load_models():
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+    return summarizer, classifier
+summarizer, classifier = load_models()
+# PDF Upload
+uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+if uploaded_file:
+    # Extract text from PDF
+    with pdfplumber.open(uploaded_file) as pdf:
+        text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
+    if not text.strip():
+        st.warning("No readable text found in the PDF.")
+    else:
+        st.subheader("📚 Extracted Text (Preview)")
+        st.text_area("Extracted Text", text[:1500] + "...", height=200)
+        with st.spinner("Summarizing..."):
+            # Truncate text for summarization
+            input_text = text[:1024 * 2]  # Transformers limit input tokens
+            summary = summarizer(input_text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
+        st.subheader("📝 Summary")
+        st.write(summary)
+        with st.spinner("Extracting key themes..."):
+            candidate_labels = ["finance", "politics", "health", "technology", "education", "environment", "law", "science", "culture"]
+            result = classifier(text[:1024], candidate_labels)
+            themes = [label for label, score in zip(result['labels'], result['scores']) if score > 0.3]
+        st.subheader("🏷️ Key Themes")
+        st.write(", ".join(themes) if themes else "No strong themes identified.")