File size: 1,817 Bytes
dc32969
2309762
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import streamlit as st
from transformers import pipeline
import pdfplumber

# Configure the browser-tab title, then render the on-page heading.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command in a script, so keep it ahead of any other st.* call.
st.set_page_config(page_title="PDF Summarizer & Theme Extractor")
# The heading emoji was previously stored as mis-decoded UTF-8 bytes
# (mojibake); it is written here as the intended character.
st.title("📄 PDF Summary and Theme Explorer")

# Load Hugging Face models
@st.cache_resource
def load_models():
    """Build the two Hugging Face pipelines this app relies on.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the returned objects instead of constructing them on every call.

    Returns:
        A ``(summarizer, classifier)`` tuple: a "summarization" pipeline
        backed by ``facebook/bart-large-cnn`` followed by a
        "zero-shot-classification" pipeline backed by
        ``facebook/bart-large-mnli``.
    """
    return (
        pipeline("summarization", model="facebook/bart-large-cnn"),
        pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
    )

summarizer, classifier = load_models()

# PDF Upload
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

if uploaded_file:
    # Extract text from the PDF. Each page is extracted exactly once (the
    # extraction is the expensive step); pages that yield no text are skipped.
    # The join runs inside the `with` so the lazy generator is consumed while
    # the PDF is still open.
    with pdfplumber.open(uploaded_file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(page_text for page_text in page_texts if page_text)

    if not text.strip():
        st.warning("No readable text found in the PDF.")
    else:
        st.subheader("📚 Extracted Text (Preview)")
        # Only add an ellipsis when the preview really is a truncation.
        preview_limit = 1500
        preview = text[:preview_limit]
        if len(text) > preview_limit:
            preview += "..."
        st.text_area("Extracted Text", preview, height=200)

        with st.spinner("Summarizing..."):
            # Cheap character-level cut to bound the work done per request.
            # Characters are not tokens, so `truncation=True` additionally
            # lets the pipeline clip the input to the model's token limit
            # rather than failing on token-dense text.
            input_text = text[:1024 * 2]
            summary = summarizer(
                input_text,
                max_length=150,
                min_length=50,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']

        st.subheader("📝 Summary")
        st.write(summary)

        with st.spinner("Extracting key themes..."):
            candidate_labels = ["finance", "politics", "health", "technology", "education", "environment", "law", "science", "culture"]
            # Only the first 1024 characters are classified.
            result = classifier(text[:1024], candidate_labels)
            # Keep every label whose score clears the 0.3 threshold.
            # NOTE(review): in the default single-label mode the scores are
            # presumably normalised across all candidate labels, so few can
            # exceed 0.3 at once; confirm whether multi_label=True was intended.
            themes = [label for label, score in zip(result['labels'], result['scores']) if score > 0.3]

        st.subheader("🏷️ Key Themes")
        st.write(", ".join(themes) if themes else "No strong themes identified.")