Spaces:

Rejeno
/

Text_Summarization

Build error

App Files Files Community

Regino committed on Mar 13, 2025

Commit

8d54f3a

1 Parent(s): 1c11257

dbfdb

Browse files

Files changed (2) hide show

app.py +91 -24
requirements.txt +5 -2

app.py CHANGED Viewed

@@ -1,29 +1,96 @@
 import streamlit as st
-from transformers import pipeline
-# Set page title and description
 st.title("📄 Text Summarization App")
-st.write("""
-This app allows users to upload a text file and get a summarized version using a Natural Language Processing (NLP) model.
-It uses the `transformers` library from Hugging Face, which provides state-of-the-art machine learning models.
 """)
-# Load summarization pipeline
-summarizer = pipeline("summarization")
-# File uploader
-uploaded_file = st.file_uploader("Upload a text file", type=["txt"])
-if uploaded_file is not None:
-    # Read the file content
-    text = uploaded_file.read().decode("utf-8")
-    # Display original text (optional)
-    st.subheader("Original Text")
-    st.text_area("Content:", text, height=200)
-    # Summarize the text
-    if st.button("Summarize"):
-        summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
-        st.subheader("Summarized Text")
-        st.write(summary[0]['summary_text'])

 import streamlit as st
+import fitz  # PyMuPDF for PDF extraction
+import re
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.lsa import LsaSummarizer
+from rouge_score import rouge_scorer  # For ROUGE score evaluation
+# Function to extract text from PDF
+def extract_text_from_pdf(uploaded_file):
+    """Extract and clean the plain text of every page in an uploaded PDF.
+
+    Args:
+        uploaded_file: A file-like object whose ``read()`` returns the raw
+            PDF bytes (here, the object returned by the Streamlit uploader).
+
+    Returns:
+        The text of all pages, each followed by a newline, passed through
+        ``clean_text``.
+    """
+    # Open the document as a context manager so it is closed even if text
+    # extraction fails part-way through.
+    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
+        # Join once instead of growing a string with += on every page.
+        text = "".join(page.get_text("text") + "\n" for page in doc)
+    return clean_text(text)
+# Function to clean text (removes unwanted symbols, extra spaces, and bullets)
+def clean_text(text):
+    """Strip bullets and unsupported symbols from *text* and tidy whitespace.
+
+    Only ASCII letters, digits, whitespace and the punctuation
+    ``. , ! ? ( ) ' " % $ @ &`` are kept.
+
+    Args:
+        text: The raw string to clean.
+
+    Returns:
+        The cleaned string with every whitespace run collapsed to a single
+        space and no leading or trailing whitespace.
+    """
+    # Remove bullet glyphs (literal and escaped forms in one pass).
+    text = re.sub(r"[•▪●◦○▶♦\u2022\u2023\u25AA\u25AB\u25A0\u25CF\u00B7]", "", text)
+    # Keep only readable text.
+    text = re.sub(r"[^a-zA-Z0-9.,!?()'\"%$@&\s]", "", text)
+    # Collapse whitespace last: deleting a symbol that sat between two spaces
+    # (e.g. "a – b") would otherwise leave a double space behind.
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+# Function to summarize text using LSA
+def summarize_text(text, num_sentences=3):
+    """Return an extractive LSA summary of *text*.
+
+    Args:
+        text: The text to summarize; it is passed through ``clean_text`` first.
+        num_sentences: Number of sentences requested from the summarizer.
+
+    Returns:
+        The selected sentences joined by single spaces.
+    """
+    cleaned = clean_text(text)
+    document = PlaintextParser.from_string(cleaned, Tokenizer("english")).document
+    selected = LsaSummarizer()(document, num_sentences)
+    return " ".join(map(str, selected))
+# Function to calculate ROUGE scores
+def calculate_rouge(reference_text, generated_summary):
+    """Score *generated_summary* against *reference_text* with ROUGE.
+
+    Args:
+        reference_text: The text used as the reference.
+        generated_summary: The summary to evaluate.
+
+    Returns:
+        A ``(rouge1, rouge2, rougeL)`` tuple of F-measure values.
+    """
+    metrics = ("rouge1", "rouge2", "rougeL")
+    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)
+    results = scorer.score(reference_text, generated_summary)
+    return tuple(results[name].fmeasure for name in metrics)
+# Streamlit UI
 st.title("📄 Text Summarization App")
+st.write("This app summarizes long text using **Latent Semantic Analysis (LSA)**, an **unsupervised learning method**, and evaluates the summary using **ROUGE scores**.")
+# Sidebar inputs: an optional TXT/PDF upload and a free-text box. Both values
+# are read when the "Summarize" button is pressed; an uploaded file is checked
+# first, and the manual text is used only when no file was uploaded.
+st.sidebar.header("Options")
+file_uploaded = st.sidebar.file_uploader("Upload a file (TXT or PDF)", type=["txt", "pdf"])
+manual_text = st.sidebar.text_area("Or enter text manually", "")
+# Static explanation of the summarization model and the evaluation metric
+st.subheader("🔎 How It Works")
+st.markdown("""
+- **Summarization Model: Latent Semantic Analysis (LSA)**
+  LSA is an **unsupervised learning method** that identifies important sentences using **Singular Value Decomposition (SVD)**.
+  It finds hidden relationships between words and sentences **without requiring labeled data**.
+- **Evaluation Metric: ROUGE Score**
+  - **ROUGE-1**: Measures single-word overlap
+  - **ROUGE-2**: Measures two-word sequence overlap
+  - **ROUGE-L**: Measures the longest common subsequence
 """)
+# Summarization button
+if st.sidebar.button("Summarize"):
+    # Local import: only needed to escape the summary before HTML rendering.
+    import html
+    # Resolve the input text; an uploaded file takes precedence over manual text.
+    if file_uploaded:
+        if file_uploaded.type == "text/plain":  # TXT file
+            try:
+                text = file_uploaded.read().decode("utf-8")
+            except UnicodeDecodeError:
+                # A non-UTF-8 upload would otherwise crash the app with a traceback.
+                st.sidebar.error("Could not decode the file. Please upload a UTF-8 encoded text file.")
+                st.stop()
+        elif file_uploaded.type == "application/pdf":  # PDF file
+            text = extract_text_from_pdf(file_uploaded)
+        else:
+            st.sidebar.error("Unsupported file format.")
+            st.stop()
+    elif manual_text.strip():
+        text = manual_text
+    else:
+        st.sidebar.error("Please upload a file or enter text.")
+        st.stop()
+    # Generate summary
+    summary = summarize_text(text, num_sentences=5)
+    if not summary:
+        # Nothing was selected (e.g. the input had no usable sentences), so
+        # there is nothing meaningful to display or score.
+        st.warning("No summary could be generated from the provided text.")
+        st.stop()
+    # Calculate ROUGE score (the source text itself serves as the reference)
+    rouge1, rouge2, rougeL = calculate_rouge(text, summary)
+    # Display summary in justified format; the summary is derived from user
+    # input, so escape it before embedding it in raw HTML.
+    st.subheader("📌 Summarized Text")
+    st.markdown(f"<p style='text-align: justify;'>{html.escape(summary)}</p>", unsafe_allow_html=True)
+    # Display ROUGE scores
+    st.subheader("📊 Summary Quality (ROUGE Score)")
+    st.write(f"**ROUGE-1 Score:** {rouge1:.4f}")
+    st.write(f"**ROUGE-2 Score:** {rouge2:.4f}")
+    st.write(f"**ROUGE-L Score:** {rougeL:.4f}")

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
 streamlit
-torch
-transformers

 streamlit
+pymupdf
+sumy
+rouge-score
+numpy
+nltk