# Page-header residue captured with this file ("Spaces: Build error"); not part of the program.
import html
import re

import fitz  # PyMuPDF for PDF extraction
import nltk
import streamlit as st
from rouge_score import rouge_scorer  # For ROUGE score evaluation
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
# Ensure the necessary tokenizer data is available.
# Look for a local copy first: an unconditional nltk.download() would be
# repeated every time this script is executed, and only a missing resource
# actually needs the download.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab", quiet=True)
# Function to extract text from PDF
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF and return it cleaned.

    Args:
        uploaded_file: Binary file-like object whose ``read()`` returns the
            raw PDF bytes.

    Returns:
        The text of all pages, each followed by a newline, concatenated and
        passed through ``clean_text``.
    """
    # Open the document in a ``with`` block so it is closed even when a page
    # fails to parse; the original never released it.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        page_texts = [page.get_text("text") for page in doc]
    # Join once instead of growing a string with ``+=`` page by page.
    return clean_text("".join(f"{page_text}\n" for page_text in page_texts))
# Function to clean text (removes unwanted symbols, extra spaces, and bullets)
def clean_text(text):
    """Reduce *text* to a single line of plain, readable characters.

    Only ASCII letters, digits, whitespace and the punctuation
    ``. , ! ? ( ) ' " % $ @ &`` are kept. Everything else is dropped, which
    includes bullet glyphs and any other non-ASCII symbol, so no separate
    bullet-removal pass is needed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with each run of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    # Strip disallowed characters first ...
    readable = re.sub(r"[^a-zA-Z0-9.,!?()'\"%$@&\s]", "", text)
    # ... and normalise whitespace last, so that removing a symbol that sat
    # between two spaces ("a - b") cannot leave a double space behind.
    return re.sub(r"\s+", " ", readable).strip()
# Function to summarize text using LSA
def summarize_text(text, num_sentences=3):
    """Summarize *text* with sumy's LSA summarizer.

    Args:
        text: The text to summarize; it is run through ``clean_text`` first.
        num_sentences: Sentence count handed to the summarizer.

    Returns:
        The sentences chosen by the summarizer, converted to strings and
        joined with single spaces.
    """
    cleaned = clean_text(text)
    document = PlaintextParser.from_string(cleaned, Tokenizer("english")).document
    lsa = LsaSummarizer()
    chosen = lsa(document, num_sentences)
    return " ".join(map(str, chosen))
# Function to calculate ROUGE scores
def calculate_rouge(reference_text, generated_summary):
    """Score *generated_summary* against *reference_text* with ROUGE.

    Args:
        reference_text: The text used as the reference.
        generated_summary: The summary being evaluated.

    Returns:
        A 3-tuple ``(rouge1, rouge2, rougeL)`` holding the F-measure of each
        metric, computed with stemming enabled.
    """
    metric_names = ("rouge1", "rouge2", "rougeL")
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    results = scorer.score(reference_text, generated_summary)
    return tuple(results[name].fmeasure for name in metric_names)
# Streamlit UI
# NOTE(review): the emoji that prefix the headings were mis-encoded in the
# captured source (they showed up as a stray Greek letter). The glyphs used
# here are a best guess; confirm the intended ones.
st.title("📄 Text Summarization App")
st.write(
    "This app summarizes long text using **Latent Semantic Analysis (LSA)**, "
    "an **unsupervised learning method**, and evaluates the summary using "
    "**ROUGE scores**."
)

# Sidebar input options: either an uploaded TXT/PDF file or pasted text.
st.sidebar.header("Options")
file_uploaded = st.sidebar.file_uploader("Upload a file (TXT or PDF)", type=["txt", "pdf"])
manual_text = st.sidebar.text_area("Or enter text manually", "")

# Explanation of the models.
# The sub-items are indented so Markdown renders them nested under their
# parent bullet.
st.subheader("🔍 How It Works")
st.markdown("""
- **Summarization Model: Latent Semantic Analysis (LSA)**
  LSA is an **unsupervised learning method** that identifies important sentences using **Singular Value Decomposition (SVD)**.
  It finds hidden relationships between words and sentences **without requiring labeled data**.
- **Evaluation Metric: ROUGE Score**
  - **ROUGE-1**: Measures single-word overlap
  - **ROUGE-2**: Measures two-word sequence overlap
  - **ROUGE-L**: Measures the longest common subsequence
""")
# Summarization button: resolve the input text, summarize it, score the
# summary against the input and render both.
if st.sidebar.button("Summarize"):
    # An uploaded file takes precedence over manually entered text.
    if file_uploaded:
        if file_uploaded.type == "text/plain":  # TXT file
            # Replace undecodable bytes rather than crashing with a
            # UnicodeDecodeError when the upload is not valid UTF-8.
            text = file_uploaded.read().decode("utf-8", errors="replace")
        elif file_uploaded.type == "application/pdf":  # PDF file
            text = extract_text_from_pdf(file_uploaded)
        else:
            st.sidebar.error("Unsupported file format.")
            st.stop()
    elif manual_text.strip():
        text = manual_text
    else:
        st.sidebar.error("Please upload a file or enter text.")
        st.stop()

    # An empty file or a PDF without extractable text leaves nothing to
    # summarize; stop with a message instead of rendering an empty result.
    if not text.strip():
        st.sidebar.error("No readable text was found in the input.")
        st.stop()

    # Show loading animation while the summary and its scores are computed.
    with st.spinner("Summarizing text... Please wait."):
        summary = summarize_text(text, num_sentences=5)
        rouge1, rouge2, rougeL = calculate_rouge(text, summary)

    # NOTE(review): the heading emoji were mis-encoded in the captured
    # source; the glyphs used here are a best guess. Confirm them.
    # Display summary in justified format. The summary comes from
    # user-supplied content and is rendered with unsafe_allow_html=True, so
    # it is escaped before being placed inside the markup.
    st.subheader("📝 Summarized Text")
    st.markdown(
        f"<p style='text-align: justify;'>{html.escape(summary)}</p>",
        unsafe_allow_html=True,
    )

    # Display ROUGE scores
    st.subheader("📊 Summary Quality (ROUGE Score)")
    st.write(f"**ROUGE-1 Score:** {rouge1:.4f}")
    st.write(f"**ROUGE-2 Score:** {rouge2:.4f}")
    st.write(f"**ROUGE-L Score:** {rougeL:.4f}")