Spaces:

Rejeno
/

Text_Summarization

Build error

Regino

fbng

021c961 10 months ago

4.07 kB

	import html
	import re

	import fitz  # PyMuPDF for PDF extraction
	import nltk
	import streamlit as st
	from rouge_score import rouge_scorer  # For ROUGE score evaluation
	from sumy.nlp.tokenizers import Tokenizer
	from sumy.parsers.plaintext import PlaintextParser
	from sumy.summarizers.lsa import LsaSummarizer

	# Ensure the necessary tokenizer is available. Streamlit re-runs this script
	# on every interaction, so only download when the data is actually missing
	# instead of contacting the NLTK server on each run.
	try:
	    nltk.data.find("tokenizers/punkt_tab")
	except LookupError:
	    nltk.download("punkt_tab")

	# Function to extract text from PDF
	def extract_text_from_pdf(uploaded_file):
	    """Extract and clean the text of every page in an uploaded PDF.

	    Args:
	        uploaded_file: A binary file-like object whose ``read()`` returns
	            the raw bytes of a PDF document.

	    Returns:
	        The plain text of all pages in page order, each page followed by a
	        newline, after being passed through ``clean_text``.
	    """
	    # Use the document as a context manager so it is closed deterministically,
	    # even if extracting a page raises.
	    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
	        text = "".join(page.get_text("text") + "\n" for page in doc)
	    return clean_text(text)

	# Function to clean text (removes unwanted symbols, extra spaces, and bullets)
	def clean_text(text):
	    """Strip unsupported characters from *text* and normalize its whitespace.

	    Args:
	        text: The string to clean.

	    Returns:
	        The text restricted to ASCII letters, digits, whitespace and the
	        punctuation ``. , ! ? ( ) ' " % $ @ &``, with every run of
	        whitespace collapsed to a single space and no leading or trailing
	        whitespace.
	    """
	    # Bullet glyphs (•, ▪, ●, ◦, ...) are outside this whitelist, so a single
	    # pass removes them together with every other unsupported symbol.
	    text = re.sub(r"[^a-zA-Z0-9.,!?()'\"%$@&\s]", "", text)
	    # Collapse whitespace last: deleting a symbol that sat between two spaces
	    # would otherwise leave a double space in the result.
	    text = re.sub(r"\s+", " ", text)
	    return text.strip()

	# Function to summarize text using LSA
	def summarize_text(text, num_sentences=3):
	    """Build an extractive summary of *text* with the LSA summarizer.

	    Args:
	        text: The text to summarize; it is passed through ``clean_text``
	            before being parsed.
	        num_sentences: Number of sentences requested from the summarizer.

	    Returns:
	        The sentences returned by the summarizer, joined by single spaces.
	    """
	    cleaned = clean_text(text)
	    tokenizer = Tokenizer("english")
	    document_parser = PlaintextParser.from_string(cleaned, tokenizer)
	    selected = LsaSummarizer()(document_parser.document, num_sentences)
	    return " ".join(map(str, selected))

	# Function to calculate ROUGE scores
	def calculate_rouge(reference_text, generated_summary):
	    """Score a generated summary against a reference text with ROUGE.

	    Args:
	        reference_text: The text the summary is compared against.
	        generated_summary: The summary to evaluate.

	    Returns:
	        A ``(rouge1, rouge2, rougeL)`` tuple holding the F-measure of each
	        metric, computed with stemming enabled.
	    """
	    metric_names = ["rouge1", "rouge2", "rougeL"]
	    results = rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(
	        reference_text, generated_summary
	    )
	    return tuple(results[name].fmeasure for name in metric_names)

	# Streamlit UI: page header, sidebar inputs, and the summarize action.
	st.title("📄 Text Summarization App")
	st.write("This app summarizes long text using Latent Semantic Analysis (LSA), an unsupervised learning method, and evaluates the summary using ROUGE scores.")

	# Sidebar input options: an uploaded file takes precedence over manual text.
	st.sidebar.header("Options")
	file_uploaded = st.sidebar.file_uploader("Upload a file (TXT or PDF)", type=["txt", "pdf"])
	manual_text = st.sidebar.text_area("Or enter text manually", "")

	# Explanation of the models
	st.subheader("🔎 How It Works")
	st.markdown("""
	- Summarization Model: Latent Semantic Analysis (LSA)
	LSA is an unsupervised learning method that identifies important sentences using Singular Value Decomposition (SVD).
	It finds hidden relationships between words and sentences without requiring labeled data.
	- Evaluation Metric: ROUGE Score
	- ROUGE-1: Measures single-word overlap
	- ROUGE-2: Measures two-word sequence overlap
	- ROUGE-L: Measures the longest common subsequence
	""")

	# Summarization button
	if st.sidebar.button("Summarize"):
	    if file_uploaded:
	        if file_uploaded.type == "text/plain":  # TXT file
	            try:
	                text = file_uploaded.read().decode("utf-8")
	            except UnicodeDecodeError:
	                # Report a non-UTF-8 upload in the sidebar instead of
	                # crashing the app with a traceback.
	                st.sidebar.error("Could not read the file. Please upload a UTF-8 encoded text file.")
	                st.stop()
	        elif file_uploaded.type == "application/pdf":  # PDF file
	            text = extract_text_from_pdf(file_uploaded)
	        else:
	            st.sidebar.error("Unsupported file format.")
	            st.stop()
	    elif manual_text.strip():
	        text = manual_text
	    else:
	        st.sidebar.error("Please upload a file or enter text.")
	        st.stop()

	    # An empty upload (or a PDF with no extractable text) has nothing to
	    # summarize or score.
	    if not text.strip():
	        st.sidebar.error("No readable text was found in the input.")
	        st.stop()

	    # Show loading animation while the summary and its scores are computed.
	    with st.spinner("Summarizing text... Please wait."):
	        # Generate summary
	        summary = summarize_text(text, num_sentences=5)
	        # Calculate ROUGE score of the summary against the full input text
	        rouge1, rouge2, rougeL = calculate_rouge(text, summary)

	    # Display summary in justified format. The summary comes from
	    # user-supplied text and is rendered as raw HTML, so escape it first.
	    st.subheader("📌 Summarized Text")
	    st.markdown(f"<p style='text-align: justify;'>{html.escape(summary)}</p>", unsafe_allow_html=True)

	    # Display ROUGE scores
	    st.subheader("📊 Summary Quality (ROUGE Score)")
	    st.write(f"ROUGE-1 Score: {rouge1:.4f}")
	    st.write(f"ROUGE-2 Score: {rouge2:.4f}")
	    st.write(f"ROUGE-L Score: {rougeL:.4f}")