Spaces:

saket12-hf
/

Text-Summarization

Runtime error

Saket Chaudhari

Update app.py

794e84d verified 10 months ago

3.56 kB

	import gradio as gr
	from transformers import pipeline
	import torch
	from fpdf import FPDF
	import pandas as pd
	import json
	import csv

	# Build the summarization pipeline once at module level so every request
	# reuses the same loaded model.
	text_summary = pipeline(
	    "summarization",
	    torch_dtype=torch.float32,
	    model="sshleifer/distilbart-cnn-12-6",
	)

	def chunk_text(input_text, max_chunk_size=1024):
	    """
	    Split the input text into chunks of at most `max_chunk_size` characters.

	    The text is split on whitespace and words are packed greedily, joined by
	    single spaces, so the original whitespace is normalised. Words are never
	    cut: a single word longer than `max_chunk_size` becomes a chunk of its
	    own, which is the only case where a chunk may exceed the limit.

	    Args:
	        input_text: Text to split.
	        max_chunk_size: Maximum number of characters per chunk.

	    Returns:
	        A list of non-empty chunk strings; empty when the input contains no
	        words.
	    """
	    chunks = []
	    current_chunk = []
	    # Length of " ".join(current_chunk), tracked incrementally so the chunk
	    # does not have to be re-joined for every word.
	    current_length = 0

	    for word in input_text.split():
	        # A separating space is only needed when the chunk already has words.
	        separator = 1 if current_chunk else 0
	        candidate_length = current_length + separator + len(word)

	        if candidate_length <= max_chunk_size:
	            current_chunk.append(word)
	            current_length = candidate_length
	            continue

	        # The word does not fit. Flush what has been collected so far, but
	        # never emit an empty chunk (happens when an oversized word arrives
	        # while the buffer is still empty).
	        if current_chunk:
	            chunks.append(" ".join(current_chunk))
	        current_chunk = [word]
	        current_length = len(word)

	    if current_chunk:
	        chunks.append(" ".join(current_chunk))

	    return chunks

	def summary(input_text, max_length=130, min_length=30, output_format="Plain Text"):
	    """
	    Summarize the input text and render the result in the requested format.

	    The text is split with `chunk_text`, every chunk is summarized on its own
	    by the `text_summary` pipeline, and the per-chunk summaries are joined
	    with single spaces.

	    Args:
	        input_text: Text to summarize.
	        max_length: Upper length bound forwarded to the pipeline per chunk.
	        min_length: Lower length bound forwarded to the pipeline per chunk;
	            clamped so it never exceeds `max_length`.
	        output_format: One of "Plain Text", "JSON", "HTML", "CSV",
	            "Markdown", "PDF" or "Excel".

	    Returns:
	        A string. For "PDF" and "Excel" the file `summary.pdf` /
	        `summary.xlsx` is written to the working directory and a status
	        message naming the file is returned; every other format returns the
	        rendered summary itself.

	    Raises:
	        ValueError: If `output_format` is not one of the supported formats.
	    """
	    # Function-scope imports: these stdlib modules are not imported at file level.
	    import html
	    import io

	    supported_formats = ("Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel")
	    # Fail fast, before any model work, instead of silently returning None.
	    if output_format not in supported_formats:
	        raise ValueError(f"Unsupported output format: {output_format!r}")

	    # Callers such as UI sliders may hand over floats, and nothing stops them
	    # from choosing a minimum above the maximum; normalise both bounds.
	    max_length = int(max_length)
	    min_length = min(int(min_length), max_length)

	    # Skip blank chunks so the model is never asked to summarize an empty string.
	    chunks = [chunk for chunk in chunk_text(input_text) if chunk.strip()]
	    summarized_chunks = [
	        text_summary(chunk, max_length=max_length, min_length=min_length)[0]["summary_text"]
	        for chunk in chunks
	    ]
	    summary_text = " ".join(summarized_chunks)

	    if output_format == "Plain Text":
	        return summary_text

	    if output_format == "JSON":
	        result = {
	            "summary": summary_text,
	            "chunk_count": len(chunks),
	            "original_length": len(input_text.split()),
	            "summary_length": len(summary_text.split()),
	        }
	        return json.dumps(result, indent=4)

	    if output_format == "HTML":
	        # Escape the text so characters like "<" or "&" cannot break the markup.
	        return f"<html><body><h2>Summary</h2><p>{html.escape(summary_text)}</p></body></html>"

	    if output_format == "CSV":
	        # Let the csv module handle quoting; hand-built rows break as soon as
	        # a chunk or summary contains a double quote.
	        buffer = io.StringIO()
	        writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
	        writer.writerow(["Original Text", "Summary"])
	        writer.writerows(zip(chunks, summarized_chunks))
	        return buffer.getvalue()

	    if output_format == "Markdown":
	        return f"## Summary\n\n{summary_text}"

	    if output_format == "PDF":
	        pdf = FPDF()
	        pdf.set_auto_page_break(auto=True, margin=15)
	        pdf.add_page()
	        pdf.set_font("Arial", size=12)
	        # The built-in Arial font is limited to Latin-1; replace anything
	        # outside that range rather than failing on an encoding error.
	        printable_text = summary_text.encode("latin-1", "replace").decode("latin-1")
	        pdf.multi_cell(0, 10, printable_text)
	        pdf_output = "summary.pdf"
	        pdf.output(pdf_output)
	        return f"PDF generated: {pdf_output}"

	    # Only "Excel" remains after the validation above.
	    df = pd.DataFrame({"Original Text": chunks, "Summary": summarized_chunks})
	    excel_output = "summary.xlsx"
	    df.to_excel(excel_output, index=False)
	    return f"Excel file generated: {excel_output}"

	# Gradio front end: a text box, two length sliders and an output-format
	# picker feed `summary`; the result is shown in a single text box.
	iface = gr.Interface(
	    title="Text Summarization",
	    fn=summary,
	    inputs=[
	        gr.Textbox(lines=10, label="Input Text"),
	        gr.Slider(minimum=30, maximum=300, value=130, step=10, label="Max Length"),
	        gr.Slider(minimum=20, maximum=100, value=30, step=10, label="Min Length"),
	        gr.Dropdown(
	            choices=["Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel"],
	            value="Plain Text",
	            label="Output Format",
	        ),
	    ],
	    outputs=gr.Textbox(label="Summarized Output"),
	)

	iface.launch()