Spaces:
Runtime error
| import gradio as gr | |
| from transformers import pipeline | |
| import torch | |
| from fpdf import FPDF | |
| import pandas as pd | |
| import json | |
| import csv | |
# Build the summarization pipeline once at import time so that every request
# handled by this module reuses the same loaded model.
SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"
text_summary = pipeline(
    "summarization",
    model=SUMMARIZATION_MODEL,
    torch_dtype=torch.float32,
)
def chunk_text(input_text, max_chunk_size=1024):
    """
    Split `input_text` into whitespace-delimited chunks of bounded length.

    Words are packed greedily: a word joins the current chunk as long as the
    space-joined chunk stays within `max_chunk_size` characters. Words are
    never broken apart, so a single word longer than `max_chunk_size` is
    emitted as a chunk of its own (the only case where a chunk exceeds the
    limit).

    Args:
        input_text: Text to split; runs of whitespace are collapsed.
        max_chunk_size: Maximum chunk length in characters.

    Returns:
        A list of non-empty chunk strings, in original word order. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # length of " ".join(current_chunk), kept incrementally

    for word in input_text.split():
        # A separating space is only needed when the chunk already has words.
        candidate_length = current_length + len(word) + (1 if current_chunk else 0)

        # Only flush when there is something to flush; otherwise an oversized
        # first word would produce a spurious empty chunk.
        if current_chunk and candidate_length > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = candidate_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def summary(input_text, max_length=130, min_length=30, output_format="Plain Text"):
    """
    Summarize `input_text` and render the result in the requested format.

    The text is split with `chunk_text`, each chunk is summarized separately
    with the module-level `text_summary` pipeline, and the per-chunk summaries
    are joined with single spaces.

    Args:
        input_text: Text to summarize. `None`, empty, or whitespace-only input
            produces an empty summary without invoking the model.
        max_length: Upper bound passed to the pipeline for each chunk summary.
        min_length: Lower bound passed to the pipeline for each chunk summary;
            clamped so it never exceeds `max_length`.
        output_format: One of "Plain Text", "JSON", "HTML", "CSV", "Markdown",
            "PDF" or "Excel".

    Returns:
        A string: the rendered summary for the text formats, or a status
        message naming the generated file for "PDF" (`summary.pdf`) and
        "Excel" (`summary.xlsx`), which are written to the working directory.

    Raises:
        ValueError: If `output_format` is not one of the supported formats.
    """
    # Function-scope imports: these stdlib modules are only needed here.
    import html
    import io

    supported_formats = ("Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel")
    # Reject unknown formats up front, before any expensive model call, rather
    # than silently returning None after doing all the work.
    if output_format not in supported_formats:
        raise ValueError(
            f"Unsupported output format: {output_format!r}. "
            f"Expected one of: {', '.join(supported_formats)}"
        )

    # Callers such as UI sliders may supply floats; the length bounds are
    # counts, so normalize them to ints and keep min <= max.
    max_length = int(max_length)
    min_length = min(int(min_length), max_length)

    text = input_text or ""
    if text.strip():
        # Drop empty chunks so the model is never asked to summarize "".
        chunks = [piece for piece in chunk_text(text) if piece]
    else:
        chunks = []

    summarized_chunks = [
        text_summary(piece, max_length=max_length, min_length=min_length)[0]["summary_text"]
        for piece in chunks
    ]
    summary_text = " ".join(summarized_chunks)

    if output_format == "Plain Text":
        return summary_text

    if output_format == "JSON":
        result = {
            "summary": summary_text,
            "chunk_count": len(chunks),
            "original_length": len(text.split()),
            "summary_length": len(summary_text.split()),
        }
        return json.dumps(result, indent=4)

    if output_format == "HTML":
        # Escape markup characters so the summary cannot break the document.
        # quote=False leaves apostrophes and quotes readable in element text.
        escaped_summary = html.escape(summary_text, quote=False)
        return f"<html><body><h2>Summary</h2><p>{escaped_summary}</p></body></html>"

    if output_format == "CSV":
        # csv.writer handles quoting of commas, quotes and newlines in fields.
        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerow(["Original Text", "Summary"])
        writer.writerows(zip(chunks, summarized_chunks))
        return buffer.getvalue()

    if output_format == "Markdown":
        return f"## Summary\n\n{summary_text}"

    if output_format == "PDF":
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        # The built-in Arial font only covers Latin-1; replace anything else
        # (curly quotes, dashes, ...) instead of failing on encoding.
        printable_text = summary_text.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, printable_text)
        pdf_output = "summary.pdf"
        pdf.output(pdf_output)
        return f"PDF generated: {pdf_output}"

    # Only "Excel" remains after the validation above.
    df = pd.DataFrame({"Original Text": chunks, "Summary": summarized_chunks})
    excel_output = "summary.xlsx"
    df.to_excel(excel_output, index=False)
    return f"Excel file generated: {excel_output}"
# Assemble the Gradio UI: a text box, two length sliders and an output-format
# selector feed `summary`, whose result is shown in a single text box.
summary_inputs = [
    gr.Textbox(label="Input Text", lines=10),
    gr.Slider(label="Max Length", minimum=30, maximum=300, step=10, value=130),
    gr.Slider(label="Min Length", minimum=20, maximum=100, step=10, value=30),
    gr.Dropdown(
        label="Output Format",
        choices=["Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel"],
        value="Plain Text",
    ),
]
summary_output = gr.Textbox(label="Summarized Output")

iface = gr.Interface(
    fn=summary,
    inputs=summary_inputs,
    outputs=summary_output,
    title="Text Summarization",
)
iface.launch()