Spaces:
Sleeping
Sleeping
File size: 1,501 Bytes
4b4fd48 248c19d 424339c 649792d 424339c ed33f5c 424339c 649792d 4b4fd48 424339c 4b4fd48 649792d 424339c 069ec03 4e9cc7a 424339c 4e9cc7a 4b4fd48 424339c 649792d 4b4fd48 424339c 4b4fd48 649792d 4b4fd48 7f36b2c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Hub identifier of the distilled BART checkpoint used for summarization.
model_name = "sshleifer/distilbart-cnn-12-6"

# Load the tokenizer and the sequence-to-sequence model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Build the summarization pipeline once at import time so that every request
# reuses the already-loaded model and tokenizer.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    device=-1,  # -1 selects the CPU; pass a GPU index instead if one is available
)
def remove_confidentiality(text: str) -> str:
    """Strip a trailing confidentiality notice from *text*.

    Everything from the first occurrence of the literal marker
    ``**CONFIDENTIALITY NOTICE:`` up to the end of the input is discarded,
    and what remains is returned with surrounding whitespace removed.

    Args:
        text: Raw input text. An empty string or ``None`` is treated as
            "no text" rather than raising.

    Returns:
        The stripped text preceding the marker; the whole stripped text when
        the marker is absent; ``""`` for empty or ``None`` input.
    """
    if not text:
        return ""
    # The marker is a fixed literal, so a plain string partition is enough;
    # it keeps exactly the part before the first occurrence of the marker.
    body, _, _ = text.partition("**CONFIDENTIALITY NOTICE:")
    return body.strip()
def summarize_text(text: str) -> str:
    """Summarize *text* with the module-level ``summarizer`` pipeline.

    The input is first passed through ``remove_confidentiality`` so that a
    trailing confidentiality notice is not summarized.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary, or ``""`` when nothing is left to summarize
        after cleaning.
    """
    cleaned = remove_confidentiality(text)
    if not cleaned:
        # Nothing to summarize: skip the model call instead of asking it to
        # generate at least `min_length` tokens from an empty input.
        return ""

    # NOTE: the pipeline does not chunk long inputs. It is built with
    # truncation=True, so text beyond the model's input limit is cut off.
    results = summarizer(
        cleaned,
        max_length=250,
        min_length=50,
        length_penalty=1.0,
        num_beams=3,
        # `temperature` is intentionally not passed: it only applies to
        # sampling (do_sample=True) and has no effect on beam search.
        early_stopping=True,
    )
    return results[0]["summary_text"]
# Input and output widgets for the web UI.
_input_box = gr.Textbox(lines=15, placeholder="Paste your text here...")
_output_box = gr.Textbox(label="Summary")

# Wire the summarizer function to the widgets.
iface = gr.Interface(
    fn=summarize_text,
    inputs=_input_box,
    outputs=_output_box,
    title="Fast & Accurate Summarizer",
    description="Using the distilled BART model for quicker, high-quality summaries.",
)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|