Spaces:

IotaCluster
/

Summarizer

Sleeping

File size: 1,501 Bytes

4b4fd48
248c19d
424339c
649792d
424339c
 
 
 
 
 
 
 
 
 
ed33f5c
 
424339c
649792d
4b4fd48
 
424339c
4b4fd48
649792d
424339c
 
 
 
069ec03
 
4e9cc7a
424339c
4e9cc7a
4b4fd48
 
424339c
649792d
4b4fd48
 
 
 
424339c
 
4b4fd48
649792d
4b4fd48
7f36b2c

import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Hub identifier of the distilled BART checkpoint used for summarization.
model_name = "sshleifer/distilbart-cnn-12-6"

# Load the weights and the matching tokenizer once, at import time.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the summarization pipeline a single time so every request reuses it.
summarizer = pipeline(
    "summarization",
    tokenizer=tokenizer,
    model=model,
    truncation=True,
    device=-1,  # -1 runs on CPU; pass a GPU index to run on that device
)

def remove_confidentiality(text: str) -> str:
    """Return *text* with any confidentiality notice and what follows removed.

    Everything from the first ``**CONFIDENTIALITY NOTICE:`` marker to the end
    of the input is dropped, and the remaining text is stripped of leading
    and trailing whitespace. Text without the marker is only stripped.
    """
    notice = re.search(r"\*\*CONFIDENTIALITY NOTICE:.*", text, flags=re.DOTALL)
    if notice is None:
        body = text
    else:
        # Keep only what precedes the marker; the match runs to end of input.
        body = text[:notice.start()]
    return body.strip()

def summarize_text(text):
    """Summarize *text* after removing any confidentiality notice.

    Args:
        text: Raw text to summarize. ``None`` is treated as empty input.

    Returns:
        The ``summary_text`` of the first result returned by the module-level
        ``summarizer`` pipeline, or ``""`` when no text is left to summarize
        once the notice and surrounding whitespace are removed.
    """
    cleaned = remove_confidentiality(text or "")
    if not cleaned:
        # Skip the model call: with min_length set, it would otherwise be
        # forced to generate a "summary" from no input at all.
        return ""

    # NOTE: the pipeline does not split long inputs into chunks. Text beyond
    # the model's input limit is cut off (truncation=True is set where the
    # pipeline is built), so only the leading part of a long text is used.
    summary_list = summarizer(
        cleaned,
        max_length=250,  # upper bound on the generated summary length
        min_length=50,   # lower bound on the generated summary length
        length_penalty=1.0,
        num_beams=3,
        # `temperature` is intentionally not passed: it only applies when
        # sampling (do_sample=True) and is ignored by beam search.
        early_stopping=True,
    )
    return summary_list[0]["summary_text"]

# Web UI: one multi-line input box wired to summarize_text, one output box.
iface = gr.Interface(
    title="Fast & Accurate Summarizer",
    description="Using the distilled BART model for quicker, high-quality summaries.",
    fn=summarize_text,
    inputs=gr.Textbox(placeholder="Paste your text here...", lines=15),
    outputs=gr.Textbox(label="Summary"),
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()