File size: 1,501 Bytes
4b4fd48
248c19d
424339c
649792d
424339c
 
 
 
 
 
 
 
 
 
ed33f5c
 
424339c
649792d
4b4fd48
 
424339c
4b4fd48
649792d
424339c
 
 
 
069ec03
 
4e9cc7a
424339c
4e9cc7a
4b4fd48
 
424339c
649792d
4b4fd48
 
 
 
424339c
 
4b4fd48
649792d
4b4fd48
7f36b2c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Hub identifier of the distilled BART checkpoint; the tokenizer and the
# seq2seq model weights are both loaded from this same name.
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Build the summarization pipeline once at import time so that every call
# reuses the model and tokenizer loaded above.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    device=-1,  # -1 selects the CPU; pass a GPU index to run on that device
)

def remove_confidentiality(text: str) -> str:
    """Return *text* with any trailing confidentiality notice removed.

    Everything from the first ``**CONFIDENTIALITY NOTICE:`` marker through
    the end of the input is discarded, and what remains is stripped of
    leading and trailing whitespace. Input without the marker is only
    stripped.
    """
    # DOTALL lets ``.*`` run across newlines, so the match always extends to
    # the end of the input; only its start position matters here.
    notice = re.search(r"\*\*CONFIDENTIALITY NOTICE:.*", text, flags=re.DOTALL)
    if notice is None:
        return text.strip()
    return text[:notice.start()].strip()

def summarize_text(text: str) -> str:
    """Summarize *text* after removing any confidentiality notice.

    Args:
        text: Raw input text, e.g. the contents of the UI textbox.

    Returns:
        The generated summary, or an empty string when there is nothing to
        summarize (missing, blank, or notice-only input).
    """
    # Guard first: blank input would otherwise be sent to the model, which
    # would still be asked to generate at least ``min_length`` tokens.
    if not text or not text.strip():
        return ""

    cleaned = remove_confidentiality(text)
    if not cleaned:
        # The input consisted solely of the confidentiality notice.
        return ""

    # ``max_length``/``min_length`` bound the generated summary, not the
    # input. Long inputs are not chunked: the module-level pipeline is built
    # with ``truncation=True``, so over-long input is truncated instead.
    # ``temperature`` is deliberately not passed: it only influences
    # sampling, and this call uses beam search without ``do_sample=True``.
    summary_list = summarizer(
        cleaned,
        max_length=250,
        min_length=50,
        length_penalty=1.0,
        num_beams=3,
        early_stopping=True,
    )
    return summary_list[0]["summary_text"]

# Gradio UI: a multi-line input textbox wired to summarize_text, with the
# result displayed in a textbox labelled "Summary".
iface = gr.Interface(
    title="Fast & Accurate Summarizer",
    description="Using the distilled BART model for quicker, high-quality summaries.",
    fn=summarize_text,
    inputs=gr.Textbox(
        lines=15,
        placeholder="Paste your text here...",
    ),
    outputs=gr.Textbox(label="Summary"),
)

# Start the web app only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    iface.launch()