Spaces:

Ilyakk
/

t5-summarization

Sleeping

App Files Files Community

Ilyakk committed on Sep 22, 2025

Commit

296eba9

verified ·

1 Parent(s): df6c3a8

Upload app.py

Browse files

Files changed (1) hide show

app.py +49 -38

app.py CHANGED Viewed

@@ -1,53 +1,64 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-import nltk
-import torch
-import math
-model_name = "Ilyakk/t5-summarization"  # repo id passed to both from_pretrained calls below
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-nltk.download("punkt")  # module-level call, so it runs on every import; data is for nltk.sent_tokenize in generate_titles
-def generate_titles(text, num_titles=3, temperature=0.7):  # returns a list with one decoded first sentence per requested title
-    # tokenize text
-    inputs = ["summarize: " + text]
-    inputs = tokenizer(inputs, return_tensors="pt")  # no truncation argument is passed, so the full prompt is encoded
-    num_tokens = len(inputs["input_ids"][0])
-    max_input_length = 512  # window size in tokens
-    num_spans = math.ceil(num_tokens / max_input_length)  # number of windows needed to cover the input
-    overlap = math.ceil((num_spans * max_input_length - num_tokens) / max(num_spans - 1, 1))  # tokens shared by neighbouring windows; max(..., 1) avoids dividing by zero for a single window
-    spans_boundaries = []
     start = 0
     for i in range(num_spans):
-        spans_boundaries.append([start + max_input_length * i, start + max_input_length * (i + 1)])  # bounds are not clamped here; tensor slicing below truncates at the end of the sequence
         start -= overlap
-    spans_boundaries_selected = []
-    j = 0
-    for _ in range(num_titles):  # walk the windows round-robin until there is one per requested title
-        spans_boundaries_selected.append(spans_boundaries[j])
-        j += 1
-        if j == len(spans_boundaries):
-            j = 0
-    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]] for boundary in spans_boundaries_selected]
-    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]] for boundary in spans_boundaries_selected]
-    inputs = {
-        "input_ids": torch.stack(tensor_ids),  # torch.stack needs every selected window to have the same length
-        "attention_mask": torch.stack(tensor_masks),
-    }
-    outputs = model.generate(**inputs, do_sample=True, temperature=temperature)  # sampling rather than greedy decoding
-    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-    predicted_titles = [nltk.sent_tokenize(decoded_output.strip())[0] for decoded_output in decoded_outputs]  # NOTE(review): [0] assumes every decoded string contains at least one sentence
-    return predicted_titles
-# Gradio interface
 demo = gr.Interface(
     fn=generate_titles,
     inputs=[
@@ -56,8 +67,8 @@ demo = gr.Interface(
         gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="Temperature"),
     ],
     outputs=gr.List(label="Generated titles"),
-    title="📰 T5 Title Generator",
-    description="Generate candidate titles for articles using a fine-tuned T5 model."
 )
 if __name__ == "__main__":

 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import nltk, math, torch
+MODEL_ID = "ilyakk/t5-summarization" \
+MAX_INPUT_LEN = 512
+\
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+def generate_titles(text: str, num_titles: int = 3, temperature: float = 0.7):  # returns a list of title strings sampled from the model
+    text = (text or "").strip()  # treat None as empty and trim surrounding whitespace
+    if not text:
+        return ["Введите текст статьи выше."]  # single placeholder row; the Russian text means "Enter the article text above."
+    enc = tokenizer(["summarize: " + text], return_tensors="pt")  # whole prompt is encoded once; no truncation argument is passed
+    ids  = enc["input_ids"][0]
+    mask = enc["attention_mask"][0]
+    num_tokens = len(ids)
+    num_spans = max(1, math.ceil(num_tokens / MAX_INPUT_LEN))  # number of MAX_INPUT_LEN-token windows needed to cover the input
+    overlap = math.ceil((num_spans * MAX_INPUT_LEN - num_tokens) / max(num_spans - 1, 1)) if num_spans > 1 else 0  # tokens shared by neighbouring windows; NOTE(review): rounding up can leave a few trailing tokens outside the last window
+    spans = []
     start = 0
     for i in range(num_spans):
+        b0 = start + MAX_INPUT_LEN * i  # start has been reduced by overlap once per earlier window
+        b1 = start + MAX_INPUT_LEN * (i + 1)
+        spans.append([max(0, b0), min(num_tokens, b1)])  # clamp the window to the valid token range
         start -= overlap
+    chosen = [spans[i % len(spans)] for i in range(num_titles)]  # cycle through the windows so there is one input row per requested title
+    batch_ids  = [ids[b0:b1]  for (b0, b1) in chosen]
+    batch_mask = [mask[b0:b1] for (b0, b1) in chosen]
+    batch = {"input_ids": torch.stack(batch_ids), "attention_mask": torch.stack(batch_mask)}  # torch.stack needs every chosen window to have the same length
+    with torch.no_grad():  # generation only; no gradients are recorded
+        outputs = model.generate(
+            **batch,
+            do_sample=True,  # sample rather than decode greedily
+            temperature=float(temperature),
+            max_length=64,  # upper bound on the generated sequence length
+            num_beams=1
+        )
+    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    titles = [ (nltk.sent_tokenize(d.strip())[0] if d.strip() else "").strip() for d in decoded ]  # first sentence of each decoded string, or "" when the decoded string is blank
+    return titles
 demo = gr.Interface(
     fn=generate_titles,
     inputs=[
         gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="Temperature"),
     ],
     outputs=gr.List(label="Generated titles"),
+    title="T5 Title Generator",
+    description="Generate candidate titles for articles using your fine-tuned T5 model."
 )
 if __name__ == "__main__":