IotaCluster committed on
Commit
649792d
·
verified ·
1 Parent(s): feedcad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -70
app.py CHANGED
@@ -1,71 +1,25 @@
1
- import re
2
  import gradio as gr
3
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
-
5
- # Use a stronger instruction-tuned model
6
- # Options: "google/flan-t5-base", "google/flan-t5-large" (ensure your Space has enough RAM/GPU)
7
- MODEL_NAME = "google/flan-t5-large"
8
-
9
-
10
- def load_cleaner():
11
- """
12
- Lazily load the tokenizer and model to avoid startup caching issues.
13
- """
14
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
15
- model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
16
- return tokenizer, model
17
-
18
- # Initialize as None; will be loaded on first request
19
- tokenizer, model = None, None
20
-
21
-
22
- def clean_email(raw_text: str) -> str:
23
- global tokenizer, model
24
- # Lazy load
25
- if tokenizer is None or model is None:
26
- tokenizer, model = load_cleaner()
27
-
28
- # Remove confidentiality notice
29
- raw_text = re.sub(
30
- r"\*\*CONFIDENTIALITY NOTICE:[\s\S]*$", "", raw_text,
31
- flags=re.IGNORECASE
32
- )
33
- # Build prompt
34
- prompt = (
35
- "Please rewrite this email so that all signatures, forwarded headers, "
36
- "image placeholders, social‑media links, and confidentiality footers are removed. "
37
- "Preserve only the core message:\n\n" + raw_text
38
- )
39
- # Tokenize input (up to 1024 tokens)
40
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
41
-
42
- # Generate cleaned output (minimum and maximum 1024 tokens)
43
- outputs = model.generate(
44
- **inputs,
45
- max_length=1024,
46
- min_length=1024,
47
- num_beams=5,
48
- early_stopping=True
49
- )
50
- # Decode
51
- cleaned = tokenizer.decode(outputs[0], skip_special_tokens=True)
52
- return cleaned
53
-
54
- # Build Gradio interface
55
- def main():
56
- with gr.Blocks() as demo:
57
- gr.Markdown(
58
- "# Email Cleaner"
59
- "\nPaste your raw email below and click **Clean**—signatures, headers, links, and footers will be stripped out."
60
- )
61
- with gr.Row():
62
- inp = gr.Textbox(lines=15, placeholder="Paste raw email here...", label="Raw Email")
63
- out = gr.Textbox(lines=40, label="Cleaned Email (1024 tokens minimum)")
64
- btn = gr.Button("Clean")
65
- btn.click(fn=clean_email, inputs=inp, outputs=out)
66
-
67
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
68
-
69
-
70
- if __name__ == "__main__":
71
- main()
 
 
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint used for summarization. Loaded once at import time so that
# every request served by the app reuses the same tokenizer/model pair.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
8
+
9
# Define the summarization function
def summarize_text(text):
    """Summarize English prose with the module-level T5 model.

    Args:
        text: The raw input text to summarize (str).

    Returns:
        The generated summary as a string. Blank or whitespace-only input
        returns "" — without this guard, ``min_length=40`` would force the
        model to emit ~40 tokens of noise from the bare task prefix.
    """
    stripped = text.strip()
    if not stripped:
        return ""
    # T5 is a text-to-text model: the task is selected via a prompt prefix.
    input_text = "summarize: " + stripped
    # Truncate long inputs; t5-small's context window is 512 tokens, so 500
    # leaves room for the prefix and special tokens.
    input_ids = tokenizer.encode(
        input_text, return_tensors="pt", max_length=500, truncation=True
    )
    # Beam search with a length penalty biases toward reasonably long,
    # coherent summaries between 40 and 140 tokens.
    summary_ids = model.generate(
        input_ids,
        max_length=140,
        min_length=40,
        length_penalty=2.0,
        num_beams=2,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
16
+
17
# Assemble the Gradio UI: one input box, one output box, wired to the
# summarizer above.
summary_input = gr.Textbox(lines=15, placeholder="Paste your text here...")
summary_output = gr.Textbox(label="Summary")

iface = gr.Interface(
    fn=summarize_text,
    inputs=summary_input,
    outputs=summary_output,
    title="T5 Text Summarizer",
    description="Enter any long English text to get a summarized version using the T5 model.",
)

# Start the web server (blocks until shutdown).
iface.launch()