Thibault Clérice committed on
Commit
3801f51
·
1 Parent(s): 45f7c20

Demo online !

Browse files
Files changed (2) hide show
  1. app.py +74 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ import torch
4
+
5
+
6
# Hub identifier of the checkpoint backing the demo.
MODEL_NAME = "comma-project/normalization-byt5-small"

# Both objects are created once at import time and reused by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
11
+
12
+
13
def normalize_text(text: str) -> str:
    """Normalize ``text`` with the seq2seq model loaded at module level.

    Args:
        text: Raw text to normalize. ``None`` is tolerated and treated like
            an empty string.

    Returns:
        The decoded model output with special tokens removed, or ``""`` when
        the input is ``None``, empty, or whitespace-only.
    """
    # Guard first so blank submissions never reach the model. ``not text``
    # also covers ``None``, which would otherwise raise AttributeError on
    # ``.strip()``.
    if not text or not text.strip():
        return ""

    # Tokenize into PyTorch tensors, truncating inputs longer than 1024 tokens.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=1024,
    )

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=1024,
            num_beams=2,
            early_stopping=True,
        )

    # A single input yields a single output sequence; decode that one.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
46
+
47
+
48
+ # Gradio interface
49
# Single-input / single-output Gradio UI around ``normalize_text``.
demo = gr.Interface(
    fn=normalize_text,
    inputs=gr.Textbox(
        label="Input Text",
        placeholder="Enter text to normalize...",
        lines=4,
    ),
    outputs=gr.Textbox(
        label="Normalized Text",
        lines=4,
    ),
    title="Text Normalization with ByT5",
    description="Normalize noisy or non-standard text using the ByT5 model.",
    theme="soft",
    # Each row supplies exactly one value, matching the single input Textbox.
    # The two multi-line samples are separate rows rather than one row with
    # two values.
    examples=[
        ["Scͥbo uobiᷤᷤ ñ pauli ł donati."],
        ["""⁊ pitie mlt' lelasce
P ities li dist. uai a ton peire
Nelaissier. """],
        ["""Uer̃ ab his qͥ ita dissert̃
q̃ri debet. qͥd ꝑ amorem dei. quidq ꝑ amorẽ
boni tẽꝑalis ueluit intellig̾e."""],
    ],
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.35.0
3
+ torch>=2.0.0
4
+ sentencepiece>=0.1.99
5
+ accelerate>=0.25.0