Badhon committed on
Commit
ea715a3
·
verified ·
1 Parent(s): f30c460

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -32
app.py CHANGED
@@ -2,56 +2,55 @@ import os
2
  import gradio as gr
3
  from transformers import pipeline
4
 
5
- model_id = "Badhon/Bangla_punctuation_restore"
 
 
 
 
 
 
 
 
 
 
6
 
7
- # Token classification pipeline
8
  punctuator = pipeline(
9
  "token-classification",
10
- model=model_id,
11
  aggregation_strategy="simple",
12
- token=os.getenv("HF_TOKEN") # required if model is private
13
  )
14
 
15
- def restore_punctuation(text):
16
  if not text.strip():
17
  return ""
18
 
19
- tokens = punctuator(text)
20
- words = text.split()
21
 
22
- output = []
23
- word_idx = 0
24
 
25
- for token in tokens:
26
- word = words[word_idx]
27
- label = token["entity_group"]
 
28
 
29
- output.append(word)
 
 
30
 
31
- if label == "COMMA":
32
- output.append(",")
33
- elif label == "PERIOD":
34
- output.append("।")
35
- elif label == "QUESTION":
36
- output.append("?")
37
- elif label == "EXCLAMATION":
38
- output.append("!")
39
- elif label == "COLON":
40
- output.append(":")
41
- elif label == "SEMICOLON":
42
- output.append(";")
43
 
44
- word_idx += 1
45
 
46
- return "".join(output)
47
-
48
- # Gradio UI
49
  demo = gr.Interface(
50
  fn=restore_punctuation,
51
- inputs=gr.Textbox(lines=4, placeholder="Enter Bangla text without punctuation"),
52
  outputs="text",
53
- title="Bangla Punctuation Restoration (BERT)",
54
- description="Bangla punctuation restoration using sagor-bert-base token classification"
55
  )
56
 
57
  demo.launch()
 
2
  import gradio as gr
3
  from transformers import pipeline
4
 
5
+ MODEL_ID = "Badhon/Bangla_punctuation_restore"
6
+
7
# Maps a predicted entity label to the punctuation mark inserted after the
# labelled span. Labels missing from this table (for example "O") produce no
# insertion in restore_punctuation, which looks marks up with .get().
# NOTE(review): the label names are assumed to match the model's label set;
# confirm against the model config.
LABEL_TO_PUNCT = {
    # Bangla text uses the ordinary comma, not the Arabic comma (U+060C).
    "COMMA": ",",
    # Dari (danda, U+0964) is the Bangla full stop.
    "DARI": "।",
    "QUESTION": "?",
    "EXCLAMATION": "!",
    "SEMICOLON": ";",
    "COLON": ":",
    "HYPHEN": "-",
}
16
 
 
17
  punctuator = pipeline(
18
  "token-classification",
19
+ model=MODEL_ID,
20
  aggregation_strategy="simple",
21
+ token=os.getenv("HF_TOKEN")
22
  )
23
 
24
def restore_punctuation(text: str) -> str:
    """Insert predicted punctuation marks into unpunctuated text.

    Runs the module-level ``punctuator`` pipeline on *text* and, for every
    prediction whose ``entity_group`` has an entry in ``LABEL_TO_PUNCT``,
    inserts the mapped mark immediately after the predicted span (at the
    prediction's ``end`` character offset).

    Args:
        text: The raw input text.

    Returns:
        The text with punctuation inserted, or ``""`` when the input is
        empty, ``None`` or whitespace only.
    """
    # Guard against None as well as blank input before calling the model.
    if not text or not text.strip():
        return ""

    # Collect (insert position, mark) pairs in terms of the ORIGINAL text.
    insertions = []
    for pred in punctuator(text):
        mark = LABEL_TO_PUNCT.get(pred["entity_group"])
        end = pred.get("end")
        # Skip labels without a mapped mark (e.g. "O") and predictions that
        # carry no character offset, which cannot be placed in the text.
        if not mark or end is None:
            continue
        insertions.append((end, mark))

    # Insert left to right; the stable sort keeps the pipeline's order for
    # predictions that share the same end offset.
    insertions.sort(key=lambda item: item[0])

    # Assemble the result in a single pass instead of re-slicing the whole
    # string for every inserted mark.
    pieces = []
    cursor = 0
    for end, mark in insertions:
        pieces.append(text[cursor:end])
        pieces.append(mark)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)
47
 
 
 
 
48
  demo = gr.Interface(
49
  fn=restore_punctuation,
50
+ inputs=gr.Textbox(lines=4, placeholder="বাংলা টেক্সট লিখুন (যতিচিহ্ন ছাড়া)"),
51
  outputs="text",
52
+ title="Bangla Punctuation Restoration",
53
+ description="sagor-bert-base based Bangla punctuation restoration"
54
  )
55
 
56
  demo.launch()