"""Gradio demo: Bangla punctuation restoration with a Hugging Face token-classification model."""

import os
import gradio as gr
from transformers import pipeline

# Hugging Face Hub id of the token-classification model loaded below.
MODEL_ID = "Badhon/Bangla_punctuation_restore"

# Maps a predicted label name to the punctuation mark inserted into the text.
# Labels that are absent from this table (such as "O") insert nothing.
LABEL_TO_PUNCT = {
    # Bangla uses the ordinary comma (U+002C), not the Arabic comma (U+060C).
    "COMMA": ",",
    # Dari (danda): the Bangla sentence terminator.
    "DARI": "।",
    "QUESTION": "?",
    "EXCLAMATION": "!",
    "SEMICOLON": ";",
    "COLON": ":",
    "HYPHEN": "-",
}

# Build the token-classification pipeline once, at module import, so that
# every call to restore_punctuation reuses the same pipeline object.
punctuator = pipeline(
    "token-classification",
    model=MODEL_ID,
    # restore_punctuation reads the "entity_group" and "end" fields of each
    # prediction produced under this aggregation strategy.
    aggregation_strategy="simple",
    # Hub access token taken from the HF_TOKEN environment variable;
    # os.getenv returns None when the variable is not set.
    token=os.getenv("HF_TOKEN")
)

def _insert_punctuation(text, preds):
    """Return *text* with a punctuation mark inserted after each labelled span.

    Args:
        text: The original, unpunctuated text.
        preds: Iterable of prediction dicts; each is expected to carry an
            "entity_group" label and an "end" character offset into *text*.

    Returns:
        A new string in which, for every prediction whose label is a key of
        ``LABEL_TO_PUNCT``, the mapped mark is placed at that prediction's
        "end" offset.
    """
    marks = []
    for pred in preds:
        punct = LABEL_TO_PUNCT.get(pred.get("entity_group"))
        end = pred.get("end")
        # Skip labels with no mapped mark (e.g. "O") and predictions that
        # carry no character offset, which cannot be placed in the text.
        if not punct or end is None:
            continue
        marks.append((end, punct))

    # Insert in ascending offset order regardless of the order predictions
    # arrived in; the sort is stable, so marks sharing an offset keep their
    # original relative order.
    marks.sort(key=lambda mark: mark[0])

    # Collect slices and join once instead of rebuilding the whole string on
    # every insertion.
    pieces = []
    cursor = 0
    for end, punct in marks:
        pieces.append(text[cursor:end])
        pieces.append(punct)
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def restore_punctuation(text: str) -> str:
    """Restore punctuation in *text* using the module-level ``punctuator``.

    Args:
        text: Input text, expected to be without punctuation. ``None``, an
            empty string, or a whitespace-only string is treated as empty
            input.

    Returns:
        The input text with punctuation marks inserted at the predicted
        positions, or ``""`` for empty input.
    """
    # Guard against None as well as blank input so the model is never called
    # with nothing to process.
    if not text or not text.strip():
        return ""

    return _insert_punctuation(text, punctuator(text))

# Gradio interface: one multi-line textbox as input, plain text as output,
# with restore_punctuation as the handler.
demo = gr.Interface(
    fn=restore_punctuation,
    # The placeholder is Bangla for "Write Bangla text (without punctuation)".
    inputs=gr.Textbox(lines=4, placeholder="বাংলা টেক্সট লিখুন (যতিচিহ্ন ছাড়া)"),
    outputs="text",
    title="Bangla Punctuation Restoration",
    description="sagor-bert-base based Bangla punctuation restoration"
)

# Launch the app. This runs whenever the module is executed or imported,
# because there is no __main__ guard.
demo.launch()