hellosindh commited on
Commit
e3be0eb
·
verified ·
1 Parent(s): b27b19b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +207 -0
app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
import sentencepiece as spm
import os
from transformers import RobertaForTokenClassification

# ─── Load model & tokenizer ───────────────────────
# HF Hub repo with the fine-tuned NER head, and the local SentencePiece file.
MODEL_PATH = "hellosindh/sindhi-bert-ner"
SP_MODEL = "sindhi_bpe_32k.model"

print("Loading model...", flush=True)
model = RobertaForTokenClassification.from_pretrained(MODEL_PATH)
model.eval()  # inference only: disables dropout

print("Loading tokenizer...", flush=True)
sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL)

# ─── Tag config ───────────────────────────────────
# class-index → tag string (e.g. "B-PERSON"), taken from the model config.
ID2TAG = model.config.id2label

# NOTE(review): assumes ids 2/3 are BOS/EOS in the SentencePiece vocab — confirm
# against the tokenizer's bos_id()/eos_id().
BOS_ID = 2
EOS_ID = 3

# Entity colors for highlighting; unknown types fall back to the MISC grey.
COLORS = {
    "PERSON": "#FF6B6B",
    "LOCATION": "#4ECDC4",
    "ORGANIZATION": "#45B7D1",
    "DATE_TIME": "#96CEB4",
    "EVENT": "#FFEAA7",
    "LITERARY_WORK": "#DDA0DD",
    "PROFESSION": "#98D8C8",
    "TITLE": "#F7DC6F",
    "LANGUAGE": "#BB8FCE",
    "FIELD": "#85C1E9",
    "LAW": "#F0B27A",
    "GROUP": "#82E0AA",
    "MISC": "#BDC3C7",
}
43
+
44
# ─── Prediction function ──────────────────────────
def predict_ner(sentence):
    """Run NER over one Sindhi sentence and format the results for Gradio.

    Args:
        sentence: Raw, whitespace-separated Sindhi text from the textbox.

    Returns:
        A ``(html, table)`` pair: ``html`` is an RTL paragraph where each
        detected entity is wrapped in a colored ``<mark>`` tag, and ``table``
        is a list of ``[entity_text, entity_type]`` rows for the Dataframe.
        Both are empty for blank input.
    """
    # Local import: the module-level name `html` would clash with the local
    # variable of the same name built below.
    from html import escape

    if not sentence.strip():
        return "", []

    words = sentence.split()

    # ── Tokenize: words → subword ids, remembering which word each id starts.
    input_ids = [BOS_ID]
    word_map = [-1]  # token position → word index; -1 for specials/continuations

    for i, word in enumerate(words):
        subwords = sp.EncodeAsIds(word)
        if not subwords:
            continue  # word produced no pieces; skip it
        for j, sw in enumerate(subwords):
            input_ids.append(sw)
            # Only the first subword of a word carries that word's label.
            word_map.append(i if j == 0 else -1)

    input_ids.append(EOS_ID)
    word_map.append(-1)

    # ── Run model (single-sentence batch, no gradients needed).
    tensor = torch.tensor([input_ids])
    with torch.no_grad():
        logits = model(tensor).logits[0]

    preds = torch.argmax(logits, dim=-1).tolist()

    # ── Word-level predictions: take the first-subword tag for each word.
    word_tags = {}
    for pred, wid in zip(preds, word_map):
        if wid >= 0:
            word_tags[wid] = ID2TAG[pred]

    # ─── Build highlighted HTML, merging B-/I- runs into single spans ───
    html_parts = []
    entities = []

    i = 0
    while i < len(words):
        tag = word_tags.get(i, "O")

        if tag.startswith("B-"):
            entity_type = tag[2:]
            entity_words = [words[i]]

            # Collect I- continuation tokens of the same entity type.
            j = i + 1
            while j < len(words):
                if word_tags.get(j, "O") == f"I-{entity_type}":
                    entity_words.append(words[j])
                    j += 1
                else:
                    break

            entity_text = " ".join(entity_words)
            color = COLORS.get(entity_type, "#BDC3C7")

            # Escape user-supplied text before embedding it in HTML —
            # this output is rendered verbatim by gr.HTML, so unescaped
            # input would allow markup injection.
            safe_text = escape(entity_text)

            html_parts.append(
                f'<mark style="background:{color}; '
                f'padding:2px 6px; border-radius:4px; '
                f'margin:2px; font-weight:bold;" '
                f'title="{entity_type}">'
                f'{safe_text} '
                f'<span style="font-size:0.75em; '
                f'opacity:0.8;">[{entity_type}]</span>'
                f'</mark>'
            )

            entities.append((entity_text, entity_type))
            i = j  # continue after the entity span

        else:
            # Non-entity word: escaped plain text.
            html_parts.append(escape(words[i]))
            i += 1

    html = (
        '<p dir="rtl" style="font-size:1.2em; '
        'line-height:2.5em; text-align:right;">'
        + " ".join(html_parts) + "</p>"
    )

    # Rows for the gr.Dataframe output.
    table = [[text, etype] for text, etype in entities]

    return html, table
132
+
133
# ─── Example sentences ────────────────────────────
# One-element lists: gr.Examples feeds each entry to the single input textbox.
examples = [
    ["شيخ اياز شڪارپور ۾ پيدا ٿيو"],
    ["سنڌ يونيورسٽي حيدرآباد ۾ آھي"],
    ["پاڪستان ڏکڻ ايشيا ۾ آھي"],
    ["ڊاڪٽر محمد علي 1990ع ۾ سنڌ آيو"],
]
140
+
141
# ─── Gradio Interface ─────────────────────────────
with gr.Blocks(theme=gr.themes.Soft(), title="Sindhi NER") as demo:

    # App header and short description (Markdown renders the Sindhi subtitle).
    gr.Markdown("""
    # 🏷️ Sindhi Named Entity Recognizer
    ### سنڌي نالن جي سڃاڻپ جو اوزار

    First Sindhi NER model — trained on 22,777 annotated sentences!

    **Recognizes:** Person · Location · Organization ·
    Date/Time · Event · Literary Work · and 15 more types
    """)

    # Input area: multi-line RTL textbox plus the submit button.
    with gr.Row():
        with gr.Column():
            sentence_box = gr.Textbox(
                label="سنڌي جملو لکو (Enter Sindhi text)",
                placeholder="شيخ اياز شڪارپور ۾ پيدا ٿيو",
                lines=3,
                rtl=True,
            )
            find_btn = gr.Button("🔍 Entities ڳوليو", variant="primary")

    # Outputs: highlighted sentence first, entity table below it.
    with gr.Row():
        html_view = gr.HTML(label="Highlighted Entities")

    with gr.Row():
        results_df = gr.Dataframe(
            headers=["Entity", "Type"],
            label="Entities Found",
            wrap=True,
        )

    # Color legend
    gr.Markdown("""
    ### Legend
    🔴 Person &nbsp; 🟦 Location &nbsp;
    🔵 Organization &nbsp; 🟢 Date/Time &nbsp;
    🟡 Event &nbsp; 🟣 Literary Work
    """)

    gr.Examples(examples=examples, inputs=sentence_box)

    # Button click and textbox Enter both run the same prediction.
    ner_outputs = [html_view, results_df]
    find_btn.click(fn=predict_ner, inputs=sentence_box, outputs=ner_outputs)
    sentence_box.submit(fn=predict_ner, inputs=sentence_box, outputs=ner_outputs)

demo.launch()