roymukund committed on
Commit
ec86c24
·
verified ·
1 Parent(s): 11a0c0f

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +115 -95
  2. predict.py +70 -66
  3. utils.py +166 -34
app.py CHANGED
@@ -1,95 +1,115 @@
1
- import gradio as gr
2
- from predict import predict
3
-
4
- # Map dropdown option β†’ model path
5
- LANGUAGE_MODELS = {
6
- "Odia": "models/odia-pos-16K.pkl",
7
- "Punjabi": "models/punjabi-pos.pkl",
8
- "Dogri": "models/dogri-pos.pkl"
9
- }
10
-
11
- def process_file(language, file_obj, file_type):
12
- model_path = LANGUAGE_MODELS.get(language)
13
- if not model_path:
14
- raise ValueError(f"No model available for {language}")
15
-
16
- input_path = file_obj.name
17
- output_path = f"result_{language}.txt"
18
-
19
- result_file = predict(input_path, model_path, file_type, output_path)
20
-
21
- with open(result_file, "r", encoding="utf-8") as f:
22
- preview = f.read(500)
23
-
24
- return result_file, preview
25
-
26
-
27
- def main():
28
-
29
- with gr.Blocks(css="""
30
- .download-box {
31
- background: linear-gradient(90deg, #00c6ff, #0072ff);
32
- padding: 20px;
33
- border-radius: 12px;
34
- text-align: center;
35
- color: white;
36
- font-weight: bold;
37
- font-size: 18px;
38
- box-shadow: 0px 4px 8px rgba(0,0,0,0.1);
39
- }
40
- .download-box .wrap.svelte-1ipelgc {
41
- justify-content: center !important;
42
- }
43
- .block-label {
44
- color: black !important;
45
- font-size: 18px !important;
46
- font-weight: 600 !important;
47
- }
48
- """) as demo:
49
- gr.HTML(
50
- """
51
-
52
- <h1>🌍 Multilingual POS Tagger</h1>
53
- <p>Upload text or CoNLL files and get POS-tagged output</p>
54
-
55
- """
56
- )
57
-
58
- with gr.Row():
59
- with gr.Column(scale=1):
60
- language = gr.Dropdown(
61
- ["Odia", "Punjabi", "Dogri"],
62
- label="🌐 Select Language",
63
- value="Odia"
64
- )
65
- file_in = gr.File(
66
- label="πŸ“‚ Upload Input File",
67
- file_types=[".txt", ".conll"]
68
- )
69
- file_type = gr.Radio(
70
- ["plain", "conll"],
71
- label="πŸ“„ File Type",
72
- value="plain"
73
- )
74
- submit = gr.Button("πŸš€ Run POS Tagger", variant="primary")
75
-
76
- with gr.Column(scale=1):
77
- # gr.HTML("<div class='download-box'>⬇️ Download Your Tagged File Below</div>")
78
- output_file = gr.File(label="Download", file_types=[".txt", ".conll"])
79
- preview_text = gr.Textbox(
80
- label="πŸ‘€ Preview (first 500 chars)",
81
- interactive=False,
82
- lines=15,
83
- placeholder="Output will appear here..."
84
- )
85
-
86
- submit.click(process_file, inputs=[language, file_in, file_type],
87
- outputs=[output_file, preview_text])
88
-
89
- demo.launch()
90
-
91
-
92
-
93
-
94
- if __name__ == "__main__":
95
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from predict import predict
3
+
4
# Map dropdown option → model path (one pickled CRF model per language).
LANGUAGE_MODELS = {
    "Odia": "models/odia-pos-16K.pkl",
    "Punjabi": "models/punjabi-pos.pkl",
    "Dogri": "models/dogri-pos.pkl"
}
10
+
11
def highlight_ssf(text):
    """Wrap already-HTML-escaped SSF text in a <pre> block with colored spans.

    Sentence tags and chunk brackets turn green/italic, <fs ...> feature
    structures orange, and the POS column (index<TAB>token<TAB>POS) blue/bold.
    """
    import re

    green = r"<span style='color:green; font-style:italic;'>\1</span>"
    # Sentence tags arrive escaped (&lt;Sentence ...&gt;) by the caller.
    text = re.sub(r"(&lt;/?Sentence[^&]*&gt;)", green, text)
    text = re.sub(r"(\(\(|\)\))", green, text)

    # Feature-structure tags <fs ...>, also escaped.
    text = re.sub(r"(&lt;fs[^&]*&gt;)", r"<span style='color:darkorange;'>\1</span>", text)

    # Third column of a token line gets the POS styling.
    def _bold_pos(m):
        return (
            m.group(1)
            + "<span style='color:blue; font-weight:bold;'>"
            + m.group(2)
            + "</span>"
            + m.group(3)
        )

    text = re.sub(r"^(\s*\d+\t[^\t]+\t)([^\t]+)(.*)$", _bold_pos, text, flags=re.MULTILINE)

    return f"<pre style='font-family:monospace;'>{text}</pre>"
28
+
29
+
30
def process_file(language, file_obj, file_type):
    """Run the POS tagger on an uploaded file and build an HTML preview.

    Args:
        language: Dropdown choice; must be a key of LANGUAGE_MODELS.
        file_obj: Gradio file wrapper; ``.name`` is the path on disk.
        file_type: "plain", "conll" or "ssf" (forwarded to predict()).

    Returns:
        (path to tagged output file, HTML preview string).

    Raises:
        ValueError: if no model is registered for ``language``.
    """
    import html

    model_path = LANGUAGE_MODELS.get(language)
    if not model_path:
        raise ValueError(f"No model available for {language}")

    input_path = file_obj.name
    output_path = f"result_{language}.txt"

    result_file = predict(input_path, model_path, file_type, output_path)

    with open(result_file, "r", encoding="utf-8") as f:
        preview_raw = f.read(2000)  # first ~2000 chars for preview

    if file_type == "ssf":
        # Escape first so the literal SSF tags survive inside gr.HTML,
        # then colorize.
        preview = highlight_ssf(preview_raw.replace("<", "&lt;").replace(">", "&gt;"))
    else:
        # BUGFIX: escape plain/CoNLL previews too — an unescaped '<' or '&'
        # in the tagged text would otherwise break the gr.HTML rendering.
        preview = f"<pre>{html.escape(preview_raw)}</pre>"

    return result_file, preview
50
+
51
+
52
def main():
    """Build and launch the Gradio UI for the multilingual POS tagger."""
    with gr.Blocks(css="""
        .download-box {
            background: linear-gradient(90deg, #00c6ff, #0072ff);
            padding: 20px;
            border-radius: 12px;
            text-align: center;
            color: white;
            font-weight: bold;
            font-size: 18px;
            box-shadow: 0px 4px 8px rgba(0,0,0,0.1);
        }
        .download-box .wrap.svelte-1ipelgc {
            justify-content: center !important;
        }
        .block-label {
            color: black !important;
            font-size: 18px !important;
            font-weight: 600 !important;
        }
    """) as demo:
        gr.HTML(
            """

            <h1>🌍 Multilingual POS Tagger</h1>
            <p>Upload text, CoNLL or SSF files and get POS-tagged output</p>

            """
        )

        with gr.Row():
            with gr.Column(scale=1):
                language = gr.Dropdown(
                    ["Odia", "Punjabi", "Dogri"],
                    label="🌐 Select Language",
                    value="Odia"
                )
                # BUGFIX: accept .ssf uploads — the file-type radio below
                # offers "ssf", but the picker previously rejected such files.
                file_in = gr.File(
                    label="📂 Upload Input File",
                    file_types=[".txt", ".conll", ".ssf"]
                )
                file_type = gr.Radio(
                    ["plain", "conll", "ssf"],
                    label="📄 File Type",
                    value="plain"
                )
                submit = gr.Button("🚀 Run POS Tagger", variant="primary")

            with gr.Column(scale=1):
                output_file = gr.File(label="⬇️ Download Tagged File", file_types=[".txt", ".conll", ".ssf"])
                preview_text = gr.HTML(label="👀 Preview (first lines)")

        submit.click(process_file, inputs=[language, file_in, file_type],
                     outputs=[output_file, preview_text])

    demo.launch()
110
+
111
+
112
+
113
+
114
# Script entry point: launch the Gradio app.
if __name__ == "__main__":
    main()
predict.py CHANGED
@@ -1,66 +1,70 @@
1
- import _pickle as cPickle
2
- from utils import plain_to_conll, conll_to_output
3
- import os
4
-
5
- def word_features(sent, i):
6
- word = sent[i][0]
7
- if i == 0: prevword = '<START>'
8
- else: prevword = sent[i - 1][0]
9
- if i <= 1: prev2word = '<START>'
10
- else: prev2word = sent[i - 2][0]
11
- if i == len(sent) - 1: nextword = '<END>'
12
- else: nextword = sent[i + 1][0]
13
-
14
- return {
15
- 'word': word,
16
- 'prevword': prevword,
17
- 'nextword': nextword,
18
- 'suff_1': word[-1:], 'suff_2': word[-2:], 'suff_3': word[-3:], 'suff_4': word[-4:],
19
- 'pref_1': word[:1], 'pref_2': word[:2], 'pref_3': word[:3], 'pref_4': word[:4],
20
- 'prev2word': prev2word
21
- }
22
-
23
- def sent2features(sent):
24
- return [word_features(sent, i) for i in range(len(sent))]
25
-
26
- def load_and_predict(input_file, model, output_file):
27
- with open(model, 'rb') as fid:
28
- crf = cPickle.load(fid)
29
-
30
- test_data = []
31
- with open(input_file, encoding="utf8") as fr:
32
- temp = []
33
- for line in fr:
34
- line = line.strip()
35
- if line != "":
36
- chunk = (line.split("\t")[0], '')
37
- temp.append(chunk)
38
- else:
39
- if temp:
40
- test_data.append(temp)
41
- temp = []
42
-
43
- X_test1 = [sent2features(s) for s in test_data]
44
- y_pred1 = crf.predict(X_test1)
45
-
46
- with open(output_file, 'w', encoding="utf-8") as f:
47
- for i in range(len(test_data)):
48
- for j in range(len(test_data[i])):
49
- f.write(test_data[i][j][0] + "\t" + y_pred1[i][j] + "\n")
50
- f.write("\n")
51
- return output_file
52
-
53
-
54
- def predict(input_file, model, file_type, output_file="output.txt"):
55
- temp_conll = "temp_input.conll"
56
- tagged_conll = "tagged_output.conll"
57
-
58
- if file_type == "plain":
59
- plain_to_conll(input_file, temp_conll)
60
- load_and_predict(temp_conll, model, tagged_conll)
61
- conll_to_output(tagged_conll, output_file)
62
- else:
63
- load_and_predict(input_file, model, tagged_conll)
64
- os.replace(tagged_conll, output_file)
65
-
66
- return output_file
 
 
 
 
 
1
+ import _pickle as cPickle
2
+ from utils import plain_to_conll, conll_to_output, ssf_to_conll, conll_to_ssf
3
+ import os
4
+
5
def word_features(sent, i):
    """Build the CRF feature dict for token ``i`` of ``sent``.

    ``sent`` is a list of (word, tag) pairs; features cover the token itself,
    a two-word context window, and character prefixes/suffixes of length 1-4.
    """
    word = sent[i][0]
    prevword = sent[i - 1][0] if i > 0 else '<START>'
    prev2word = sent[i - 2][0] if i > 1 else '<START>'
    nextword = sent[i + 1][0] if i < len(sent) - 1 else '<END>'

    features = {
        'word': word,
        'prevword': prevword,
        'nextword': nextword,
        'prev2word': prev2word,
    }
    # Character n-grams; slicing handles words shorter than n gracefully.
    for n in range(1, 5):
        features[f'suff_{n}'] = word[-n:]
        features[f'pref_{n}'] = word[:n]
    return features
22
+
23
def sent2features(sent):
    """Return the per-token feature dicts for a whole sentence."""
    return [word_features(sent, idx) for idx, _ in enumerate(sent)]
25
+
26
def load_and_predict(input_file, model, output_file):
    """Tag a CoNLL-style token file with a pickled CRF model.

    Args:
        input_file: path to a file whose first tab-column holds one token per
            line, with blank lines separating sentences.
        model: path to a pickled CRF model.
        output_file: path that receives "token<TAB>tag" lines, blank line
            between sentences.

    Returns:
        output_file (for chaining).
    """
    # NOTE(review): unpickling executes arbitrary code — only load model
    # files from a trusted source.
    with open(model, 'rb') as fid:
        crf = cPickle.load(fid)

    # Collect sentences as lists of (token, '') pairs.
    test_data = []
    with open(input_file, encoding="utf8") as fr:
        temp = []
        for line in fr:
            line = line.strip()
            if line != "":
                temp.append((line.split("\t")[0], ''))
            elif temp:
                test_data.append(temp)
                temp = []
        # BUGFIX: keep the final sentence even when the file does not end
        # with a trailing blank line (previously it was silently dropped).
        if temp:
            test_data.append(temp)

    X_test = [sent2features(s) for s in test_data]
    y_pred = crf.predict(X_test)

    with open(output_file, 'w', encoding="utf-8") as f:
        for sent, tags in zip(test_data, y_pred):
            for (token, _), tag in zip(sent, tags):
                f.write(token + "\t" + tag + "\n")
            f.write("\n")
    return output_file
52
+
53
+
54
def predict(input_file, model, file_type, output_file="output.txt"):
    """Tag ``input_file`` with the given model and write ``output_file``.

    Args:
        input_file: path to the user's uploaded file.
        model: path to a pickled CRF model.
        file_type: "plain" (raw sentences), "ssf" (Shakti Standard Format),
            or anything else for a ready-made CoNLL token file.
        output_file: destination path.

    Returns:
        output_file (for chaining).
    """
    import uuid

    # BUGFIX: unique intermediate names — the previous fixed names collided
    # when several requests (e.g. concurrent Gradio users) ran at once.
    uid = uuid.uuid4().hex
    temp_conll = f"temp_input_{uid}.conll"
    tagged_conll = f"tagged_output_{uid}.conll"

    try:
        if file_type == "plain":
            plain_to_conll(input_file, temp_conll)
            load_and_predict(temp_conll, model, tagged_conll)
            conll_to_output(tagged_conll, output_file)
        elif file_type == "ssf":
            ssf_to_conll(input_file, temp_conll)
            load_and_predict(temp_conll, model, tagged_conll)
            conll_to_ssf(tagged_conll, input_file, output_file)
        else:
            # Input is already CoNLL; the tagged file becomes the output.
            load_and_predict(input_file, model, tagged_conll)
            os.replace(tagged_conll, output_file)
    finally:
        # Best-effort cleanup of intermediates (tagged_conll may already
        # have been moved to output_file in the CoNLL branch).
        for path in (temp_conll, tagged_conll):
            try:
                os.remove(path)
            except OSError:
                pass

    return output_file
utils.py CHANGED
@@ -1,34 +1,166 @@
1
- import re
2
-
3
- def plain_to_conll(input_file, temp_file):
4
- """Convert plain sentences (one per line) into CoNLL format with dummy tags."""
5
- with open(input_file, "r", encoding="utf-8") as f_in, open(temp_file, "w", encoding="utf-8") as f_out:
6
- for line in f_in:
7
- line = line.strip()
8
- if not line:
9
- continue
10
- # split by whitespace only (keeps Unicode tokens intact)
11
- tokens = line.split()
12
- for tok in tokens:
13
- f_out.write(f"{tok}\t\n") # only token, no label
14
- f_out.write("\n")
15
-
16
-
17
- def conll_to_output(conll_file, output_file):
18
- """Convert conll output to token_POS sentences."""
19
- with open(conll_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
20
- sentence = []
21
- for line in f_in:
22
- line = line.strip()
23
- if not line:
24
- if sentence:
25
- f_out.write(" ".join(sentence) + "\n")
26
- sentence = []
27
- continue
28
- parts = line.split("\t")
29
- if len(parts) >= 2:
30
- token, pos = parts[0], parts[1]
31
- sentence.append(f"{token}||{pos}")
32
- if sentence:
33
- f_out.write(" ".join(sentence) + "\n")
34
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ # ---------- Plain & CoNLL (unchanged/safer) ----------
4
def plain_to_conll(input_file, temp_file):
    """Turn plain text (one sentence per line) into untagged CoNLL tokens."""
    with open(input_file, "r", encoding="utf-8-sig") as src, \
            open(temp_file, "w", encoding="utf-8") as dst:
        for raw in src:
            stripped = raw.strip()
            if not stripped:
                # Keep blank lines as sentence separators.
                dst.write("\n")
                continue
            # Whitespace tokenization keeps Unicode tokens intact.
            dst.writelines(f"{tok}\t\n" for tok in stripped.split())
            dst.write("\n")
14
+
15
def conll_to_output(conll_file, output_file):
    """Render tagged CoNLL as one "token_POS token_POS ..." line per sentence."""
    with open(conll_file, "r", encoding="utf-8") as src, \
            open(output_file, "w", encoding="utf-8") as dst:
        tokens = []
        for raw in src:
            raw = raw.rstrip("\n")
            if raw:
                cols = raw.split("\t")
                # Lines without a tag column are silently skipped.
                if len(cols) >= 2:
                    tokens.append(f"{cols[0]}_{cols[1]}")
            elif tokens:
                dst.write(" ".join(tokens) + "\n")
                tokens = []
        # Flush a trailing sentence with no final blank line.
        if tokens:
            dst.write(" ".join(tokens) + "\n")
30
+
31
+
32
+ # ---------- SSF helpers (robust) ----------
33
# SSF token line: optional indent, integer index, token, optional POS,
# optional trailing columns (e.g. a feature structure).
_token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")
34
+
35
+ def _is_structure(line: str) -> bool:
36
+ s = line.strip()
37
+ return (
38
+ s == "" or
39
+ s.startswith("<") or # <Sentence ...>, </Sentence>, XML-ish tags
40
+ s.startswith("((") or
41
+ s.startswith("))")
42
+ )
43
+
44
def _parse_token_line(raw: str):
    """Parse one SSF token line.

    Returns (idx, token, pos, rest, used_tabs) or None when the line is not
    a token line. Handles both tab- and space-separated layouts; ``rest``
    holds any trailing columns (e.g. <fs ...>) and ``used_tabs`` records the
    original layout so output can preserve it.
    """
    if "\t" in raw:
        cols = raw.split("\t")
        # Tab layout only counts when the first column is an integer index;
        # otherwise fall through to the whitespace regex below.
        if len(cols) >= 2 and cols[0].strip().isdigit():
            idx = cols[0].strip()
            token = cols[1].strip()
            pos = cols[2].strip() if len(cols) >= 3 else ""
            rest = "\t".join(cols[3:]) if len(cols) >= 4 else ""
            return idx, token, pos, rest, True

    m = _token_line_re.match(raw)
    if m is None:
        return None
    idx, token, pos, rest = m.groups()
    return idx, token, pos or "", rest or "", False
67
+
68
def ssf_to_conll(input_file, temp_file):
    """Flatten SSF (XML-style or classic bracketed) into CoNLL tokens.

    Only lines whose first column is an integer become tokens; a blank line
    is written at every sentence boundary.
    """
    boundary_prefixes = ("<Sentence", "</Sentence>", "((", "))")
    with open(input_file, "r", encoding="utf-8-sig") as src, \
            open(temp_file, "w", encoding="utf-8") as dst:
        in_sentence = False
        for raw in src:
            line = raw.rstrip("\n")
            stripped = line.strip()

            # Sentence delimiters flush the current sentence with one blank line.
            if stripped.startswith(boundary_prefixes):
                if in_sentence:
                    dst.write("\n")
                in_sentence = False
                continue

            # Remaining structural/blank lines are ignored without flushing.
            if _is_structure(line):
                continue

            parsed = _parse_token_line(line)
            if parsed:
                dst.write(f"{parsed[1]}\t\n")
                in_sentence = True

        # Close a trailing sentence that had no explicit end marker.
        if in_sentence:
            dst.write("\n")
109
+
110
def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """
    Merge CRF predictions back into the original SSF file.

    - Replaces only the POS (3rd column), preserving index, token, and any
      trailing columns (e.g., <fs ...>).
    - Preserves original tabs-vs-spaces layout when possible.
    - NOTE(review): predictions are consumed strictly in order; this assumes
      the CoNLL file was produced from the same SSF input (same token count
      and order) — verify against ssf_to_conll's output.
    """
    # Gather (token, pos) predictions, ignoring blank sentence separators.
    preds = []
    with open(conll_file, "r", encoding="utf-8") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) >= 2:
                preds.append((parts[0], parts[1]))  # (token, pos)

    p = 0  # cursor into preds; advances once per recognized token line
    with open(ssf_input_file, "r", encoding="utf-8-sig") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
        for raw in f_in:
            line = raw.rstrip("\n")

            # Structural lines (tags, brackets, blanks) pass through untouched.
            if _is_structure(line):
                f_out.write(line + "\n")
                continue

            parsed = _parse_token_line(line)
            if not parsed:
                # Not a recognizable token line; write as-is.
                f_out.write(line + "\n")
                continue

            idx, token, old_pos, rest, used_tabs = parsed

            # Take the next prediction if available; otherwise keep the old
            # POS (or "UNK" when the line had none).
            if p < len(preds):
                _, new_pos = preds[p]
                p += 1
            else:
                new_pos = old_pos if old_pos else "UNK"

            if used_tabs:
                # Preserve the original tabbed structure: only column 3 changes.
                parts = line.split("\t")
                # Ensure at least 3 columns before assigning the POS slot.
                while len(parts) < 3:
                    parts.append("")
                parts[2] = new_pos
                out = "\t".join(parts)
            else:
                # Space-separated input is normalized to tabs on output.
                out = f"{idx}\t{token}\t{new_pos}"
                if rest:
                    out += f"\t{rest}"

            f_out.write(out + "\n")