Upload 3 files
Browse files
- app.py +115 -95
- predict.py +70 -66
- utils.py +166 -34
app.py
CHANGED
@@ -1,95 +1,115 @@
- import gradio as gr
- from predict import predict
-
- # Map dropdown option → model path
- LANGUAGE_MODELS = {
-     "Odia": "models/odia-pos-16K.pkl",
-     "Punjabi": "models/punjabi-pos.pkl",
-     "Dogri": "models/dogri-pos.pkl"
- }
-
- def
…
- }
…
+ import gradio as gr
+ from predict import predict
+
+ # Map dropdown option → model path
+ LANGUAGE_MODELS = {
+     "Odia": "models/odia-pos-16K.pkl",
+     "Punjabi": "models/punjabi-pos.pkl",
+     "Dogri": "models/dogri-pos.pkl"
+ }
+
+ def highlight_ssf(text):
+     """Add simple HTML highlighting for SSF structure and POS tags."""
+     import re
+
+     # Highlight sentence tags <Sentence ...> and brackets
+     text = re.sub(r"(&lt;/?Sentence[^&]*&gt;)", r"<span style='color:green; font-style:italic;'>\1</span>", text)
+     text = re.sub(r"(\(\(|\)\))", r"<span style='color:green; font-style:italic;'>\1</span>", text)
+
+     # Highlight <fs ...>
+     text = re.sub(r"(&lt;fs[^&]*&gt;)", r"<span style='color:darkorange;'>\1</span>", text)
+
+     # Highlight POS tags (3rd column) → blue & bold
+     def repl_pos(match):
+         return f"{match.group(1)}<span style='color:blue; font-weight:bold;'>{match.group(2)}</span>{match.group(3)}"
+     text = re.sub(r"^(\s*\d+\t[^\t]+\t)([^\t]+)(.*)$", repl_pos, text, flags=re.MULTILINE)
+
+     return f"<pre style='font-family:monospace;'>{text}</pre>"
+
+
+ def process_file(language, file_obj, file_type):
+     model_path = LANGUAGE_MODELS.get(language)
+     if not model_path:
+         raise ValueError(f"No model available for {language}")
+
+     input_path = file_obj.name
+     output_path = f"result_{language}.txt"
+
+     result_file = predict(input_path, model_path, file_type, output_path)
+
+     with open(result_file, "r", encoding="utf-8") as f:
+         preview_raw = f.read(2000)  # first ~2000 chars for preview
+
+     # If SSF, escape angle brackets and apply highlighting
+     if file_type == "ssf":
+         preview = highlight_ssf(preview_raw.replace("<", "&lt;").replace(">", "&gt;"))
+     else:
+         preview = f"<pre>{preview_raw}</pre>"
+
+     return result_file, preview
+
+
+ def main():
+
+     with gr.Blocks(css="""
+         .download-box {
+             background: linear-gradient(90deg, #00c6ff, #0072ff);
+             padding: 20px;
+             border-radius: 12px;
+             text-align: center;
+             color: white;
+             font-weight: bold;
+             font-size: 18px;
+             box-shadow: 0px 4px 8px rgba(0,0,0,0.1);
+         }
+         .download-box .wrap.svelte-1ipelgc {
+             justify-content: center !important;
+         }
+         .block-label {
+             color: black !important;
+             font-size: 18px !important;
+             font-weight: 600 !important;
+         }
+     """) as demo:
+         gr.HTML(
+             """
+             <h1>Multilingual POS Tagger</h1>
+             <p>Upload text or CoNLL files and get POS-tagged output</p>
+             """
+         )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 language = gr.Dropdown(
+                     ["Odia", "Punjabi", "Dogri"],
+                     label="Select Language",
+                     value="Odia"
+                 )
+                 file_in = gr.File(
+                     label="Upload Input File",
+                     file_types=[".txt", ".conll"]
+                 )
+                 file_type = gr.Radio(
+                     ["plain", "conll", "ssf"],
+                     label="File Type",
+                     value="plain"
+                 )
+                 submit = gr.Button("Run POS Tagger", variant="primary")
+
+             with gr.Column(scale=1):
+                 output_file = gr.File(label="Download Tagged File", file_types=[".txt", ".conll", ".ssf"])
+                 preview_text = gr.HTML(label="Preview (first lines)")
+
+         submit.click(process_file, inputs=[language, file_in, file_type],
+                      outputs=[output_file, preview_text])
+
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     main()
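For reference, a minimal sketch of the preview path in isolation. The SSF fragment below is invented for illustration; highlight_ssf and the escaping step are the ones defined in app.py:

    # Toy input; real files come from the Gradio upload.
    from app import highlight_ssf

    sample = "<Sentence id='1'>\n1\tmora\tPRP\t<fs af='mora'>\n</Sentence>"
    # process_file() escapes angle brackets before highlighting, which is why
    # the regexes in highlight_ssf() match the entities &lt; ... &gt;.
    escaped = sample.replace("<", "&lt;").replace(">", "&gt;")
    print(highlight_ssf(escaped))  # <pre> block with colored spans around tags and POS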
predict.py
CHANGED
@@ -1,66 +1,70 @@
- import _pickle as cPickle
- from utils import plain_to_conll, conll_to_output
- import os
-
- def word_features(sent, i):
-     word = sent[i][0]
-     if i == 0: prevword = '<START>'
-     else: prevword = sent[i - 1][0]
-     if i <= 1: prev2word = '<START>'
-     else: prev2word = sent[i - 2][0]
-     if i == len(sent) - 1: nextword = '<END>'
-     else: nextword = sent[i + 1][0]
-
-     return {
-         'word': word,
-         'prevword': prevword,
-         'nextword': nextword,
-         'suff_1': word[-1:], 'suff_2': word[-2:], 'suff_3': word[-3:], 'suff_4': word[-4:],
-         'pref_1': word[:1], 'pref_2': word[:2], 'pref_3': word[:3], 'pref_4': word[:4],
-         'prev2word': prev2word
-     }
-
- def sent2features(sent):
-     return [word_features(sent, i) for i in range(len(sent))]
-
- def load_and_predict(input_file, model, output_file):
-     with open(model, 'rb') as fid:
-         crf = cPickle.load(fid)
-
-     test_data = []
-     with open(input_file, encoding="utf8") as fr:
-         temp = []
-         for line in fr:
-             line = line.strip()
-             if line != "":
-                 chunk = (line.split("\t")[0], '')
-                 temp.append(chunk)
-             else:
-                 if temp:
-                     test_data.append(temp)
-                     temp = []
-
-     X_test1 = [sent2features(s) for s in test_data]
-     y_pred1 = crf.predict(X_test1)
-
-     with open(output_file, 'w', encoding="utf-8") as f:
-         for i in range(len(test_data)):
-             for j in range(len(test_data[i])):
-                 f.write(test_data[i][j][0] + "\t" + y_pred1[i][j] + "\n")
-             f.write("\n")
-     return output_file
-
-
- def predict(input_file, model, file_type, output_file="output.txt"):
-     temp_conll = "temp_input.conll"
-     tagged_conll = "tagged_output.conll"
-
-     if file_type == "plain":
-         plain_to_conll(input_file, temp_conll)
-         load_and_predict(temp_conll, model, tagged_conll)
-         conll_to_output(tagged_conll, output_file)
-
+ import _pickle as cPickle
+ from utils import plain_to_conll, conll_to_output, ssf_to_conll, conll_to_ssf
+ import os
+
+ def word_features(sent, i):
+     word = sent[i][0]
+     if i == 0: prevword = '<START>'
+     else: prevword = sent[i - 1][0]
+     if i <= 1: prev2word = '<START>'
+     else: prev2word = sent[i - 2][0]
+     if i == len(sent) - 1: nextword = '<END>'
+     else: nextword = sent[i + 1][0]
+
+     return {
+         'word': word,
+         'prevword': prevword,
+         'nextword': nextword,
+         'suff_1': word[-1:], 'suff_2': word[-2:], 'suff_3': word[-3:], 'suff_4': word[-4:],
+         'pref_1': word[:1], 'pref_2': word[:2], 'pref_3': word[:3], 'pref_4': word[:4],
+         'prev2word': prev2word
+     }
+
+ def sent2features(sent):
+     return [word_features(sent, i) for i in range(len(sent))]
+
+ def load_and_predict(input_file, model, output_file):
+     with open(model, 'rb') as fid:
+         crf = cPickle.load(fid)
+
+     test_data = []
+     with open(input_file, encoding="utf8") as fr:
+         temp = []
+         for line in fr:
+             line = line.strip()
+             if line != "":
+                 chunk = (line.split("\t")[0], '')
+                 temp.append(chunk)
+             else:
+                 if temp:
+                     test_data.append(temp)
+                     temp = []
+
+     X_test1 = [sent2features(s) for s in test_data]
+     y_pred1 = crf.predict(X_test1)
+
+     with open(output_file, 'w', encoding="utf-8") as f:
+         for i in range(len(test_data)):
+             for j in range(len(test_data[i])):
+                 f.write(test_data[i][j][0] + "\t" + y_pred1[i][j] + "\n")
+             f.write("\n")
+     return output_file
+
+
+ def predict(input_file, model, file_type, output_file="output.txt"):
+     temp_conll = "temp_input.conll"
+     tagged_conll = "tagged_output.conll"
+
+     if file_type == "plain":
+         plain_to_conll(input_file, temp_conll)
+         load_and_predict(temp_conll, model, tagged_conll)
+         conll_to_output(tagged_conll, output_file)
+     elif file_type == "ssf":
+         ssf_to_conll(input_file, temp_conll)
+         load_and_predict(temp_conll, model, tagged_conll)
+         conll_to_ssf(tagged_conll, input_file, output_file)
+     else:
+         load_and_predict(input_file, model, tagged_conll)
+         os.replace(tagged_conll, output_file)
+
+     return output_file
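The same pipeline can be driven without the UI. A minimal sketch, assuming a hypothetical plain-text file sample.txt (one sentence per line) and one of the model paths from app.py's LANGUAGE_MODELS:

    from predict import predict

    # "plain" route: tokenize -> temp CoNLL -> CRF tags -> "token_TAG ..." lines
    result = predict("sample.txt", "models/odia-pos-16K.pkl", "plain", "tagged.txt")
    with open(result, encoding="utf-8") as f:
        print(f.read())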
utils.py
CHANGED
@@ -1,34 +1,166 @@
- import re
…
- with open(input_file, "r", encoding="utf-8") as f_in, open(temp_file, "w", encoding="utf-8") as f_out:
-     for line in f_in:
-         line = line.strip()
-         if not line:
…
+ import re
+
+ # ---------- Plain & CoNLL (unchanged/safer) ----------
+ def plain_to_conll(input_file, temp_file):
+     with open(input_file, "r", encoding="utf-8-sig") as f_in, open(temp_file, "w", encoding="utf-8") as f_out:
+         for line in f_in:
+             line = line.strip()
+             if not line:
+                 f_out.write("\n")
+                 continue
+             for tok in line.split():
+                 f_out.write(f"{tok}\t\n")
+             f_out.write("\n")
+
+ def conll_to_output(conll_file, output_file):
+     with open(conll_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
+         sent = []
+         for line in f_in:
+             line = line.rstrip("\n")
+             if not line:
+                 if sent:
+                     f_out.write(" ".join(sent) + "\n")
+                     sent = []
+                 continue
+             parts = line.split("\t")
+             if len(parts) >= 2:
+                 sent.append(f"{parts[0]}_{parts[1]}")
+         if sent:
+             f_out.write(" ".join(sent) + "\n")
+
+
+ # ---------- SSF helpers (robust) ----------
+ _token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")
+
+ def _is_structure(line: str) -> bool:
+     s = line.strip()
+     return (
+         s == "" or
+         s.startswith("<") or   # <Sentence ...>, </Sentence>, XML-ish tags
+         s.startswith("((") or
+         s.startswith("))")
+     )
+
+ def _parse_token_line(raw: str):
+     """
+     Return (idx, token, pos, rest, used_tabs) or None if not a token line.
+     - Works with tabs or spaces.
+     - 'rest' is any trailing columns (e.g., <fs ...>).
+     - used_tabs: True if original line used tabs (preserve layout).
+     """
+     used_tabs = ("\t" in raw)
+     parts_tab = raw.split("\t") if used_tabs else None
+
+     if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit():
+         idx = parts_tab[0].strip()
+         token = parts_tab[1].strip() if len(parts_tab) >= 2 else ""
+         pos = parts_tab[2].strip() if len(parts_tab) >= 3 else ""
+         rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else ""
+         return idx, token, pos, rest, True
+
+     m = _token_line_re.match(raw)
+     if m:
+         idx, token, pos, rest = m.groups()
+         return idx, token, (pos or ""), (rest or ""), False
+
+     return None
+
+ def ssf_to_conll(input_file, temp_file):
+     """
+     Convert SSF (XML-style or classic) into CoNLL tokens.
+     - Only lines whose first column is an integer are treated as tokens.
+     - Writes a blank line at sentence boundaries.
+     """
+     with open(input_file, "r", encoding="utf-8-sig") as f_in, open(temp_file, "w", encoding="utf-8") as f_out:
+         wrote_any_in_sentence = False
+         for raw in f_in:
+             line = raw.rstrip("\n")
+
+             # Sentence boundaries: start/end tags or classic brackets trigger newline
+             if line.strip().startswith("<Sentence"):
+                 if wrote_any_in_sentence:
+                     f_out.write("\n")
+                     wrote_any_in_sentence = False
+                 continue
+             if line.strip().startswith("</Sentence>") or line.strip().startswith("))"):
+                 if wrote_any_in_sentence:
+                     f_out.write("\n")
+                     wrote_any_in_sentence = False
+                 continue
+             if line.strip().startswith("(("):
+                 if wrote_any_in_sentence:
+                     f_out.write("\n")
+                     wrote_any_in_sentence = False
+                 continue
+
+             if _is_structure(line):
+                 # blank or structural lines: ignore but do not break sentence unless handled above
+                 continue
+
+             parsed = _parse_token_line(line)
+             if parsed:
+                 _, token, _, _, _ = parsed
+                 f_out.write(f"{token}\t\n")
+                 wrote_any_in_sentence = True
+
+         # ensure trailing sentence closure gets a newline
+         if wrote_any_in_sentence:
+             f_out.write("\n")
+
+ def conll_to_ssf(conll_file, ssf_input_file, output_file):
+     """
+     Merge CRF predictions back into SSF.
+     - Replaces only the POS (3rd column), preserving index, token, and any trailing cols (e.g., <fs ...>).
+     - Preserves original tabs vs spaces layout when possible.
+     """
+     # Gather predictions (ignore blank lines)
+     preds = []
+     with open(conll_file, "r", encoding="utf-8") as f_in:
+         for line in f_in:
+             line = line.strip()
+             if not line:
+                 continue
+             parts = line.split("\t")
+             if len(parts) >= 2:
+                 preds.append((parts[0], parts[1]))  # (token, pos)
+
+     p = 0
+     with open(ssf_input_file, "r", encoding="utf-8-sig") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
+         for raw in f_in:
+             line = raw.rstrip("\n")
+
+             # Write structural lines untouched
+             if _is_structure(line):
+                 f_out.write(line + "\n")
+                 continue
+
+             parsed = _parse_token_line(line)
+             if not parsed:
+                 # Not a recognizable token line; write as-is
+                 f_out.write(line + "\n")
+                 continue
+
+             idx, token, old_pos, rest, used_tabs = parsed
+
+             # If we have a prediction, replace POS; otherwise keep old POS
+             if p < len(preds):
+                 _, new_pos = preds[p]
+                 p += 1
+             else:
+                 new_pos = old_pos if old_pos else "UNK"
+
+             if used_tabs:
+                 # preserve original tabbed structure
+                 parts = line.split("\t")
+                 # Ensure at least 3 columns
+                 while len(parts) < 3:
+                     parts.append("")
+                 parts[2] = new_pos
+                 out = "\t".join(parts)
+             else:
+                 # Normalize to tabs for clarity if original used spaces
+                 out = f"{idx}\t{token}\t{new_pos}"
+                 if rest:
+                     out += f"\t{rest}"
+
+             f_out.write(out + "\n")
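A toy round trip through the new SSF helpers; the file contents below are illustrative, and the XX/PRP/NN tags stand in for whatever the CRF predicts:

    from utils import ssf_to_conll, conll_to_ssf

    with open("in.ssf", "w", encoding="utf-8") as f:
        f.write("<Sentence id='1'>\n"
                "1\tmora\tXX\t<fs af='mora'>\n"
                "2\tghara\tXX\t<fs af='ghara'>\n"
                "</Sentence>\n")

    ssf_to_conll("in.ssf", "in.conll")  # token-per-line CoNLL: "mora", "ghara"
    # Stand-in for load_and_predict()'s tagged output:
    with open("tagged.conll", "w", encoding="utf-8") as f:
        f.write("mora\tPRP\nghara\tNN\n\n")
    conll_to_ssf("tagged.conll", "in.ssf", "out.ssf")
    # out.ssf matches in.ssf except the POS column is now PRP / NN,
    # with the <fs ...> column preserved.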