File size: 6,172 Bytes
ec86c24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import re

# ---------- Plain & CoNLL (unchanged/safer) ----------
def plain_to_conll(input_file, temp_file):
    """Tokenize whitespace-separated plain text into CoNLL token lines.

    Each token becomes a ``token<TAB>`` line; every input line (including
    blank ones) is followed by an empty separator line marking a sentence
    boundary.
    """
    with open(input_file, "r", encoding="utf-8-sig") as src, \
         open(temp_file, "w", encoding="utf-8") as dst:
        for raw in src:
            # str.split() with no args strips surrounding whitespace too,
            # so a blank input line yields no tokens and only the separator.
            for token in raw.split():
                dst.write(f"{token}\t\n")
            dst.write("\n")

def conll_to_output(conll_file, output_file):
    """Collapse a CoNLL file into one ``token_TAG token_TAG ...`` line per sentence.

    Blank lines mark sentence boundaries; rows with fewer than two
    tab-separated columns are silently skipped.
    """
    with open(conll_file, "r", encoding="utf-8") as src, \
         open(output_file, "w", encoding="utf-8") as dst:
        pairs = []
        for raw in src:
            stripped = raw.rstrip("\n")
            if stripped:
                cols = stripped.split("\t")
                if len(cols) >= 2:
                    pairs.append(f"{cols[0]}_{cols[1]}")
            elif pairs:
                dst.write(" ".join(pairs) + "\n")
                pairs = []
        # Flush a final sentence that was not followed by a blank line.
        if pairs:
            dst.write(" ".join(pairs) + "\n")


# ---------- SSF helpers (robust) ----------
_token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")

def _is_structure(line: str) -> bool:
    s = line.strip()
    return (
        s == "" or
        s.startswith("<") or  # <Sentence ...>, </Sentence>, XML-ish tags
        s.startswith("((") or
        s.startswith("))")
    )

def _parse_token_line(raw: str):
    """

    Return (idx, token, pos, rest, used_tabs) or None if not a token line.

    - Works with tabs or spaces.

    - 'rest' is any trailing columns (e.g., <fs ...>).

    - used_tabs: True if original line used tabs (preserve layout).

    """
    used_tabs = ("\t" in raw)
    parts_tab = raw.split("\t") if used_tabs else None

    if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit():
        idx = parts_tab[0].strip()
        token = parts_tab[1].strip() if len(parts_tab) >= 2 else ""
        pos = parts_tab[2].strip() if len(parts_tab) >= 3 else ""
        rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else ""
        return idx, token, pos, rest, True

    m = _token_line_re.match(raw)
    if m:
        idx, token, pos, rest = m.groups()
        return idx, token, (pos or ""), (rest or ""), False

    return None

def ssf_to_conll(input_file, temp_file):
    """Convert SSF (XML-style or classic bracketed) input into CoNLL tokens.

    Only lines whose first column is an integer are treated as tokens; each
    is written as ``token<TAB>``. Sentence markers (``<Sentence ...>``,
    ``</Sentence>``, ``((`` or ``))``) flush the current sentence with a
    blank separator line. Other structural lines are ignored.
    """
    # All four markers trigger exactly the same boundary handling.
    boundary_prefixes = ("<Sentence", "</Sentence>", "((", "))")

    with open(input_file, "r", encoding="utf-8-sig") as src, \
         open(temp_file, "w", encoding="utf-8") as dst:
        have_tokens = False
        for raw in src:
            line = raw.rstrip("\n")

            if line.strip().startswith(boundary_prefixes):
                # Close the current sentence only if it produced tokens.
                if have_tokens:
                    dst.write("\n")
                have_tokens = False
                continue

            # Remaining blank/structural lines do not break the sentence.
            if _is_structure(line):
                continue

            parsed = _parse_token_line(line)
            if parsed is not None:
                dst.write(f"{parsed[1]}\t\n")
                have_tokens = True

        # Flush a trailing sentence that was never explicitly closed.
        if have_tokens:
            dst.write("\n")

def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """Merge CRF predictions from a CoNLL file back into an SSF file.

    Only the POS (third) column of each token line is replaced; the index,
    token, and any trailing columns (e.g. ``<fs ...>``) are preserved.
    Original tab layout is kept; space-separated token lines are normalized
    to tabs.
    """
    # Collect (token, predicted_pos) pairs, skipping sentence separators.
    predictions = []
    with open(conll_file, "r", encoding="utf-8") as fh:
        for row in fh:
            cols = row.strip().split("\t")
            if len(cols) >= 2:
                predictions.append((cols[0], cols[1]))

    next_pred = 0
    with open(ssf_input_file, "r", encoding="utf-8-sig") as src, \
         open(output_file, "w", encoding="utf-8") as dst:
        for raw in src:
            line = raw.rstrip("\n")

            # Structural lines pass through untouched.
            if _is_structure(line):
                dst.write(line + "\n")
                continue

            parsed = _parse_token_line(line)
            if parsed is None:
                # Not a recognizable token line; copy verbatim.
                dst.write(line + "\n")
                continue

            idx, token, old_pos, rest, used_tabs = parsed

            # Consume the next prediction if any remain; otherwise keep the
            # old tag (or "UNK" when the line had no POS at all).
            if next_pred < len(predictions):
                new_pos = predictions[next_pred][1]
                next_pred += 1
            else:
                new_pos = old_pos or "UNK"

            if used_tabs:
                # Rebuild from the original tab-separated columns so trailing
                # columns stay exactly where they were.
                cols = line.split("\t")
                cols += [""] * (3 - len(cols))
                cols[2] = new_pos
                dst.write("\t".join(cols) + "\n")
            else:
                # Space-separated input: emit a normalized tabbed line.
                fields = [idx, token, new_pos]
                if rest:
                    fields.append(rest)
                dst.write("\t".join(fields) + "\n")