File size: 6,172 Bytes
ec86c24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import re

# ---------- Plain & CoNLL (unchanged/safer) ----------
def plain_to_conll(input_file, temp_file):
    """Tokenize whitespace-separated plain text into CoNLL token lines.

    Each token becomes a ``token<TAB>`` line; every input line (including
    blank ones) is followed by an empty separator line marking a sentence
    boundary.
    """
    with open(input_file, "r", encoding="utf-8-sig") as src, \
         open(temp_file, "w", encoding="utf-8") as dst:
        for raw in src:
            # str.split() with no args strips surrounding whitespace too,
            # so a blank input line yields no tokens and only the separator.
            for token in raw.split():
                dst.write(f"{token}\t\n")
            dst.write("\n")

def conll_to_output(conll_file, output_file):
    """Collapse a CoNLL file into one ``token_TAG token_TAG ...`` line per sentence.

    Blank lines mark sentence boundaries; rows with fewer than two
    tab-separated columns are silently skipped.
    """
    with open(conll_file, "r", encoding="utf-8") as src, \
         open(output_file, "w", encoding="utf-8") as dst:
        pairs = []
        for raw in src:
            stripped = raw.rstrip("\n")
            if stripped:
                cols = stripped.split("\t")
                if len(cols) >= 2:
                    pairs.append(f"{cols[0]}_{cols[1]}")
            elif pairs:
                dst.write(" ".join(pairs) + "\n")
                pairs = []
        # Flush a final sentence that was not followed by a blank line.
        if pairs:
            dst.write(" ".join(pairs) + "\n")


# ---------- SSF helpers (robust) ----------
_token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")

def _is_structure(line: str) -> bool:
    s = line.strip()
    return (
        s == "" or
        s.startswith("<") or  # <Sentence ...>, </Sentence>, XML-ish tags
        s.startswith("((") or
        s.startswith("))")
    )

def _parse_token_line(raw: str):
    """

    Return (idx, token, pos, rest, used_tabs) or None if not a token line.

    - Works with tabs or spaces.

    - 'rest' is any trailing columns (e.g., <fs ...>).

    - used_tabs: True if original line used tabs (preserve layout).

    """
    used_tabs = ("\t" in raw)
    parts_tab = raw.split("\t") if used_tabs else None

    if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit():
        idx = parts_tab[0].strip()
        token = parts_tab[1].strip() if len(parts_tab) >= 2 else ""
        pos = parts_tab[2].strip() if len(parts_tab) >= 3 else ""
        rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else ""
        return idx, token, pos, rest, True

    m = _token_line_re.match(raw)
    if m:
        idx, token, pos, rest = m.groups()
        return idx, token, (pos or ""), (rest or ""), False

    return None

def ssf_to_conll(input_file, temp_file):
    """Convert SSF (XML-style or classic bracketed) input into CoNLL tokens.

    Only lines whose first column is an integer are treated as tokens; each
    is written as ``token<TAB>``. Sentence markers (``<Sentence ...>``,
    ``</Sentence>``, ``((`` or ``))``) flush the current sentence with a
    blank separator line. Other structural lines are ignored.
    """
    # All four markers trigger exactly the same boundary handling.
    boundary_prefixes = ("<Sentence", "</Sentence>", "((", "))")

    with open(input_file, "r", encoding="utf-8-sig") as src, \
         open(temp_file, "w", encoding="utf-8") as dst:
        have_tokens = False
        for raw in src:
            line = raw.rstrip("\n")

            if line.strip().startswith(boundary_prefixes):
                # Close the current sentence only if it produced tokens.
                if have_tokens:
                    dst.write("\n")
                have_tokens = False
                continue

            # Remaining blank/structural lines do not break the sentence.
            if _is_structure(line):
                continue

            parsed = _parse_token_line(line)
            if parsed is not None:
                dst.write(f"{parsed[1]}\t\n")
                have_tokens = True

        # Flush a trailing sentence that was never explicitly closed.
        if have_tokens:
            dst.write("\n")

def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """Merge CRF predictions from a CoNLL file back into an SSF file.

    Only the POS (third) column of each token line is replaced; the index,
    token, and any trailing columns (e.g. ``<fs ...>``) are preserved.
    Original tab layout is kept; space-separated token lines are normalized
    to tabs.
    """
    # Collect (token, predicted_pos) pairs, skipping sentence separators.
    predictions = []
    with open(conll_file, "r", encoding="utf-8") as fh:
        for row in fh:
            cols = row.strip().split("\t")
            if len(cols) >= 2:
                predictions.append((cols[0], cols[1]))

    next_pred = 0
    with open(ssf_input_file, "r", encoding="utf-8-sig") as src, \
         open(output_file, "w", encoding="utf-8") as dst:
        for raw in src:
            line = raw.rstrip("\n")

            # Structural lines pass through untouched.
            if _is_structure(line):
                dst.write(line + "\n")
                continue

            parsed = _parse_token_line(line)
            if parsed is None:
                # Not a recognizable token line; copy verbatim.
                dst.write(line + "\n")
                continue

            idx, token, old_pos, rest, used_tabs = parsed

            # Consume the next prediction if any remain; otherwise keep the
            # old tag (or "UNK" when the line had no POS at all).
            if next_pred < len(predictions):
                new_pos = predictions[next_pred][1]
                next_pred += 1
            else:
                new_pos = old_pos or "UNK"

            if used_tabs:
                # Rebuild from the original tab-separated columns so trailing
                # columns stay exactly where they were.
                cols = line.split("\t")
                cols += [""] * (3 - len(cols))
                cols[2] = new_pos
                dst.write("\t".join(cols) + "\n")
            else:
                # Space-separated input: emit a normalized tabbed line.
                fields = [idx, token, new_pos]
                if rest:
                    fields.append(rest)
                dst.write("\t".join(fields) + "\n")