import re
# ---------- Plain & CoNLL (unchanged/safer) ----------
def plain_to_conll(input_file, temp_file):
    """Convert plain text (one sentence per line) to CoNLL token format.

    Each whitespace-separated token is written on its own line followed by
    a tab and an empty label column; a blank line is emitted after every
    sentence. Blank input lines are passed through as sentence boundaries.
    """
    with open(input_file, "r", encoding="utf-8-sig") as src, \
            open(temp_file, "w", encoding="utf-8") as dst:
        for raw in src:
            stripped = raw.strip()
            if not stripped:
                dst.write("\n")
                continue
            dst.writelines(f"{tok}\t\n" for tok in stripped.split())
            dst.write("\n")
def conll_to_output(conll_file, output_file):
    """Render a CoNLL file as one sentence per line of token_POS pairs.

    Tab-separated lines with at least two columns become "token_pos";
    lines with fewer columns are skipped. A blank line flushes the current
    sentence; a trailing sentence without a final blank line is also flushed.
    """
    with open(conll_file, "r", encoding="utf-8") as src, \
            open(output_file, "w", encoding="utf-8") as dst:
        tokens = []
        for raw in src:
            stripped = raw.rstrip("\n")
            if not stripped:
                if tokens:
                    dst.write(" ".join(tokens) + "\n")
                    tokens = []
                continue
            cols = stripped.split("\t")
            if len(cols) >= 2:
                tokens.append(f"{cols[0]}_{cols[1]}")
        if tokens:
            dst.write(" ".join(tokens) + "\n")
# ---------- SSF helpers (robust) ----------
_token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")
def _is_structure(line: str) -> bool:
s = line.strip()
return (
s == "" or
s.startswith("<") or # , , XML-ish tags
s.startswith("((") or
s.startswith("))")
)
def _parse_token_line(raw: str):
"""
Return (idx, token, pos, rest, used_tabs) or None if not a token line.
- Works with tabs or spaces.
- 'rest' is any trailing columns (e.g., ).
- used_tabs: True if original line used tabs (preserve layout).
"""
used_tabs = ("\t" in raw)
parts_tab = raw.split("\t") if used_tabs else None
if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit():
idx = parts_tab[0].strip()
token = parts_tab[1].strip() if len(parts_tab) >= 2 else ""
pos = parts_tab[2].strip() if len(parts_tab) >= 3 else ""
rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else ""
return idx, token, pos, rest, True
m = _token_line_re.match(raw)
if m:
idx, token, pos, rest = m.groups()
return idx, token, (pos or ""), (rest or ""), False
return None
def ssf_to_conll(input_file, temp_file):
    """Convert SSF (XML-style or classic) into CoNLL tokens.

    - Only lines whose first column is an integer index are treated as
      tokens; each token is written as "token\t" (empty label column).
    - A blank line is written at sentence boundaries: closing XML-ish tags
      (e.g. </Sentence>) or classic '((' / '))' brackets.
    """
    with open(input_file, "r", encoding="utf-8-sig") as f_in, \
            open(temp_file, "w", encoding="utf-8") as f_out:
        wrote_any_in_sentence = False
        for raw in f_in:
            line = raw.rstrip("\n")
            stripped = line.strip()
            # Sentence boundaries flush the current sentence.
            # BUG FIX: this check previously read startswith("") — vacuously
            # true for EVERY line (a closing-tag literal such as
            # "</Sentence>" was evidently lost), so every line was skipped
            # and no tokens were ever emitted. Detect closing tags instead.
            if stripped.startswith(("</", "))", "((")):
                if wrote_any_in_sentence:
                    f_out.write("\n")
                wrote_any_in_sentence = False
                continue
            if _is_structure(line):
                # Blank lines and opening tags: ignore without breaking
                # the current sentence.
                continue
            parsed = _parse_token_line(line)
            if parsed:
                _, token, _, _, _ = parsed
                f_out.write(f"{token}\t\n")
                wrote_any_in_sentence = True
        # Ensure the trailing sentence is closed with a blank line.
        if wrote_any_in_sentence:
            f_out.write("\n")
def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """Merge CRF predictions back into an SSF file.

    Replaces only the POS (third column) of each token line, keeping the
    index, token, and any trailing columns (e.g. feature structures)
    intact. Original tab vs. space layout is preserved where possible;
    space-delimited token lines are normalized to tabs.
    """
    # Collect (token, pos) predictions, skipping blank separator lines.
    predictions = []
    with open(conll_file, "r", encoding="utf-8") as pred_in:
        for raw in pred_in:
            stripped = raw.strip()
            if not stripped:
                continue
            cols = stripped.split("\t")
            if len(cols) >= 2:
                predictions.append((cols[0], cols[1]))

    cursor = 0
    with open(ssf_input_file, "r", encoding="utf-8-sig") as ssf_in, \
            open(output_file, "w", encoding="utf-8") as ssf_out:
        for raw in ssf_in:
            line = raw.rstrip("\n")
            parsed = None if _is_structure(line) else _parse_token_line(line)
            if parsed is None:
                # Structural or unrecognizable lines pass through untouched.
                ssf_out.write(line + "\n")
                continue
            idx, token, old_pos, rest, used_tabs = parsed
            # Consume the next prediction if one remains; otherwise keep
            # the existing tag (or UNK when there is none).
            if cursor < len(predictions):
                new_pos = predictions[cursor][1]
                cursor += 1
            else:
                new_pos = old_pos or "UNK"
            if used_tabs:
                # Preserve the original tabbed columns, padding to three.
                cols = line.split("\t")
                cols.extend([""] * (3 - len(cols)))
                cols[2] = new_pos
                ssf_out.write("\t".join(cols) + "\n")
            else:
                # Space-delimited input is normalized to tabs for clarity.
                fields = [idx, token, new_pos]
                if rest:
                    fields.append(rest)
                ssf_out.write("\t".join(fields) + "\n")