# Format-conversion utilities: plain text <-> CoNLL <-> SSF (for CRF POS tagging).
import re
# ---------- Plain & CoNLL (unchanged/safer) ----------
def plain_to_conll(input_file, temp_file):
    """Tokenize a plain-text file into CoNLL layout.

    Each whitespace-separated token becomes one ``token\\t`` line; every
    input line (including empty ones) is followed by a blank separator
    line, so each source line forms one CoNLL "sentence".
    """
    with open(input_file, "r", encoding="utf-8-sig") as src, \
         open(temp_file, "w", encoding="utf-8") as dst:
        for raw in src:
            text = raw.strip()
            if text:
                dst.writelines(f"{tok}\t\n" for tok in text.split())
            dst.write("\n")
def conll_to_output(conll_file, output_file):
    """Collapse CoNLL token/tag pairs into ``token_TAG`` sentences.

    Sentences are delimited by blank lines in the input; each one is
    written out as a single space-joined line.  Lines with fewer than
    two tab-separated columns are skipped.
    """
    with open(conll_file, "r", encoding="utf-8") as src, \
         open(output_file, "w", encoding="utf-8") as dst:
        words = []

        def flush():
            # Emit the pending sentence, if any, and reset the buffer.
            if words:
                dst.write(" ".join(words) + "\n")
                words.clear()

        for raw in src:
            stripped = raw.rstrip("\n")
            if not stripped:
                flush()
                continue
            cols = stripped.split("\t")
            if len(cols) > 1:
                words.append(f"{cols[0]}_{cols[1]}")
        flush()
# ---------- SSF helpers (robust) ----------
# Whitespace-separated SSF token line: an integer index, a token, an
# optional POS tag, and optional trailing columns (e.g. a <fs ...>
# feature structure).  Groups: (idx, token, pos-or-None, rest-or-None).
_token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")
def _is_structure(line: str) -> bool:
s = line.strip()
return (
s == "" or
s.startswith("<") or # <Sentence ...>, </Sentence>, XML-ish tags
s.startswith("((") or
s.startswith("))")
)
def _parse_token_line(raw: str):
"""
Return (idx, token, pos, rest, used_tabs) or None if not a token line.
- Works with tabs or spaces.
- 'rest' is any trailing columns (e.g., <fs ...>).
- used_tabs: True if original line used tabs (preserve layout).
"""
used_tabs = ("\t" in raw)
parts_tab = raw.split("\t") if used_tabs else None
if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit():
idx = parts_tab[0].strip()
token = parts_tab[1].strip() if len(parts_tab) >= 2 else ""
pos = parts_tab[2].strip() if len(parts_tab) >= 3 else ""
rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else ""
return idx, token, pos, rest, True
m = _token_line_re.match(raw)
if m:
idx, token, pos, rest = m.groups()
return idx, token, (pos or ""), (rest or ""), False
return None
def ssf_to_conll(input_file, temp_file):
    """Flatten an SSF file (XML-style or classic bracketed) into CoNLL tokens.

    Only lines whose first column is an integer become tokens (one
    ``token\\t`` line each).  Any sentence-boundary marker — ``<Sentence``,
    ``</Sentence>``, ``((`` or ``))`` — closes the current sentence with a
    blank line; other structural lines are skipped silently.
    """
    boundary_prefixes = ("<Sentence", "</Sentence>", "((", "))")
    with open(input_file, "r", encoding="utf-8-sig") as src, \
         open(temp_file, "w", encoding="utf-8") as dst:
        in_sentence = False
        for raw in src:
            line = raw.rstrip("\n")
            if line.strip().startswith(boundary_prefixes):
                # Every boundary marker gets the same treatment: end the
                # open sentence (if any) and reset.
                if in_sentence:
                    dst.write("\n")
                in_sentence = False
                continue
            if _is_structure(line):
                # Blank/structural lines do not break the sentence.
                continue
            parsed = _parse_token_line(line)
            if parsed is not None:
                dst.write(f"{parsed[1]}\t\n")
                in_sentence = True
        # Close a trailing sentence that had no explicit end marker.
        if in_sentence:
            dst.write("\n")
def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """Merge CRF tag predictions back into an SSF file.

    Only the POS column (3rd) of each token line is replaced; the index,
    token, and any trailing columns (e.g. ``<fs ...>``) are preserved, as
    is the original tab vs. space layout where possible.  Predictions are
    consumed strictly in order — no token matching is attempted.
    """
    # Collect (token, predicted_pos) pairs; blank separator lines and
    # single-column lines are ignored.
    predictions = []
    with open(conll_file, "r", encoding="utf-8") as src:
        for raw in src:
            cols = raw.strip().split("\t")
            if len(cols) >= 2:
                predictions.append((cols[0], cols[1]))
    next_pred = 0
    with open(ssf_input_file, "r", encoding="utf-8-sig") as src, \
         open(output_file, "w", encoding="utf-8") as dst:
        for raw in src:
            line = raw.rstrip("\n")
            # Structural and unrecognizable lines pass through untouched.
            parsed = None if _is_structure(line) else _parse_token_line(line)
            if parsed is None:
                dst.write(line + "\n")
                continue
            idx, token, old_pos, rest, used_tabs = parsed
            if next_pred < len(predictions):
                new_pos = predictions[next_pred][1]
                next_pred += 1
            else:
                # Ran out of predictions: keep the old tag, or mark UNK.
                new_pos = old_pos or "UNK"
            if used_tabs:
                # Rewrite in place, keeping the original tabbed columns.
                cols = line.split("\t")
                cols += [""] * (3 - len(cols))
                cols[2] = new_pos
                dst.write("\t".join(cols) + "\n")
            else:
                # Space-separated input is normalized to tabs.
                out_cols = [idx, token, new_pos]
                if rest:
                    out_cols.append(rest)
                dst.write("\t".join(out_cols) + "\n")