import re


# ---------- Plain & CoNLL ----------

def plain_to_conll(input_file, temp_file):
    """Convert whitespace-tokenised plain text into one-token-per-line CoNLL.

    Each input line is treated as one sentence. Every token is written as
    ``token<TAB>`` (empty tag column) and each sentence, including an empty
    one, is terminated by a blank line.

    Args:
        input_file: Path of the UTF-8 (optionally BOM-prefixed) text file.
        temp_file: Path of the CoNLL file to create/overwrite.
    """
    with open(input_file, "r", encoding="utf-8-sig") as f_in, \
            open(temp_file, "w", encoding="utf-8") as f_out:
        for line in f_in:
            # str.split() on a blank line yields no tokens, so a blank input
            # line produces just the sentence separator.
            f_out.writelines(f"{tok}\t\n" for tok in line.split())
            f_out.write("\n")


def conll_to_output(conll_file, output_file):
    """Convert tagged CoNLL into one sentence per line of ``token_TAG`` pairs.

    Lines with fewer than two tab-separated columns are ignored; an empty
    line ends the current sentence.

    Args:
        conll_file: Path of the tab-separated CoNLL file to read.
        output_file: Path of the text file to create/overwrite.
    """
    with open(conll_file, "r", encoding="utf-8") as f_in, \
            open(output_file, "w", encoding="utf-8") as f_out:
        sent = []
        for line in f_in:
            line = line.rstrip("\n")
            if not line:
                if sent:
                    f_out.write(" ".join(sent) + "\n")
                    sent = []
                continue
            parts = line.split("\t")
            if len(parts) >= 2:
                sent.append(f"{parts[0]}_{parts[1]}")
        # Flush a final sentence that was not followed by a blank line.
        if sent:
            f_out.write(" ".join(sent) + "\n")


# ---------- SSF helpers ----------

# index, token, optional POS, optional remainder; any whitespace separates.
_token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$")

# Prefixes of (stripped) lines that close/open a sentence in ssf_to_conll:
# XML-style sentence tags and the classic bracket markers.
_SENTENCE_BOUNDARY_PREFIXES = ("<Sentence", "</Sentence", "((", "))")


def _is_structure(line: str) -> bool:
    """Return True for blank lines, ``<``-tags and ``((`` / ``))`` markers."""
    s = line.strip()
    return s == "" or s.startswith(("<", "((", "))"))


def _parse_token_line(raw: str):
    """Split an SSF token line into its columns.

    Works with tab- or space-separated lines.

    Returns:
        ``(idx, token, pos, rest, used_tabs)`` or ``None`` when *raw* is not
        a token line (its first column is not an integer). ``rest`` holds
        any trailing columns; ``used_tabs`` is True when the columns were
        obtained by splitting on tabs, so callers can preserve that layout.
    """
    if "\t" in raw:
        cols = raw.split("\t")  # contains a tab, so always >= 2 columns
        if cols[0].strip().isdigit():
            idx = cols[0].strip()
            token = cols[1].strip()
            pos = cols[2].strip() if len(cols) >= 3 else ""
            rest = "\t".join(cols[3:])
            return idx, token, pos, rest, True

    m = _token_line_re.match(raw)
    if m:
        idx, token, pos, rest = m.groups()
        return idx, token, (pos or ""), (rest or ""), False
    return None


def ssf_to_conll(input_file, temp_file):
    """Convert SSF (XML-style or classic) into CoNLL tokens.

    - Only lines whose first column is an integer are treated as tokens;
      each is written as ``token<TAB>`` (empty tag column).
    - A blank line is written at sentence boundaries (sentence tags or
      ``((`` / ``))`` markers), but only if the sentence produced tokens.

    Args:
        input_file: Path of the UTF-8 (optionally BOM-prefixed) SSF file.
        temp_file: Path of the CoNLL file to create/overwrite.
    """
    with open(input_file, "r", encoding="utf-8-sig") as f_in, \
            open(temp_file, "w", encoding="utf-8") as f_out:
        wrote_any_in_sentence = False
        for raw in f_in:
            line = raw.rstrip("\n")
            stripped = line.strip()

            # NOTE: the boundary prefixes must be non-empty; startswith("")
            # is true for every string and would swallow all token lines.
            if stripped.startswith(_SENTENCE_BOUNDARY_PREFIXES):
                if wrote_any_in_sentence:
                    f_out.write("\n")
                    wrote_any_in_sentence = False
                continue

            if _is_structure(line):
                # Blank or other structural lines do not break the sentence.
                continue

            parsed = _parse_token_line(line)
            if parsed:
                token = parsed[1]
                f_out.write(f"{token}\t\n")
                wrote_any_in_sentence = True

        # Close a trailing sentence that had no explicit end marker.
        if wrote_any_in_sentence:
            f_out.write("\n")


def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """Merge tagger predictions back into an SSF file.

    - Replaces only the POS (3rd column), preserving index, token and any
      trailing columns.
    - Tab-separated token lines keep their original columns; space-separated
      token lines are rewritten with tabs.
    - Predictions are consumed in order, one per SSF token line. A
      prediction with an empty/missing tag still consumes its slot (so later
      tokens stay aligned) and leaves the existing POS in place. When no tag
      is available at all, ``UNK`` is written.

    Args:
        conll_file: Path of the tab-separated predictions (token, tag).
        ssf_input_file: Path of the original SSF file.
        output_file: Path of the merged SSF file to create/overwrite.
    """
    # One entry per non-blank prediction line. Do not strip the line before
    # splitting: "token<TAB>" must stay a (token, "") entry, otherwise every
    # following prediction would shift onto the wrong token.
    preds = []
    with open(conll_file, "r", encoding="utf-8") as f_in:
        for line in f_in:
            line = line.rstrip("\n")
            if not line.strip():
                continue
            parts = line.split("\t")
            tag = parts[1].strip() if len(parts) >= 2 else ""
            preds.append((parts[0].strip(), tag))  # (token, pos)

    p = 0
    with open(ssf_input_file, "r", encoding="utf-8-sig") as f_in, \
            open(output_file, "w", encoding="utf-8") as f_out:
        for raw in f_in:
            line = raw.rstrip("\n")

            # Structural lines are copied untouched.
            if _is_structure(line):
                f_out.write(line + "\n")
                continue

            parsed = _parse_token_line(line)
            if not parsed:
                # Not a recognisable token line; copy as-is.
                f_out.write(line + "\n")
                continue

            idx, token, old_pos, rest, used_tabs = parsed

            predicted = ""
            if p < len(preds):
                predicted = preds[p][1]
                p += 1
            new_pos = predicted or old_pos or "UNK"

            if used_tabs:
                # Preserve the original tabbed columns, replacing only POS.
                parts = line.split("\t")
                while len(parts) < 3:
                    parts.append("")
                parts[2] = new_pos
                out = "\t".join(parts)
            else:
                # Normalise space-separated lines to tabs.
                out = f"{idx}\t{token}\t{new_pos}"
                if rest:
                    out += f"\t{rest}"

            f_out.write(out + "\n")