Spaces:
Sleeping
Sleeping
| import re | |
| # ---------- Plain & CoNLL (unchanged/safer) ---------- | |
def plain_to_conll(input_file, temp_file):
    """Tokenize plain text into a bare (unlabeled) CoNLL file.

    Each whitespace-separated token becomes one line of the form "token\\t",
    and every input line (sentence) is terminated by a blank line.  Empty
    input lines are preserved as sentence separators.
    """
    with open(input_file, "r", encoding="utf-8-sig") as src, \
            open(temp_file, "w", encoding="utf-8") as dst:
        for raw in src:
            sentence = raw.strip()
            if not sentence:
                # Keep blank lines so sentence boundaries survive the round trip.
                dst.write("\n")
                continue
            dst.write("".join(f"{tok}\t\n" for tok in sentence.split()))
            dst.write("\n")
def conll_to_output(conll_file, output_file):
    """Collapse a 2-column CoNLL file into one "token_POS token_POS ..." line
    per sentence (sentences are separated by blank lines in the input).

    Lines with fewer than two tab-separated columns are silently skipped.
    """
    with open(conll_file, "r", encoding="utf-8") as src, \
            open(output_file, "w", encoding="utf-8") as dst:
        pairs = []

        def flush():
            # Emit the accumulated sentence, if any, then reset.
            if pairs:
                dst.write(" ".join(pairs) + "\n")
                pairs.clear()

        for raw in src:
            stripped = raw.rstrip("\n")
            if not stripped:
                flush()
                continue
            cols = stripped.split("\t")
            if len(cols) >= 2:
                pairs.append(f"{cols[0]}_{cols[1]}")
        # Handle a final sentence that lacks a trailing blank line.
        flush()
| # ---------- SSF helpers (robust) ---------- | |
| _token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$") | |
| def _is_structure(line: str) -> bool: | |
| s = line.strip() | |
| return ( | |
| s == "" or | |
| s.startswith("<") or # <Sentence ...>, </Sentence>, XML-ish tags | |
| s.startswith("((") or | |
| s.startswith("))") | |
| ) | |
| def _parse_token_line(raw: str): | |
| """ | |
| Return (idx, token, pos, rest, used_tabs) or None if not a token line. | |
| - Works with tabs or spaces. | |
| - 'rest' is any trailing columns (e.g., <fs ...>). | |
| - used_tabs: True if original line used tabs (preserve layout). | |
| """ | |
| used_tabs = ("\t" in raw) | |
| parts_tab = raw.split("\t") if used_tabs else None | |
| if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit(): | |
| idx = parts_tab[0].strip() | |
| token = parts_tab[1].strip() if len(parts_tab) >= 2 else "" | |
| pos = parts_tab[2].strip() if len(parts_tab) >= 3 else "" | |
| rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else "" | |
| return idx, token, pos, rest, True | |
| m = _token_line_re.match(raw) | |
| if m: | |
| idx, token, pos, rest = m.groups() | |
| return idx, token, (pos or ""), (rest or ""), False | |
| return None | |
def ssf_to_conll(input_file, temp_file):
    """
    Convert SSF (XML-style or classic) into CoNLL tokens.

    - Only lines whose first column is an SSF index are treated as tokens.
    - Chunk-bracket tokens "((" / "))" (classic-SSF lines such as
      "1<TAB>((<TAB>NP") are structure, not tokens, and are skipped.
    - Writes a blank line at sentence boundaries.
    """
    # Any of these at the start of a (stripped) line marks a sentence boundary
    # and flushes the sentence in progress.
    boundary_prefixes = ("<Sentence", "</Sentence>", "((", "))")
    with open(input_file, "r", encoding="utf-8-sig") as f_in, \
            open(temp_file, "w", encoding="utf-8") as f_out:
        in_sentence = False
        for raw in f_in:
            line = raw.rstrip("\n")
            s = line.strip()
            if s.startswith(boundary_prefixes):
                if in_sentence:
                    f_out.write("\n")
                    in_sentence = False
                continue
            if _is_structure(line):
                # Other structural lines (e.g. <fs ...>, blanks): ignore
                # without breaking the sentence.
                continue
            parsed = _parse_token_line(line)
            if parsed:
                _, token, _, _, _ = parsed
                # BUG FIX: classic-SSF chunk lines carry "((" or "))" in the
                # token column and used to be emitted as real tokens.
                if token in ("((", "))"):
                    continue
                f_out.write(f"{token}\t\n")
                in_sentence = True
        # Ensure a trailing sentence without a closing tag is still terminated.
        if in_sentence:
            f_out.write("\n")
def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """
    Merge CRF predictions back into SSF.

    - Replaces only the POS (3rd column), preserving index, token, and any
      trailing cols (e.g., <fs ...>).
    - Chunk-bracket lines ("((" / "))" in the token column) keep their chunk
      label (e.g. NP) untouched instead of being clobbered by a POS tag.
    - Preserves original tabs vs spaces layout when possible.
    """
    # Gather predictions as (token, pos) pairs, ignoring blank lines.
    preds = []
    with open(conll_file, "r", encoding="utf-8") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) >= 2:
                preds.append((parts[0], parts[1]))
    p = 0
    with open(ssf_input_file, "r", encoding="utf-8-sig") as f_in, \
            open(output_file, "w", encoding="utf-8") as f_out:
        for raw in f_in:
            line = raw.rstrip("\n")
            # Structural lines pass through untouched.
            if _is_structure(line):
                f_out.write(line + "\n")
                continue
            parsed = _parse_token_line(line)
            if not parsed:
                # Not a recognizable token line; write as-is.
                f_out.write(line + "\n")
                continue
            idx, token, old_pos, rest, used_tabs = parsed
            # BUG FIX: chunk boundaries are not real tokens.  Keep the line
            # (and its chunk label) as-is; if the prediction stream happens to
            # contain a matching "((" / "))" entry, consume it so the
            # remaining tokens stay aligned.
            if token in ("((", "))"):
                if p < len(preds) and preds[p][0] == token:
                    p += 1
                f_out.write(line + "\n")
                continue
            # If we have a prediction, replace POS; otherwise keep old POS.
            if p < len(preds):
                _, new_pos = preds[p]
                p += 1
            else:
                new_pos = old_pos if old_pos else "UNK"
            if used_tabs:
                # Preserve the original tabbed structure, only swapping col 3.
                parts = line.split("\t")
                while len(parts) < 3:
                    parts.append("")
                parts[2] = new_pos
                out = "\t".join(parts)
            else:
                # Normalize to tabs for clarity if the original used spaces.
                out = f"{idx}\t{token}\t{new_pos}"
                if rest:
                    out += f"\t{rest}"
            f_out.write(out + "\n")