Spaces:
Sleeping
Sleeping
| import re | |
| # ---------- Plain & CoNLL (unchanged/safer) ---------- | |
def plain_to_conll(input_file, temp_file):
    """Tokenize plain text into a bare (unlabeled) CoNLL file.

    Each whitespace-separated token becomes one line of the form "token\\t",
    and every input line (sentence) is terminated by a blank line.  Empty
    input lines are preserved as sentence separators.
    """
    with open(input_file, "r", encoding="utf-8-sig") as src, \
            open(temp_file, "w", encoding="utf-8") as dst:
        for raw in src:
            sentence = raw.strip()
            if not sentence:
                # Keep blank lines so sentence boundaries survive the round trip.
                dst.write("\n")
                continue
            dst.write("".join(f"{tok}\t\n" for tok in sentence.split()))
            dst.write("\n")
def conll_to_output(conll_file, output_file):
    """Collapse a 2-column CoNLL file into one "token_POS token_POS ..." line
    per sentence (sentences are separated by blank lines in the input).

    Lines with fewer than two tab-separated columns are silently skipped.
    """
    with open(conll_file, "r", encoding="utf-8") as src, \
            open(output_file, "w", encoding="utf-8") as dst:
        pairs = []

        def flush():
            # Emit the accumulated sentence, if any, then reset.
            if pairs:
                dst.write(" ".join(pairs) + "\n")
                pairs.clear()

        for raw in src:
            stripped = raw.rstrip("\n")
            if not stripped:
                flush()
                continue
            cols = stripped.split("\t")
            if len(cols) >= 2:
                pairs.append(f"{cols[0]}_{cols[1]}")
        # Handle a final sentence that lacks a trailing blank line.
        flush()
| # ---------- SSF helpers (robust) ---------- | |
| _token_line_re = re.compile(r"^\s*(\d+)\s+(\S+)(?:\s+(\S+))?(?:\s+(.*))?$") | |
| def _is_structure(line: str) -> bool: | |
| s = line.strip() | |
| return ( | |
| s == "" or | |
| s.startswith("<") or # <Sentence ...>, </Sentence>, XML-ish tags | |
| s.startswith("((") or | |
| s.startswith("))") | |
| ) | |
| def _parse_token_line(raw: str): | |
| """ | |
| Return (idx, token, pos, rest, used_tabs) or None if not a token line. | |
| - Works with tabs or spaces. | |
| - 'rest' is any trailing columns (e.g., <fs ...>). | |
| - used_tabs: True if original line used tabs (preserve layout). | |
| """ | |
| used_tabs = ("\t" in raw) | |
| parts_tab = raw.split("\t") if used_tabs else None | |
| if used_tabs and len(parts_tab) >= 2 and parts_tab[0].strip().isdigit(): | |
| idx = parts_tab[0].strip() | |
| token = parts_tab[1].strip() if len(parts_tab) >= 2 else "" | |
| pos = parts_tab[2].strip() if len(parts_tab) >= 3 else "" | |
| rest = "\t".join(parts_tab[3:]) if len(parts_tab) >= 4 else "" | |
| return idx, token, pos, rest, True | |
| m = _token_line_re.match(raw) | |
| if m: | |
| idx, token, pos, rest = m.groups() | |
| return idx, token, (pos or ""), (rest or ""), False | |
| return None | |
def ssf_to_conll(input_file, temp_file):
    """
    Convert SSF (XML-style or classic) into CoNLL tokens.

    - Only lines whose first column is an SSF index are treated as tokens.
    - Chunk-bracket tokens "((" / "))" (classic-SSF lines such as
      "1<TAB>((<TAB>NP") are structure, not tokens, and are skipped.
    - Writes a blank line at sentence boundaries.
    """
    # Any of these at the start of a (stripped) line marks a sentence boundary
    # and flushes the sentence in progress.
    boundary_prefixes = ("<Sentence", "</Sentence>", "((", "))")
    with open(input_file, "r", encoding="utf-8-sig") as f_in, \
            open(temp_file, "w", encoding="utf-8") as f_out:
        in_sentence = False
        for raw in f_in:
            line = raw.rstrip("\n")
            s = line.strip()
            if s.startswith(boundary_prefixes):
                if in_sentence:
                    f_out.write("\n")
                    in_sentence = False
                continue
            if _is_structure(line):
                # Other structural lines (e.g. <fs ...>, blanks): ignore
                # without breaking the sentence.
                continue
            parsed = _parse_token_line(line)
            if parsed:
                _, token, _, _, _ = parsed
                # BUG FIX: classic-SSF chunk lines carry "((" or "))" in the
                # token column and used to be emitted as real tokens.
                if token in ("((", "))"):
                    continue
                f_out.write(f"{token}\t\n")
                in_sentence = True
        # Ensure a trailing sentence without a closing tag is still terminated.
        if in_sentence:
            f_out.write("\n")
def conll_to_ssf(conll_file, ssf_input_file, output_file):
    """
    Merge CRF predictions back into SSF.

    - Replaces only the POS (3rd column), preserving index, token, and any
      trailing cols (e.g., <fs ...>).
    - Chunk-bracket lines ("((" / "))" in the token column) keep their chunk
      label (e.g. NP) untouched instead of being clobbered by a POS tag.
    - Preserves original tabs vs spaces layout when possible.
    """
    # Gather predictions as (token, pos) pairs, ignoring blank lines.
    preds = []
    with open(conll_file, "r", encoding="utf-8") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) >= 2:
                preds.append((parts[0], parts[1]))
    p = 0
    with open(ssf_input_file, "r", encoding="utf-8-sig") as f_in, \
            open(output_file, "w", encoding="utf-8") as f_out:
        for raw in f_in:
            line = raw.rstrip("\n")
            # Structural lines pass through untouched.
            if _is_structure(line):
                f_out.write(line + "\n")
                continue
            parsed = _parse_token_line(line)
            if not parsed:
                # Not a recognizable token line; write as-is.
                f_out.write(line + "\n")
                continue
            idx, token, old_pos, rest, used_tabs = parsed
            # BUG FIX: chunk boundaries are not real tokens.  Keep the line
            # (and its chunk label) as-is; if the prediction stream happens to
            # contain a matching "((" / "))" entry, consume it so the
            # remaining tokens stay aligned.
            if token in ("((", "))"):
                if p < len(preds) and preds[p][0] == token:
                    p += 1
                f_out.write(line + "\n")
                continue
            # If we have a prediction, replace POS; otherwise keep old POS.
            if p < len(preds):
                _, new_pos = preds[p]
                p += 1
            else:
                new_pos = old_pos if old_pos else "UNK"
            if used_tabs:
                # Preserve the original tabbed structure, only swapping col 3.
                parts = line.split("\t")
                while len(parts) < 3:
                    parts.append("")
                parts[2] = new_pos
                out = "\t".join(parts)
            else:
                # Normalize to tabs for clarity if the original used spaces.
                out = f"{idx}\t{token}\t{new_pos}"
                if rest:
                    out += f"\t{rest}"
            f_out.write(out + "\n")