Spaces:

nayan90k
/

semantic-ir-backend

Sleeping

semantic-ir-backend / backend /parse_data.py

Kamal Nayan Kumar

Initial commit

dbc6ebe about 2 months ago

3.79 kB

	import argparse
	import csv
	import json
	import sys
	from collections import defaultdict
	from pathlib import Path
	from typing import Dict, Iterable, List


	# Lift the csv module's per-field size cap as far as the platform allows.
	# csv.field_size_limit() converts its argument to a C long; sys.maxsize does
	# not fit where a C long is 32 bits (e.g. Windows) and the call then raises
	# OverflowError at import time, so halve the request until it is accepted.
	_field_size_limit = sys.maxsize
	while True:
	    try:
	        csv.field_size_limit(_field_size_limit)
	    except OverflowError:
	        _field_size_limit //= 2
	    else:
	        break


	# Column order of one input row; the same names are used as the keys of each
	# parsed token record.
	COLS = [
	    # Sentence number and token position within the sentence.
	    "sent_id", "token_id",
	    # Token form and its annotation fields.
	    "word", "lemma", "upos", "xpos", "feats",
	    "head", "deprel", "deps", "misc",
	    # Semantic-role annotations.
	    "predicate_sense", "semantic_role",
	]


	def _iter_rows(path: Path) -> Iterable[List[str]]:
	with path.open("r", encoding="utf-8", newline="") as f:
	sample = f.read(4096)
	f.seek(0)
	delimiter = "\t" if "\t" in sample else ","
	reader = csv.reader(f, delimiter=delimiter)
	for row in reader:
	if not row:
	continue
	if len(row) == 1 and not row[0].strip():
	continue
	if row[0].startswith("#"):
	continue
	yield row


	def read_conllu_srl(path: Path) -> List[Dict[str, str]]:
	    """Read one annotated file into a list of per-token records.

	    Rows are taken from ``_iter_rows``. A row is kept only when it has at
	    least ``len(COLS)`` cells and both its sentence id (first cell) and token
	    id (second cell) are decimal integers after stripping whitespace. Cells
	    beyond ``len(COLS)`` are ignored.

	    Args:
	        path: File to parse.

	    Returns:
	        One dict per kept row, in file order, keyed by the names in ``COLS``.
	        The two id values are stored stripped; all other cells are stored
	        exactly as read.
	    """
	    records: List[Dict[str, str]] = []
	    for row in _iter_rows(path):
	        if len(row) < len(COLS):
	            continue

	        sent_id = row[0].strip()
	        token_id = row[1].strip()

	        # isdecimal() rather than isdigit(): isdigit() also accepts characters
	        # such as superscript digits that int() rejects, and the ids are
	        # converted with int() when the records are sorted later.
	        if not (sent_id.isdecimal() and token_id.isdecimal()):
	            continue

	        # Store the stripped ids that were actually validated, so " 1" and "1"
	        # cannot end up as two different sentence keys downstream.
	        cells = [sent_id, token_id, *row[2 : len(COLS)]]
	        records.append(dict(zip(COLS, cells)))

	    return records


	def _join_tokens(tokens_with_misc: List[Dict[str, str]]) -> str:
	chunks: List[str] = []
	for token in tokens_with_misc:
	chunks.append(token["word"])
	if "SpaceAfter=No" not in token.get("misc", "_"):
	chunks.append(" ")
	return "".join(chunks).strip()


	def flatten_to_corpus(records: List[Dict[str, str]]) -> List[Dict[str, str]]:
	    """Collapse per-token records into one flat document per sentence.

	    Records are grouped by ``"sent_id"``. Sentences are emitted in ascending
	    numeric id order, and the tokens of each sentence are ordered by numeric
	    ``"token_id"``.

	    Args:
	        records: Token records carrying at least ``"sent_id"`` and
	            ``"token_id"`` values that ``int()`` can parse, plus ``"word"``.

	    Returns:
	        One dict per sentence with these keys:

	        * ``"id"``: the sentence id string.
	        * ``"text"``: the sentence text built by ``_join_tokens``.
	        * ``"predicate"``: the first ``"predicate_sense"`` in token order
	          that is non-empty and not ``"_"``, or ``""`` if there is none.
	        * one key per semantic role seen in the sentence, holding the
	          space-joined words tagged with that role. A role named like one
	          of the three keys above overwrites that key.
	    """
	    by_sentence: Dict[str, List[Dict[str, str]]] = defaultdict(list)
	    for record in records:
	        by_sentence[record["sent_id"]].append(record)

	    documents: List[Dict[str, str]] = []
	    for sentence_id in sorted(by_sentence, key=int):
	        tokens = sorted(
	            by_sentence[sentence_id], key=lambda rec: int(rec["token_id"])
	        )
	        sentence_text = _join_tokens(tokens)

	        # First usable predicate sense in token order, "" when none exists.
	        senses = (tok.get("predicate_sense", "_") for tok in tokens)
	        first_sense = next((s for s in senses if s and s != "_"), "")

	        # Words per role, keeping first-seen role order and token order.
	        words_by_role: Dict[str, List[str]] = {}
	        for tok in tokens:
	            role_label = tok.get("semantic_role", "_")
	            if not role_label or role_label == "_":
	                continue
	            words_by_role.setdefault(role_label, []).append(tok.get("word", ""))

	        document: Dict[str, str] = {
	            "id": sentence_id,
	            "text": sentence_text,
	            "predicate": first_sense,
	        }
	        for role_label, role_words in words_by_role.items():
	            document[role_label] = " ".join(role_words)
	        documents.append(document)

	    return documents


	def main() -> None:
	    """Command-line entry point: build the flattened JSON corpus.

	    Parses ``--input-dir`` and ``--output`` from the command line. Both
	    default to locations under the ``data`` directory that sits one level
	    above this file's directory. Every ``*.csv`` file directly inside the
	    input directory is read in sorted path order, converted with
	    ``read_conllu_srl`` and ``flatten_to_corpus``, and the combined list is
	    written to the output path as indented UTF-8 JSON. The output's parent
	    directory is created if needed, and a one-line summary is printed.
	    """
	    default_data_dir = Path(__file__).resolve().parents[1] / "data"

	    cli = argparse.ArgumentParser(
	        description="Parse CoNLL-U SRL data to flattened JSON corpus"
	    )
	    cli.add_argument(
	        "--input-dir",
	        type=Path,
	        default=default_data_dir,
	        help="Directory containing CoNLL-U style TSV/CSV files",
	    )
	    cli.add_argument(
	        "--output",
	        type=Path,
	        default=default_data_dir / "corpus.json",
	        help="Output JSON corpus path",
	    )
	    options = cli.parse_args()

	    # Sorted so the document order does not depend on directory listing order.
	    documents: List[Dict[str, str]] = []
	    for csv_path in sorted(options.input_dir.glob("*.csv")):
	        documents += flatten_to_corpus(read_conllu_srl(csv_path))

	    destination = options.output
	    destination.parent.mkdir(parents=True, exist_ok=True)
	    with destination.open("w", encoding="utf-8") as out:
	        json.dump(documents, out, ensure_ascii=False, indent=2)

	    print(f"Wrote {len(documents)} documents to {destination}")


	if __name__ == "__main__":
	    # Run the CLI only when executed as a script, not when imported.
	    main()