# AniFileBERT / mix_datasets.py
# Commit be5f706 (ModerRAS): Add AniFileBERT model and training project
# File size: 2.32 kB
"""Create a shuffled JSONL training mix from multiple anime parser datasets."""
import argparse
import json
import random
from pathlib import Path
from typing import Iterable
def iter_jsonl(path: Path, limit: int | None = None) -> Iterable[dict]:
count = 0
with path.open("r", encoding="utf-8") as handle:
for line in handle:
line = line.strip()
if not line:
continue
item = json.loads(line)
yield {"tokens": item["tokens"], "labels": item["labels"]}
count += 1
if limit is not None and count >= limit:
break
def main() -> None:
    """Build a shuffled training mix from the synthetic and DMHY datasets.

    Parses the command-line options, seeds the global ``random`` generator,
    loads the synthetic records followed by the DMHY records (each optionally
    capped by its ``--*-limit`` option), shuffles the combined list, and
    writes it to ``--output`` as JSONL, creating the parent directory if
    needed. A manifest with the input paths, per-source counts, total count
    and seed is written beside the output (final suffix replaced by
    ``.manifest.json``) and also printed to stdout.
    """
    cli = argparse.ArgumentParser(description="Mix synthetic and weakly-labeled DMHY datasets")
    cli.add_argument("--synthetic", default="data/synthetic.jsonl")
    cli.add_argument("--dmhy", default="data/dmhy/dmhy_weak.jsonl")
    cli.add_argument("--output", default="data/dmhy/mixed_train.jsonl")
    cli.add_argument("--synthetic-limit", type=int, default=None)
    cli.add_argument("--dmhy-limit", type=int, default=None)
    cli.add_argument("--seed", type=int, default=42)
    args = cli.parse_args()

    # Seed before any shuffling so the same seed and inputs give the same order.
    random.seed(args.seed)

    # Synthetic is read in full before DMHY, so the pre-shuffle order is fixed.
    synthetic_records = list(iter_jsonl(Path(args.synthetic), args.synthetic_limit))
    dmhy_records = list(iter_jsonl(Path(args.dmhy), args.dmhy_limit))
    mixed = [*synthetic_records, *dmhy_records]
    random.shuffle(mixed)

    destination = Path(args.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in mixed
        )

    manifest = {
        "synthetic": args.synthetic,
        "dmhy": args.dmhy,
        "output": args.output,
        "synthetic_count": len(synthetic_records),
        "dmhy_count": len(dmhy_records),
        "total_count": len(mixed),
        "seed": args.seed,
    }
    # Serialise once; the same text goes to the manifest file and to stdout.
    manifest_text = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path = destination.with_suffix(".manifest.json")
    manifest_path.write_text(manifest_text, encoding="utf-8")
    print(manifest_text)
# Script entry point: run the mixer only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()