"""Fine-tune a token-classification (NER) model on pre-tokenized JSONL data.

Each line of the train/eval files is a JSON object holding a list of word
tokens and a parallel list of string labels (column names are configurable
via ``--text_col`` / ``--label_col``).  The label inventory is read from
``--labels_file``, a JSON list of label strings whose order defines the ids.
"""
import argparse
import json

from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

from training.utils import compute_metrics_ner

parser = argparse.ArgumentParser()
parser.add_argument("--model_name", default="bert-base-cased")
parser.add_argument(
    "--train_json",
    required=True,
    help="JSONL with {'tokens': [...], 'ner_tags': [...]} per line",
)
parser.add_argument("--eval_json", required=True)
parser.add_argument("--text_col", default="tokens")
parser.add_argument("--label_col", default="ner_tags")
parser.add_argument("--labels_file", default="training/labels_ner.json")
parser.add_argument("--output_dir", default="./outputs/ner")
parser.add_argument("--epochs", type=int, default=5)
parser.add_argument("--batch_size", type=int, default=8)
parser.add_argument("--lr", type=float, default=3e-5)
args = parser.parse_args()


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at EOF) are not
            # records; json.loads("") would raise on them.
            if not line.strip():
                continue
            rows.append(json.loads(line))
    return rows


train_rows = load_jsonl(args.train_json)
eval_rows = load_jsonl(args.eval_json)

with open(args.labels_file, "r", encoding="utf-8") as f:
    # e.g., ["O","B-ORG","I-ORG","B-MONEY","I-MONEY","B-DATE","I-DATE","B-TICKER","I-TICKER"]
    label_list = json.load(f)

# Built once: the position of a label in label_list is its integer id.
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for idx, label in enumerate(label_list)}

tokenizer = AutoTokenizer.from_pretrained(args.model_name)


def align_labels_with_tokens(tokens, labels):
    """Convert one example's per-word string labels to integer label ids.

    Args:
        tokens: The example's word tokens; used only to validate that the
            two sequences are parallel.
        labels: Label strings, one per word token.

    Returns:
        A list of label ids, one per word, in the same order as ``labels``.

    Raises:
        ValueError: If ``tokens`` and ``labels`` differ in length.
        KeyError: If a label is not present in the labels file.
    """
    if len(tokens) != len(labels):
        raise ValueError(
            f"tokens/labels length mismatch: {len(tokens)} tokens "
            f"vs {len(labels)} labels"
        )
    return [label2id[label] for label in labels]


def encode_batch(batch):
    """Tokenize a batch of pre-split examples and attach sub-token labels.

    Every sub-token receives the label id of the word it came from; positions
    that map to no word (``word_ids`` entry is ``None``, e.g. special tokens)
    receive -100.

    Args:
        batch: Mapping of column name to list of values, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        list of label ids per example.
    """
    # No padding here: DataCollatorForTokenClassification pads inputs and
    # labels per training batch, so padding at map time only wastes compute.
    tokenized = tokenizer(
        batch[args.text_col],
        is_split_into_words=True,
        truncation=True,
    )

    encoded_labels = []
    examples = zip(batch[args.text_col], batch[args.label_col])
    for i, (tokens, labels) in enumerate(examples):
        # Convert the word-level labels once per example, then index into
        # the result for each sub-token.
        word_label_ids = align_labels_with_tokens(tokens, labels)
        encoded_labels.append(
            [
                -100 if w_id is None else word_label_ids[w_id]
                for w_id in tokenized.word_ids(batch_index=i)
            ]
        )

    tokenized["labels"] = encoded_labels
    return tokenized


train_ds = Dataset.from_list(train_rows).map(
    encode_batch,
    batched=True,
    remove_columns=[args.text_col, args.label_col],
)
eval_ds = Dataset.from_list(eval_rows).map(
    encode_batch,
    batched=True,
    remove_columns=[args.text_col, args.label_col],
)

model = AutoModelForTokenClassification.from_pretrained(
    args.model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir=args.output_dir,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the pinned transformers version.
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; the default ("steps") is rejected at construction.
    save_strategy="epoch",
    learning_rate=args.lr,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Assumes compute_metrics_ner returns an "f1" entry -- verify in
    # training.utils.
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: compute_metrics_ner(p, label_list),
)

trainer.train()
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)