# NLP-intelligence / eval / evaluate_tokens.py
# Nomio4640's picture
# reorganized files
# 3773a26
"""
evaluate_tokens.py — Token-level seqeval evaluation matching the Colab training metric.
Unlike evaluate.py (which reconstructs text and runs the full NLP pipeline),
this script feeds pre-tokenized CoNLL words directly to the model, ensuring
the evaluation is identical to what Colab measured during training.
Run from NLP-intelligence/:
python eval/evaluate_tokens.py
python eval/evaluate_tokens.py --limit 500
"""
import os, sys, argparse
# Append the directory one level above this file's folder to the import path,
# so modules located there can be imported when this file is run as a script.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Entity types intended for evaluation.
# NOTE(review): nothing in the visible part of this file reads this set;
# parse_conll drops MISC with its own hard-coded substring check — confirm
# whether the two are meant to be linked.
EVAL_LABELS = {"PER", "LOC", "ORG"} # MISC excluded — not in fine-tuned model
def parse_conll(path, limit=None):
    """
    Read a whitespace-separated CoNLL-style file into words and labels.

    A blank line or a line starting with "#" ends the current sentence.
    Token lines need at least four fields: the first is the word, the last
    is the tag. Lines with fewer fields are ignored. Any tag containing
    "MISC" is rewritten to "O".

    Args:
        path: Path of the UTF-8 encoded file to read.
        limit: Stop once this many sentences have been collected. A falsy
            value (None or 0) means no limit.

    Returns:
        A (sentences, labels) tuple of parallel lists; each element is the
        list of words / tags of one sentence.
    """
    all_words, all_tags = [], []
    words, tags = [], []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.rstrip()
            at_boundary = not text or text.startswith("#")

            if not at_boundary:
                fields = text.split()
                if len(fields) < 4:
                    continue  # not a full token line
                tag = fields[-1]
                words.append(fields[0])
                # MISC is folded into "O" so only the other types remain
                tags.append("O" if "MISC" in tag else tag)
                continue

            if not words:
                continue  # consecutive boundaries: nothing to flush
            all_words.append(words)
            all_tags.append(tags)
            words, tags = [], []
            if limit and len(all_words) >= limit:
                break

    # Flush a trailing sentence that was not followed by a boundary line.
    if words:
        all_words.append(words)
        all_tags.append(tags)
    return all_words, all_tags
def predict_tokens(words_list, tokenizer, model, device, batch_size=32):
    """
    Run token classification on pre-tokenized word lists.

    Every word receives the label predicted for its first sub-word token.
    A word that ends up with no token at all (for example because the
    sentence was truncated at 512 tokens) falls back to "O".

    Args:
        words_list: List of sentences, each a list of word strings.
        tokenizer: Callable tokenizer; it is invoked with
            is_split_into_words=True and its result must provide
            word_ids(batch_index=...) and items().
        model: Token-classification model; called with the encoded batch,
            must return an object with .logits and expose config.id2label.
        device: Device the input tensors are moved to before the forward pass.
        batch_size: Number of sentences per forward pass (must be >= 1).

    Returns:
        List of per-sentence label sequences aligned to the original words.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    import torch

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(words_list)
    id2label = model.config.id2label  # hoisted: invariant across all tokens
    progress_every = 200
    next_report = 0
    all_preds = []
    for start in range(0, total, batch_size):
        # `start` only takes multiples of batch_size, so testing
        # `start % 200 == 0` would skip almost every report unless
        # batch_size divides 200. Report at the first batch that reaches
        # each 200-sentence mark instead.
        if start >= next_report:
            print(f" {start}/{total} sentences...", end="\r")
            next_report = (start // progress_every + 1) * progress_every

        batch_words = words_list[start: start + batch_size]
        enc = tokenizer(
            batch_words,
            is_split_into_words=True,
            truncation=True,
            max_length=512,
            padding=True,
            return_tensors="pt",
        )
        # Read word_ids() from the encoding before it is unpacked into a
        # plain dict of device tensors.
        word_ids_per_sent = [
            enc.word_ids(batch_index=b) for b in range(len(batch_words))
        ]
        model_input = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            logits = model(**model_input).logits  # (batch, seq, num_labels)
        preds_ids = logits.argmax(-1).cpu().tolist()

        for words, word_ids, token_preds in zip(
            batch_words, word_ids_per_sent, preds_ids
        ):
            word_preds = {}
            for pos, wid in enumerate(word_ids):
                if wid is None or wid in word_preds:
                    continue  # skip [CLS]/[SEP]/padding and non-first subwords
                word_preds[wid] = id2label[token_preds[pos]]
            all_preds.append(
                [word_preds.get(j, "O") for j in range(len(words))]
            )
    print()  # move past the "\r" progress line
    return all_preds
def main(limit=None):
    """
    Evaluate the fine-tuned NER model on the CoNLL test file and print metrics.

    Loads the tokenizer and model from adapters/ner_mongolian, parses
    Data/data/test.txt (both relative to the directory above this file's
    folder), predicts a label per word and prints the seqeval classification
    report plus overall F1, precision and recall.

    Args:
        limit: Evaluate only the first N sentences; None evaluates all.

    Exits with status 1 if the model directory or the test file is missing,
    or if no sentences could be parsed.
    """
    import torch
    from transformers import AutoTokenizer, AutoModelForTokenClassification
    from seqeval.metrics import (classification_report, f1_score,
                                 precision_score, recall_score)

    # abspath first: with a bare relative __file__ the two dirname() calls
    # would collapse to "" and the paths below would silently resolve
    # against the current working directory instead of the project root.
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    test_path = os.path.join(base, "Data", "data", "test.txt")
    model_path = os.path.join(base, "adapters", "ner_mongolian")

    if not os.path.exists(model_path):
        print(f"ERROR: Fine-tuned model not found at {model_path}")
        print("Run fine-tuning first and place model at adapters/ner_mongolian/")
        sys.exit(1)
    # Check the data file up front so a missing file is reported before the
    # model is loaded rather than as a traceback afterwards.
    if not os.path.isfile(test_path):
        print(f"ERROR: Test data not found at {test_path}")
        sys.exit(1)

    print(f"Loading model from {model_path}...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path).to(device)
    model.eval()
    print(f"Model loaded on {device}")

    print(f"Parsing {test_path}...")
    sentences, true_labels = parse_conll(test_path, limit=limit)
    print(f"Sentences: {len(sentences)}")
    if not sentences:
        # Metrics over zero sentences carry no information.
        print("ERROR: No sentences parsed from test data; nothing to evaluate.")
        sys.exit(1)

    print("Running token-level prediction...")
    pred_labels = predict_tokens(sentences, tokenizer, model, device)

    print("\n" + "=" * 50)
    print("NER EVALUATION RESULTS (Token-Level, seqeval)")
    print("=" * 50)
    print(classification_report(true_labels, pred_labels))
    print(f"Overall F1: {f1_score(true_labels, pred_labels):.4f}")
    print(f"Overall Precision: {precision_score(true_labels, pred_labels):.4f}")
    print(f"Overall Recall: {recall_score(true_labels, pred_labels):.4f}")
    print("=" * 50)
if __name__ == "__main__":
    # Command-line entry point: the only option is an optional sentence cap.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Evaluate on first N sentences only",
    )
    main(cli.parse_args().limit)