Spaces:
Sleeping
Sleeping
"""
evaluate_tokens.py — Token-level seqeval evaluation matching the Colab training metric.

Unlike evaluate.py (which reconstructs text and runs the full NLP pipeline),
this script feeds pre-tokenized CoNLL words directly to the model, ensuring
the evaluation is identical to what Colab measured during training.

Run from NLP-intelligence/:
    python eval/evaluate_tokens.py
    python eval/evaluate_tokens.py --limit 500
"""
| import os, sys, argparse | |
# Put the parent of this file's directory on the import path so that the
# script can be launched directly (e.g. `python eval/evaluate_tokens.py`).
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(_PROJECT_ROOT)

# Entity types this evaluation targets. MISC is excluded because it is not in
# the fine-tuned model.
# NOTE(review): nothing in the visible part of this file reads this constant;
# parse_conll() remaps MISC by substring match instead — confirm it is still needed.
EVAL_LABELS = {"PER", "LOC", "ORG"}
def parse_conll(path, limit=None):
    """
    Read a whitespace-separated CoNLL file into parallel word/label lists.

    A sentence ends at an empty line or at a line starting with "#".
    Token lines must have at least four fields: the first field is the word
    and the last field is the tag. Lines with fewer fields are skipped.
    Any tag containing "MISC" is remapped to "O" so that only the remaining
    entity types are evaluated.

    Args:
        path: Path to the CoNLL file (UTF-8, with or without a BOM).
        limit: Stop after this many sentences. ``None`` or ``0`` means no limit.

    Returns:
        A ``(sentences, labels)`` tuple of equally shaped lists of lists:
        ``sentences[i][j]`` is a word and ``labels[i][j]`` is its tag.
    """
    sentences, labels = [], []
    cur_w, cur_l = [], []

    # "utf-8-sig" strips a leading byte-order mark if present; plain "utf-8"
    # would leave "\ufeff" glued to the first word of the first sentence.
    # Files without a BOM are decoded exactly as before.
    with open(path, encoding="utf-8-sig") as f:
        for line in f:
            line = line.rstrip()

            # Sentence boundary (blank line) or comment line.
            if not line or line.startswith("#"):
                if cur_w:
                    sentences.append(cur_w)
                    labels.append(cur_l)
                    cur_w, cur_l = [], []
                    if limit and len(sentences) >= limit:
                        break
                continue

            parts = line.split()
            if len(parts) < 4:
                # Not a full token row; word and label are dropped together,
                # so the two output lists stay aligned.
                continue

            cur_w.append(parts[0])
            raw = parts[-1]
            # Remap MISC -> O so evaluation is PER/LOC/ORG only.
            cur_l.append("O" if "MISC" in raw else raw)

    # Flush the last sentence when the file does not end with a blank line.
    # (After an early `break` the buffers are already empty.)
    if cur_w:
        sentences.append(cur_w)
        labels.append(cur_l)
    return sentences, labels
def predict_tokens(words_list, tokenizer, model, device, batch_size=32):
    """
    Run token classification on pre-tokenized word lists.

    Each word receives the label predicted at its first sub-token; later
    sub-tokens and positions with no word id (special tokens, padding) are
    ignored. Words that end up with no sub-token in the encoded input (for
    example because the sentence was truncated at 512 tokens) are labelled "O".

    Args:
        words_list: List of sentences, each a list of word strings.
        tokenizer: Callable tokenizer whose result exposes ``word_ids(batch_index=...)``
            and ``items()`` yielding tensors.
        model: Token-classification model; called with the encoded tensors, it
            must return an object with ``.logits`` and provide ``config.id2label``.
        device: Device the input tensors are moved to before the forward pass.
        batch_size: Number of sentences per forward pass; must be >= 1.

    Returns:
        List of per-sentence label sequences aligned to the original words.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import torch

    if batch_size < 1:
        # A negative step would make range() empty and silently return no
        # predictions, which then mismatches the gold labels downstream.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    id2label = model.config.id2label  # loop-invariant lookup
    total = len(words_list)
    report_every = 200
    next_report = 0
    all_preds = []

    for start in range(0, total, batch_size):
        # Report roughly every `report_every` sentences. A modulo test on
        # `start` only fires when the batch size happens to divide 200.
        if start >= next_report:
            print(f" {start}/{total} sentences...", end="\r")
            next_report = start + report_every

        batch_words = words_list[start:start + batch_size]
        enc = tokenizer(
            batch_words,
            is_split_into_words=True,
            truncation=True,
            max_length=512,
            padding=True,
            return_tensors="pt",
        )
        # Collect word ids from the encoding itself; the plain dict built
        # below for the model no longer offers word_ids().
        word_ids_per_sent = [
            enc.word_ids(batch_index=b) for b in range(len(batch_words))
        ]
        model_input = {k: v.to(device) for k, v in enc.items()}

        with torch.no_grad():
            logits = model(**model_input).logits  # (batch, seq, num_labels)
        pred_ids = logits.argmax(-1).cpu().tolist()

        for words, word_ids, row in zip(batch_words, word_ids_per_sent, pred_ids):
            word_preds = {}
            for pos, wid in enumerate(word_ids):
                if wid is None or wid in word_preds:
                    continue  # skip [CLS]/[SEP]/padding and non-first subwords
                word_preds[wid] = id2label[row[pos]]
            all_preds.append([word_preds.get(j, "O") for j in range(len(words))])

    print()  # finish the carriage-return progress line
    return all_preds
def main(limit=None):
    """
    Evaluate the fine-tuned NER model on the CoNLL test set and print seqeval metrics.

    Paths are resolved relative to the parent of this file's directory:
    the model is read from ``adapters/ner_mongolian`` and the test data from
    ``Data/data/test.txt``. Prints a per-type classification report followed by
    overall F1, precision and recall.

    Args:
        limit: Evaluate only the first N sentences; ``None`` evaluates all.

    Exits with status 1 if the model directory or the test file is missing,
    or if no sentences could be parsed from the test file.
    """
    # Resolve from the absolute path so the result does not depend on the
    # current working directory (matches the sys.path setup at module top).
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    test_path = os.path.join(base, "Data", "data", "test.txt")
    model_path = os.path.join(base, "adapters", "ner_mongolian")

    # Cheap checks first: fail before importing torch/transformers or
    # loading model weights.
    if not os.path.exists(model_path):
        print(f"ERROR: Fine-tuned model not found at {model_path}")
        print("Run fine-tuning first and place model at adapters/ner_mongolian/")
        sys.exit(1)
    if not os.path.isfile(test_path):
        print(f"ERROR: Test data not found at {test_path}")
        sys.exit(1)

    import torch
    from transformers import AutoTokenizer, AutoModelForTokenClassification
    from seqeval.metrics import (classification_report, f1_score,
                                 precision_score, recall_score)

    print(f"Loading model from {model_path}...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path).to(device)
    model.eval()  # inference mode for layers such as dropout
    print(f"Model loaded on {device}")

    print(f"Parsing {test_path}...")
    sentences, true_labels = parse_conll(test_path, limit=limit)
    print(f"Sentences: {len(sentences)}")
    if not sentences:
        # Nothing to score; stop here instead of handing empty lists to seqeval.
        print(f"ERROR: No sentences parsed from {test_path}")
        sys.exit(1)

    print("Running token-level prediction...")
    pred_labels = predict_tokens(sentences, tokenizer, model, device)

    print("\n" + "=" * 50)
    print("NER EVALUATION RESULTS (Token-Level, seqeval)")
    print("=" * 50)
    print(classification_report(true_labels, pred_labels))
    print(f"Overall F1: {f1_score(true_labels, pred_labels):.4f}")
    print(f"Overall Precision: {precision_score(true_labels, pred_labels):.4f}")
    print(f"Overall Recall: {recall_score(true_labels, pred_labels):.4f}")
    print("=" * 50)
if __name__ == "__main__":
    # Command-line entry point: the only option is an optional cap on the
    # number of sentences, forwarded to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Evaluate on first N sentences only",
    )
    main(cli.parse_args().limit)