# NLP-intelligence/eval/evaluate.py — entity-level NER evaluation on a CoNLL test file.
# NOTE: the lines below were repository-page residue (author "Nomio4640",
# commit e1c327f, "NER finetune"), preserved here as a comment so the file parses.
import os
import sys
import logging
# Add the project root to the python path so we can import nlp_core
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from nlp_core.ner_engine import NEREngine
from nlp_core.preprocessing import Preprocessor
def extract_entities_from_conll(lines):
    """
    Parse one sentence's worth of CoNLL-formatted lines.

    Each valid row has at least 4 whitespace-separated columns; the first is
    the token and the last is its IOB tag (``B-TYPE`` / ``I-TYPE`` / other).
    Rows with fewer than 4 columns (blank or malformed) are skipped.

    Returns a tuple ``(text, entities)`` where ``text`` is the tokens joined
    by single spaces and ``entities`` is a list of ``(type, string)`` spans.
    """
    tokens = []
    spans = []
    open_type = None       # type of the span currently being built, or None
    open_tokens = []       # tokens accumulated for that span

    for raw in lines:
        cols = raw.strip().split()
        if len(cols) < 4:
            continue  # blank or malformed row — not a token line
        token = cols[0]
        label = cols[-1]
        tokens.append(token)

        if label.startswith("B-"):
            # Explicit span start: close any open span, open a fresh one.
            if open_type:
                spans.append((open_type, " ".join(open_tokens)))
            open_type = label[2:]
            open_tokens = [token]
        elif label.startswith("I-"):
            if open_type == label[2:]:
                # Continuation of the current span.
                open_tokens.append(token)
            else:
                # I- with a different (or no) open type: treat as a new span.
                if open_type:
                    spans.append((open_type, " ".join(open_tokens)))
                open_type = label[2:]
                open_tokens = [token]
        else:
            # O (or anything else): close the open span, if any.
            if open_type:
                spans.append((open_type, " ".join(open_tokens)))
            open_type = None
            open_tokens = []

    # Flush a span that ran to the end of the sentence.
    if open_type:
        spans.append((open_type, " ".join(open_tokens)))

    return " ".join(tokens), spans
def evaluate_ner(test_file_path, limit=None):
    """
    Run an entity-level exact-match NER evaluation over a CoNLL test file.

    Sentences are separated by blank lines in the file. Each sentence is
    preprocessed, run through the NER engine, and its predicted entities
    are matched against the gold entities (type + normalized surface form).
    Prints precision / recall / F1 to stdout.

    :param test_file_path: path to the CoNLL-formatted test file
    :param limit: if truthy, evaluate only the first ``limit`` sentences
    """
    print(f"Loading test data from {test_file_path}...")
    with open(test_file_path, "r", encoding="utf-8") as f:
        raw_blocks = f.read().split("\n\n")

    sentences = []
    for raw in raw_blocks:
        if not raw.strip():
            continue
        sentence_text, gold = extract_entities_from_conll(raw.split("\n"))
        if sentence_text:
            sentences.append((sentence_text, gold))
    if limit:
        sentences = sentences[:limit]
    print(f"Loaded {len(sentences)} test sentences.")

    preprocessor = Preprocessor()
    engine = NEREngine()

    def _norm(surface):
        # Drop spaces and dots before lowercasing so variants such as
        # "Д.Гантулга" and "Д. Гантулга" normalize to the same key.
        return surface.replace(" ", "").replace(".", "").lower()

    tp = 0
    fp = 0
    fn = 0
    print("Running NER evaluation (this may take a while)...")
    for idx, (sentence_text, gold) in enumerate(sentences):
        if idx > 0 and idx % 50 == 0:
            print(f"Processed {idx}/{len(sentences)} sentences...")
        # Clean the text specifically for NER before predicting.
        cleaned = preprocessor.preprocess_nlp(sentence_text)
        predictions = [
            (result.entity_group, _norm(result.word))
            for result in engine.recognize(cleaned)
        ]
        # MISC was removed from the fine-tuning label set, so it is
        # excluded from the gold entities for a fair comparison.
        gold_normalized = [(t, _norm(w)) for t, w in gold if t != "MISC"]

        # Greedy exact matching: each gold entity consumes at most one
        # prediction; leftovers on either side are errors.
        for gold_entity in gold_normalized:
            if gold_entity in predictions:
                tp += 1
                predictions.remove(gold_entity)
            else:
                fn += 1
        fp += len(predictions)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("\n" + "=" * 40)
    print("NER EVALUATION RESULTS (Entity-Level Exact Match)")
    print("=" * 40)
    print(f"Sentences Evaluated: {len(sentences)}")
    print(f"True Positives: {tp}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print("-" * 40)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("=" * 40)
if __name__ == "__main__":
    # Default test set lives at <project root>/data/test.txt, one level up
    # from this script's directory.
    default_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "test.txt")
    if os.path.exists(default_path):
        # Run on the first 500 sentences to get a quick estimate.
        # Change limit=None to run on the entire test set.
        evaluate_ner(default_path, limit=500)
    else:
        print(f"Error: Could not find CoNLL test file at {default_path}")