File size: 5,101 Bytes
7726529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1c327f
 
 
 
 
 
 
 
 
 
 
7726529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import sys
import logging

# Add the project root to the python path so we can import nlp_core
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from nlp_core.ner_engine import NEREngine
from nlp_core.preprocessing import Preprocessor

def extract_entities_from_conll(lines):
    """
    Parses the CoNLL-formatted lines of one sentence using BIO tags.

    Each usable line must have at least four whitespace-separated columns;
    the first column is taken as the token and the last as the tag. Lines
    with fewer columns are skipped entirely.

    Returns a tuple (text, entities) where text is the tokens joined by
    single spaces and entities is a list of (type, string) pairs in order
    of appearance.
    """
    tokens = []
    found = []
    open_label = None
    open_tokens = []

    def close_open_entity():
        # Only a non-empty label counts as an entity in progress.
        if open_label:
            found.append((open_label, " ".join(open_tokens)))

    for raw in lines:
        columns = raw.strip().split()
        if len(columns) < 4:
            continue

        token, tag = columns[0], columns[-1]
        tokens.append(token)
        prefix, label = tag[:2], tag[2:]

        # Continuation of the entity currently being built.
        if prefix == "I-" and open_label == label:
            open_tokens.append(token)
            continue

        # Anything else ends the entity in progress (if there is one).
        close_open_entity()
        if prefix in ("B-", "I-"):
            # "B-" always opens a new entity; an "I-" whose type does not
            # match the open one is treated as the start of a new entity.
            open_label, open_tokens = label, [token]
        else:
            open_label, open_tokens = None, []

    # Emit an entity that runs to the end of the sentence.
    close_open_entity()

    return " ".join(tokens), found

def _normalize_entity_text(value):
    """
    Normalizes an entity string for comparison: removes spaces and dots and
    lowercases, so Д.Гантулга and Д. Гантулга both become дгантулга.
    """
    return value.replace(" ", "").replace(".", "").lower()


def evaluate_ner(test_file_path, limit=None):
    """
    Evaluates the NER engine on a CoNLL-formatted test file using
    entity-level exact matching, prints a report and returns the metrics.

    Args:
        test_file_path: Path to a UTF-8 CoNLL file whose sentences are
            separated by blank lines.
        limit: Maximum number of sentences to evaluate. None (the default)
            evaluates every sentence; 0 evaluates none.

    Returns:
        A dict with the keys "sentences", "true_positives",
        "false_positives", "false_negatives", "precision", "recall" and "f1".
    """
    print(f"Loading test data from {test_file_path}...")

    with open(test_file_path, "r", encoding="utf-8") as f:
        blocks = f.read().split("\n\n")

    sentences = []
    for block in blocks:
        if not block.strip():
            continue
        text, true_ents = extract_entities_from_conll(block.split("\n"))
        if text:
            sentences.append((text, true_ents))

    # Compare against None explicitly so that limit=0 is honoured instead of
    # being silently treated as "no limit".
    if limit is not None:
        sentences = sentences[:limit]

    print(f"Loaded {len(sentences)} test sentences.")

    preprocessor = Preprocessor()
    ner = NEREngine()

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    print("Running NER evaluation (this may take a while)...")
    for i, (text, true_ents) in enumerate(sentences):
        if i > 0 and i % 50 == 0:
            print(f"Processed {i}/{len(sentences)} sentences...")

        # Clean text specifically for NER
        clean_text = preprocessor.preprocess_nlp(text)

        # Each result is read through its .entity_group and .word attributes.
        predicted_results = ner.recognize(clean_text)

        # Format predictions into (type, string) normalized for fair comparison
        pred_ents = [
            (res.entity_group, _normalize_entity_text(res.word))
            for res in predicted_results
        ]

        # Format true entities similarly — skip MISC since the fine-tuned model
        # does not produce MISC labels (removed from training set)
        true_ents_formatted = [
            (t, _normalize_entity_text(w))
            for t, w in true_ents
            if t != "MISC"
        ]

        # Match each gold entity against at most one prediction; removing the
        # matched prediction keeps duplicates from being counted twice.
        for true_e in true_ents_formatted:
            if true_e in pred_ents:
                true_positives += 1
                pred_ents.remove(true_e)
            else:
                false_negatives += 1

        # Whatever is left in pred_ents are false positives
        false_positives += len(pred_ents)

    predicted_total = true_positives + false_positives
    gold_total = true_positives + false_negatives
    precision = true_positives / predicted_total if predicted_total > 0 else 0
    recall = true_positives / gold_total if gold_total > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("\n" + "="*40)
    print("NER EVALUATION RESULTS (Entity-Level Exact Match)")
    print("="*40)
    print(f"Sentences Evaluated: {len(sentences)}")
    print(f"True Positives:      {true_positives}")
    print(f"False Positives:     {false_positives}")
    print(f"False Negatives:     {false_negatives}")
    print("-" * 40)
    print(f"Precision:           {precision:.4f}")
    print(f"Recall:              {recall:.4f}")
    print(f"F1 Score:            {f1:.4f}")
    print("="*40)

    # Hand the numbers back so callers can use them programmatically rather
    # than having to parse the printed report.
    return {
        "sentences": len(sentences),
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

if __name__ == "__main__":
    # Resolve the project root from the absolute script path, matching the
    # sys.path setup at the top of the file. A relative __file__ (possible on
    # interpreters older than 3.9) would otherwise make the nested dirname()
    # calls collapse to "" and look for data/ under the current directory.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    test_path = os.path.join(project_root, "data", "test.txt")
    if not os.path.exists(test_path):
        print(f"Error: Could not find CoNLL test file at {test_path}")
    else:
        # Run on the first 500 sentences to get a quick estimate.
        # Change to limit=None to run on the entire test set.
        evaluate_ner(test_path, limit=500)