# ga-POS-tagger / evaluate.py
# (Hugging Face upload metadata: laurencassidy — "Upload 14 files", commit 1170fda verified)
"""
Evaluate POS taggers on TwittIrish test set.
Compares:
1. Averaged Perceptron (our model)
2. Stanza (SOTA for Irish UD)
3. gaBERT-based tagger (if available)
"""
import json
from pathlib import Path
from collections import defaultdict
from train_perceptron import parse_conllu, PerceptronTagger
def classification_report(y_true, y_pred, labels=None):
    """Compute per-tag precision/recall/F1 plus overall accuracy and macro averages.

    Args:
        y_true: Sequence of gold POS tags.
        y_pred: Sequence of predicted POS tags, aligned with ``y_true``.
        labels: Optional list of labels to report on; defaults to every label
            observed in either sequence.

    Returns:
        Dict with keys ``per_tag`` (label -> precision/recall/f1/support),
        ``accuracy``, ``macro_precision``, ``macro_recall`` and ``macro_f1``.
        Empty inputs produce zeroed summary metrics instead of raising
        ZeroDivisionError (the original crashed on ``len(y_true) == 0``).
    """
    if labels is None:
        labels = sorted(set(y_true) | set(y_pred))
    metrics = {}
    for label in labels:
        tp = sum(1 for t, p in zip(y_true, y_pred) if t == label and p == label)
        fp = sum(1 for t, p in zip(y_true, y_pred) if t != label and p == label)
        fn = sum(1 for t, p in zip(y_true, y_pred) if t == label and p != label)
        # Guard each ratio: a label may never be predicted (precision) or
        # never occur in the gold data (recall).
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            # support = frequency of the label in the gold standard
            'support': sum(1 for t in y_true if t == label)
        }
    # Overall accuracy; 0.0 for empty input rather than dividing by zero.
    n_total = len(y_true)
    accuracy = (sum(1 for t, p in zip(y_true, y_pred) if t == p) / n_total) if n_total else 0.0
    # Macro averages: unweighted mean over labels; 0.0 when there are no labels.
    n_labels = len(metrics)
    macro_precision = sum(m['precision'] for m in metrics.values()) / n_labels if n_labels else 0.0
    macro_recall = sum(m['recall'] for m in metrics.values()) / n_labels if n_labels else 0.0
    macro_f1 = sum(m['f1'] for m in metrics.values()) / n_labels if n_labels else 0.0
    return {
        'per_tag': metrics,
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1
    }
def print_report(report, model_name):
    """Pretty-print a classification report (top 15 tags by support) to stdout."""
    banner = "=" * 60
    rule = "-" * 52
    print("\n" + banner)
    print(" " + model_name)
    print(banner)
    header = "{:<12} {:>10} {:>10} {:>10} {:>10}".format(
        "Tag", "Precision", "Recall", "F1", "Support")
    print("\n" + header)
    print(rule)
    # Most frequent tags first (stable sort keeps ties in insertion order,
    # matching sorted(..., reverse=True) on the same key).
    by_support = sorted(report['per_tag'].items(),
                        key=lambda item: -item[1]['support'])
    row = "{:<12} {:>10.3f} {:>10.3f} {:>10.3f} {:>10}"
    for tag, m in by_support[:15]:
        print(row.format(tag, m['precision'], m['recall'], m['f1'], m['support']))
    print(rule)
    print("{:<12} {:>42.3f}".format("Accuracy", report['accuracy']))
    print("{:<12} {:>10.3f} {:>10.3f} {:>10.3f}".format(
        "Macro Avg", report['macro_precision'],
        report['macro_recall'], report['macro_f1']))
def evaluate_perceptron(test_sents, model_path):
    """Tag each test sentence with the saved averaged-perceptron model
    and return its classification report.

    test_sents is a list of sentences, each a list of (token, gold_tag) pairs.
    """
    tagger = PerceptronTagger.load(model_path)
    gold, guessed = [], []
    for sent in test_sents:
        words = [word for word, _ in sent]
        gold.extend(tag for _, tag in sent)
        # tagger.tag returns (token, tag) pairs; keep only the tags.
        guessed.extend(tag for _, tag in tagger.tag(words))
    return classification_report(gold, guessed)
def evaluate_stanza(test_sents):
    """Evaluate the Stanza Irish UPOS tagger on pre-tokenised sentences.

    Returns a classification report, or None when stanza is not installed.
    Sentences whose predicted token count differs from the gold count are
    skipped so the two label sequences stay aligned.
    """
    try:
        import stanza
    except ImportError:
        print("Stanza not installed. Run: pip install stanza")
        return None
    # Best-effort model download; failures (offline, already cached) are ignored.
    try:
        stanza.download('ga', processors='tokenize,pos', verbose=False)
    except Exception:
        pass
    nlp = stanza.Pipeline('ga', processors='tokenize,pos',
                          tokenize_pretokenized=True, verbose=False)
    gold, guessed = [], []
    for sent in test_sents:
        words = [word for word, _ in sent]
        gold_tags = [tag for _, tag in sent]
        # Pretokenized mode expects a list of token lists.
        doc = nlp([words])
        pred_tags = [w.upos for s in doc.sentences for w in s.words]
        if len(pred_tags) != len(gold_tags):
            # Alignment lost (e.g. multiword token splitting); skip sentence.
            continue
        gold.extend(gold_tags)
        guessed.extend(pred_tags)
    return classification_report(gold, guessed)
def evaluate_gabert(test_sents):
    """Placeholder for a gaBERT-based POS tagger; always returns None for now.

    A future implementation would fine-tune DCU-NLP/bert-base-irish-cased-v1
    for UPOS tagging and evaluate it here.
    """
    # TODO: implement gaBERT fine-tuning + evaluation.
    return None
def main():
    """Evaluate available taggers on the TwittIrish test set, print reports,
    and save an accuracy/macro-F1 comparison to comparison_results.json."""
    here = Path(__file__).parent
    data_dir = here.parent / 'data' / 'UD_Irish-TwittIrish-master'
    model_path = here / 'perceptron_model.pkl'

    print("Loading test data...")
    test_sents = parse_conllu(data_dir / 'ga_twittirish-ud-test.conllu')
    print(f" {len(test_sents)} test sentences")

    results = {}

    # 1. Our averaged perceptron (requires a previously trained model file).
    print("\nEvaluating Averaged Perceptron...")
    if not model_path.exists():
        print(" Model not found. Run train_perceptron.py first.")
    else:
        perceptron_report = evaluate_perceptron(test_sents, model_path)
        print_report(perceptron_report, "Averaged Perceptron (Ours)")
        results['perceptron'] = {
            'accuracy': perceptron_report['accuracy'],
            'macro_f1': perceptron_report['macro_f1'],
        }

    # 2. Stanza baseline (returns None when stanza is unavailable).
    print("\nEvaluating Stanza...")
    stanza_report = evaluate_stanza(test_sents)
    if stanza_report:
        print_report(stanza_report, "Stanza (SOTA)")
        results['stanza'] = {
            'accuracy': stanza_report['accuracy'],
            'macro_f1': stanza_report['macro_f1'],
        }

    # Side-by-side summary table.
    banner = "=" * 60
    print("\n" + banner)
    print(" COMPARISON SUMMARY")
    print(banner)
    print(f"\n{'Model':<25} {'Accuracy':>12} {'Macro F1':>12}")
    print("-" * 50)
    for model, scores in results.items():
        print(f"{model:<25} {scores['accuracy']:>12.3f} {scores['macro_f1']:>12.3f}")

    # Persist the numbers for later comparison.
    results_path = here / 'comparison_results.json'
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {results_path}")
# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    main()