"""
Evaluate POS taggers on TwittIrish test set.

Compares:
1. Averaged Perceptron (our model)
2. Stanza (SOTA for Irish UD)
3. gaBERT-based tagger (if available)
"""
import json
from collections import defaultdict
from pathlib import Path

from train_perceptron import PerceptronTagger, parse_conllu
def classification_report(y_true, y_pred, labels=None):
    """Compute per-tag precision/recall/F1 plus accuracy and macro averages.

    Args:
        y_true: flat list of gold tags, aligned index-by-index with y_pred.
        y_pred: flat list of predicted tags.
        labels: optional explicit label list; defaults to every tag seen
            in either sequence, sorted.

    Returns:
        dict with keys 'per_tag' (label -> precision/recall/f1/support),
        'accuracy', 'macro_precision', 'macro_recall', 'macro_f1'.
        All metrics are 0 for empty input instead of raising
        ZeroDivisionError (the original crashed on empty y_true or an
        empty label set).
    """
    if labels is None:
        labels = sorted(set(y_true) | set(y_pred))

    # One pass over the aligned pairs instead of three full scans per label
    # (the original was O(n * labels); this is O(n + labels)).
    tp = {label: 0 for label in labels}
    fp = {label: 0 for label in labels}
    fn = {label: 0 for label in labels}
    correct = 0
    for t, p in zip(y_true, y_pred):
        if t == p:
            correct += 1
            if t in tp:
                tp[t] += 1
        else:
            if p in fp:
                fp[p] += 1
            if t in fn:
                fn[t] += 1

    # Support counts the full gold sequence, matching the original's
    # per-label count over y_true.
    support = {label: 0 for label in labels}
    for t in y_true:
        if t in support:
            support[t] += 1

    metrics = {}
    for label in labels:
        predicted = tp[label] + fp[label]
        actual = tp[label] + fn[label]
        precision = tp[label] / predicted if predicted > 0 else 0
        recall = tp[label] / actual if actual > 0 else 0
        f1 = (2 * precision * recall / (precision + recall)
              if (precision + recall) > 0 else 0)
        metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': support[label],
        }

    # Guard both divisions: empty input previously raised ZeroDivisionError.
    n = len(y_true)
    accuracy = correct / n if n else 0
    n_labels = len(metrics)
    macro_precision = (sum(m['precision'] for m in metrics.values()) / n_labels
                       if n_labels else 0)
    macro_recall = (sum(m['recall'] for m in metrics.values()) / n_labels
                    if n_labels else 0)
    macro_f1 = (sum(m['f1'] for m in metrics.values()) / n_labels
                if n_labels else 0)
    return {
        'per_tag': metrics,
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
    }
def print_report(report, model_name):
    """Pretty-print a report dict from classification_report.

    Shows a header with the model name, a table of the 15 most frequent
    tags (precision / recall / F1 / support), then overall accuracy and
    macro-averaged scores.
    """
    bar = '=' * 60
    rule = '-' * 52
    print(f"\n{bar}")
    print(f" {model_name}")
    print(f"{bar}")
    print(f"\n{'Tag':<12} {'Precision':>10} {'Recall':>10} {'F1':>10} {'Support':>10}")
    print(rule)
    # Most common tags first; only the top 15 are displayed.
    by_frequency = sorted(report['per_tag'].items(),
                          key=lambda item: item[1]['support'], reverse=True)
    for tag, m in by_frequency[:15]:
        row = (f"{tag:<12} {m['precision']:>10.3f} {m['recall']:>10.3f} "
               f"{m['f1']:>10.3f} {m['support']:>10}")
        print(row)
    print(rule)
    print(f"{'Accuracy':<12} {report['accuracy']:>42.3f}")
    macro_line = (f"{'Macro Avg':<12} {report['macro_precision']:>10.3f} "
                  f"{report['macro_recall']:>10.3f} "
                  f"{report['macro_f1']:>10.3f}")
    print(macro_line)
def evaluate_perceptron(test_sents, model_path):
    """Score the saved averaged-perceptron tagger on gold sentences.

    Args:
        test_sents: list of sentences, each a list of (token, tag) pairs.
        model_path: path to a pickled model loadable by PerceptronTagger.load.

    Returns:
        report dict from classification_report over all tokens.
    """
    tagger = PerceptronTagger.load(model_path)
    gold, predictions = [], []
    for sent in test_sents:
        words = [word for word, _ in sent]
        gold.extend(tag for _, tag in sent)
        predictions.extend(tag for _, tag in tagger.tag(words))
    return classification_report(gold, predictions)
def evaluate_stanza(test_sents):
    """Evaluate the Stanza Irish ('ga') UPOS tagger on gold sentences.

    Args:
        test_sents: list of sentences, each a list of (token, tag) pairs.

    Returns:
        report dict from classification_report, or None when stanza is
        not installed or no sentence could be aligned with the gold tags.
    """
    try:
        import stanza
    except ImportError:
        print("Stanza not installed. Run: pip install stanza")
        return None
    # Best-effort model download; failures (e.g. offline) are ignored so
    # Pipeline() can fall back to an already-cached model.
    try:
        stanza.download('ga', processors='tokenize,pos', verbose=False)
    except Exception:
        pass
    nlp = stanza.Pipeline('ga', processors='tokenize,pos', tokenize_pretokenized=True, verbose=False)
    y_true = []
    y_pred = []
    for sentence in test_sents:
        tokens = [t for t, _ in sentence]
        true_tags = [tag for _, tag in sentence]
        # Stanza expects list of lists for pretokenized input.
        doc = nlp([tokens])
        pred_tags = [word.upos for sent in doc.sentences for word in sent.words]
        # Skip sentences Stanza split differently: metrics need a strict
        # 1:1 alignment between gold and predicted tags.
        if len(pred_tags) == len(true_tags):
            y_true.extend(true_tags)
            y_pred.extend(pred_tags)
    # BUG FIX: if every sentence was skipped (or test_sents was empty),
    # classification_report divided by zero. Bail out explicitly instead.
    if not y_true:
        print("No alignable sentences; skipping Stanza evaluation.")
        return None
    return classification_report(y_true, y_pred)
def evaluate_gabert(test_sents):
    """Placeholder for a gaBERT-based POS tagger evaluation.

    TODO: fine-tune DCU-NLP/bert-base-irish-cased-v1 for the POS tagging
    task and evaluate it here. Until then this always returns None so
    callers can treat the model as unavailable.
    """
    return None
def main():
    """Load the TwittIrish test set, run every available tagger, and
    print/save a side-by-side comparison."""
    here = Path(__file__).parent
    data_dir = here.parent / 'data' / 'UD_Irish-TwittIrish-master'
    model_path = here / 'perceptron_model.pkl'

    print("Loading test data...")
    test_sents = parse_conllu(data_dir / 'ga_twittirish-ud-test.conllu')
    print(f" {len(test_sents)} test sentences")

    results = {}

    # Our averaged perceptron — requires a previously trained model file.
    print("\nEvaluating Averaged Perceptron...")
    if not model_path.exists():
        print(" Model not found. Run train_perceptron.py first.")
    else:
        report = evaluate_perceptron(test_sents, model_path)
        print_report(report, "Averaged Perceptron (Ours)")
        results['perceptron'] = {
            'accuracy': report['accuracy'],
            'macro_f1': report['macro_f1'],
        }

    # Stanza baseline — evaluate_stanza returns None when unavailable.
    print("\nEvaluating Stanza...")
    stanza_report = evaluate_stanza(test_sents)
    if stanza_report:
        print_report(stanza_report, "Stanza (SOTA)")
        results['stanza'] = {
            'accuracy': stanza_report['accuracy'],
            'macro_f1': stanza_report['macro_f1'],
        }

    # Side-by-side summary of whichever models actually produced scores.
    print("\n" + "=" * 60)
    print(" COMPARISON SUMMARY")
    print("=" * 60)
    print(f"\n{'Model':<25} {'Accuracy':>12} {'Macro F1':>12}")
    print("-" * 50)
    for name, scores in results.items():
        print(f"{name:<25} {scores['accuracy']:>12.3f} {scores['macro_f1']:>12.3f}")

    # Persist machine-readable results next to this script.
    results_path = here / 'comparison_results.json'
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {results_path}")
# Script entry point: run the full evaluation when executed directly.
if __name__ == '__main__':
    main()