# ga-POS-tagger / evaluate.py
# (Hugging Face upload metadata: laurencassidy — "Upload 14 files", commit 1170fda verified)
"""
Evaluate POS taggers on TwittIrish test set.
Compares:
1. Averaged Perceptron (our model)
2. Stanza (SOTA for Irish UD)
3. gaBERT-based tagger (if available)
"""
import json
from pathlib import Path
from collections import defaultdict
from train_perceptron import parse_conllu, PerceptronTagger
def classification_report(y_true, y_pred, labels=None):
    """Compute per-tag precision/recall/F1 plus overall accuracy and macro averages.

    Args:
        y_true: Sequence of gold POS tags.
        y_pred: Sequence of predicted POS tags, aligned with ``y_true``.
        labels: Optional list of labels to report on; defaults to every label
            observed in either sequence.

    Returns:
        Dict with keys ``per_tag`` (label -> precision/recall/f1/support),
        ``accuracy``, ``macro_precision``, ``macro_recall`` and ``macro_f1``.
        Empty inputs produce zeroed summary metrics instead of raising
        ZeroDivisionError (the original crashed on ``len(y_true) == 0``).
    """
    if labels is None:
        labels = sorted(set(y_true) | set(y_pred))
    metrics = {}
    for label in labels:
        tp = sum(1 for t, p in zip(y_true, y_pred) if t == label and p == label)
        fp = sum(1 for t, p in zip(y_true, y_pred) if t != label and p == label)
        fn = sum(1 for t, p in zip(y_true, y_pred) if t == label and p != label)
        # Guard each ratio: a label may never be predicted (precision) or
        # never occur in the gold data (recall).
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            # support = frequency of the label in the gold standard
            'support': sum(1 for t in y_true if t == label)
        }
    # Overall accuracy; 0.0 for empty input rather than dividing by zero.
    n_total = len(y_true)
    accuracy = (sum(1 for t, p in zip(y_true, y_pred) if t == p) / n_total) if n_total else 0.0
    # Macro averages: unweighted mean over labels; 0.0 when there are no labels.
    n_labels = len(metrics)
    macro_precision = sum(m['precision'] for m in metrics.values()) / n_labels if n_labels else 0.0
    macro_recall = sum(m['recall'] for m in metrics.values()) / n_labels if n_labels else 0.0
    macro_f1 = sum(m['f1'] for m in metrics.values()) / n_labels if n_labels else 0.0
    return {
        'per_tag': metrics,
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1
    }
def print_report(report, model_name):
    """Pretty-print a classification report (top 15 tags by support) to stdout."""
    banner = "=" * 60
    rule = "-" * 52
    print("\n" + banner)
    print(" " + model_name)
    print(banner)
    header = "{:<12} {:>10} {:>10} {:>10} {:>10}".format(
        "Tag", "Precision", "Recall", "F1", "Support")
    print("\n" + header)
    print(rule)
    # Most frequent tags first (stable sort keeps ties in insertion order,
    # matching sorted(..., reverse=True) on the same key).
    by_support = sorted(report['per_tag'].items(),
                        key=lambda item: -item[1]['support'])
    row = "{:<12} {:>10.3f} {:>10.3f} {:>10.3f} {:>10}"
    for tag, m in by_support[:15]:
        print(row.format(tag, m['precision'], m['recall'], m['f1'], m['support']))
    print(rule)
    print("{:<12} {:>42.3f}".format("Accuracy", report['accuracy']))
    print("{:<12} {:>10.3f} {:>10.3f} {:>10.3f}".format(
        "Macro Avg", report['macro_precision'],
        report['macro_recall'], report['macro_f1']))
def evaluate_perceptron(test_sents, model_path):
    """Tag each test sentence with the saved averaged-perceptron model
    and return its classification report.

    test_sents is a list of sentences, each a list of (token, gold_tag) pairs.
    """
    tagger = PerceptronTagger.load(model_path)
    gold, guessed = [], []
    for sent in test_sents:
        words = [word for word, _ in sent]
        gold.extend(tag for _, tag in sent)
        # tagger.tag returns (token, tag) pairs; keep only the tags.
        guessed.extend(tag for _, tag in tagger.tag(words))
    return classification_report(gold, guessed)
def evaluate_stanza(test_sents):
    """Evaluate the Stanza Irish UPOS tagger on pre-tokenised sentences.

    Returns a classification report, or None when stanza is not installed.
    Sentences whose predicted token count differs from the gold count are
    skipped so the two label sequences stay aligned.
    """
    try:
        import stanza
    except ImportError:
        print("Stanza not installed. Run: pip install stanza")
        return None
    # Best-effort model download; failures (offline, already cached) are ignored.
    try:
        stanza.download('ga', processors='tokenize,pos', verbose=False)
    except Exception:
        pass
    nlp = stanza.Pipeline('ga', processors='tokenize,pos',
                          tokenize_pretokenized=True, verbose=False)
    gold, guessed = [], []
    for sent in test_sents:
        words = [word for word, _ in sent]
        gold_tags = [tag for _, tag in sent]
        # Pretokenized mode expects a list of token lists.
        doc = nlp([words])
        pred_tags = [w.upos for s in doc.sentences for w in s.words]
        if len(pred_tags) != len(gold_tags):
            # Alignment lost (e.g. multiword token splitting); skip sentence.
            continue
        gold.extend(gold_tags)
        guessed.extend(pred_tags)
    return classification_report(gold, guessed)
def evaluate_gabert(test_sents):
    """Placeholder for a gaBERT-based POS tagger; always returns None for now.

    A future implementation would fine-tune DCU-NLP/bert-base-irish-cased-v1
    for UPOS tagging and evaluate it here.
    """
    # TODO: implement gaBERT fine-tuning + evaluation.
    return None
def main():
    """Evaluate available taggers on the TwittIrish test set, print reports,
    and save an accuracy/macro-F1 comparison to comparison_results.json."""
    here = Path(__file__).parent
    data_dir = here.parent / 'data' / 'UD_Irish-TwittIrish-master'
    model_path = here / 'perceptron_model.pkl'

    print("Loading test data...")
    test_sents = parse_conllu(data_dir / 'ga_twittirish-ud-test.conllu')
    print(f" {len(test_sents)} test sentences")

    results = {}

    # 1. Our averaged perceptron (requires a previously trained model file).
    print("\nEvaluating Averaged Perceptron...")
    if not model_path.exists():
        print(" Model not found. Run train_perceptron.py first.")
    else:
        perceptron_report = evaluate_perceptron(test_sents, model_path)
        print_report(perceptron_report, "Averaged Perceptron (Ours)")
        results['perceptron'] = {
            'accuracy': perceptron_report['accuracy'],
            'macro_f1': perceptron_report['macro_f1'],
        }

    # 2. Stanza baseline (returns None when stanza is unavailable).
    print("\nEvaluating Stanza...")
    stanza_report = evaluate_stanza(test_sents)
    if stanza_report:
        print_report(stanza_report, "Stanza (SOTA)")
        results['stanza'] = {
            'accuracy': stanza_report['accuracy'],
            'macro_f1': stanza_report['macro_f1'],
        }

    # Side-by-side summary table.
    banner = "=" * 60
    print("\n" + banner)
    print(" COMPARISON SUMMARY")
    print(banner)
    print(f"\n{'Model':<25} {'Accuracy':>12} {'Macro F1':>12}")
    print("-" * 50)
    for model, scores in results.items():
        print(f"{model:<25} {scores['accuracy']:>12.3f} {scores['macro_f1']:>12.3f}")

    # Persist the numbers for later comparison.
    results_path = here / 'comparison_results.json'
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {results_path}")
# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    main()