Spaces:

Jaykay73
/

nextword-pidgin-api

Sleeping

App Files Files Community

nextword-pidgin-api / src /utils.py

JermaineAI

Fix API model loading: Copy src directory and update Dockerfile

ad18db6 about 2 months ago

raw

history blame contribute delete

2.12 kB

	"""
	Utility functions for the next-word prediction system.
	"""

	from typing import List, Tuple
	import math


	def format_predictions(predictions: List[Tuple[str, float]], show_percent: bool = True) -> str:
	    """
	    Format prediction results for display.

	    Each prediction becomes one line made of a leading space, the word, a
	    colon and the formatted probability. Lines are joined with newlines and
	    there is no trailing newline.

	    Args:
	        predictions: List of (word, probability) tuples.
	        show_percent: If True, show the probability multiplied by 100 with
	            two decimals and a trailing ``%``; otherwise show the raw
	            probability with six decimals.

	    Returns:
	        Formatted string (empty when ``predictions`` is empty).
	    """
	    # The output format does not depend on the item, so choose it once
	    # instead of re-testing the flag on every iteration.
	    if show_percent:
	        lines = [f" {word}: {prob * 100:.2f}%" for word, prob in predictions]
	    else:
	        lines = [f" {word}: {prob:.6f}" for word, prob in predictions]
	    return "\n".join(lines)


	def calculate_entropy(probabilities: List[float]) -> float:
	    """
	    Calculate entropy of a probability distribution.

	    H(X) = -sum(p * log2(p))

	    Entries that are not strictly positive are skipped, which implements
	    the usual convention that ``0 * log2(0)`` contributes nothing. The
	    input is used as given: it is neither normalised nor checked to sum
	    to one.

	    Args:
	        probabilities: List of probabilities.

	    Returns:
	        Entropy in bits (0.0 for an empty list).
	    """
	    entropy = 0.0
	    for p in probabilities:
	        # log2 is undefined at 0 (and for negatives); such terms are ignored.
	        if p > 0:
	            entropy -= p * math.log2(p)
	    return entropy


	def top_k_accuracy(
	    model,
	    test_sentences: List[List[str]],
	    k: int = 5
	) -> float:
	    """
	    Calculate top-k accuracy on test data.

	    Measures what fraction of true next words appear in top-k predictions.
	    Every position from the third token onward is scored: the two
	    preceding tokens form the context and the token itself is the target.
	    Sentences with fewer than three tokens contribute nothing.

	    Args:
	        model: Object exposing ``get_context_distribution(w1, w2, top_k=k)``
	            that returns an iterable of (word, score) pairs, e.g. a trained
	            TrigramLM instance.
	        test_sentences: List of tokenized sentences with markers.
	        k: Number of top predictions to consider.

	    Returns:
	        Accuracy as fraction between 0 and 1; 0.0 when there is nothing
	        to score.
	    """
	    correct = 0
	    total = 0

	    for sentence in test_sentences:
	        # Sliding window over consecutive trigrams; zip stops at the
	        # shortest slice, so sentences shorter than three tokens yield
	        # no windows and need no explicit length check.
	        for w1, w2, true_word in zip(sentence, sentence[1:], sentence[2:]):
	            # Get top-k predictions
	            preds = model.get_context_distribution(w1, w2, top_k=k)
	            pred_words = [w for w, _ in preds]

	            if true_word in pred_words:
	                correct += 1
	            # Every scored position counts, whether or not it was a hit.
	            total += 1

	    return correct / total if total > 0 else 0.0