# Source: AryanPrakhar — "Add inference.py" (commit 48c6574, verified)
"""
Programming Paradigm Classification - Inference Script
Uses trained SVM classifier and sentence embeddings for predictions
"""
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import sys
class ProgrammingParadigmClassifier:
    """Predict the programming paradigm discussed in a text snippet.

    Wraps a pre-trained scikit-learn SVM over sentence embeddings.
    A near-tie between the top two classes is reported as "<a> or <b>",
    and low-confidence predictions as "unclear".
    """

    # Decision thresholds used by predict(); named here instead of being
    # magic numbers inline so they can be tuned in one place.
    VIABLE_PROB = 0.25       # a class is "viable" above this probability
    AMBIGUOUS_MARGIN = 0.08  # two viable classes closer than this -> report both
    MIN_CONFIDENCE = 0.30    # top probability below this -> "unclear"
    MIN_MARGIN = 0.10        # top-two margin below this -> "unclear"

    def __init__(self, classifier_path='svm_classifier.pkl',
                 model_name_path='sentence_model_name.txt',
                 confidence_threshold=0.55):
        """Load the trained classifier and its matching embedding model.

        Args:
            classifier_path: Path to the pickled scikit-learn classifier.
            model_name_path: Text file holding the SentenceTransformer model
                name that was used during training.
            confidence_threshold: Stored and displayed only.
                NOTE(review): this value is currently NOT consulted by
                predict(), which uses the class-level thresholds above.
        """
        print("Loading trained SVM classifier...")
        # SECURITY: pickle.load executes arbitrary code on load — only use
        # classifier files from a trusted source.
        with open(classifier_path, 'rb') as f:
            self.classifier = pickle.load(f)
        # Load the model name that was used during training so embeddings
        # at inference time match the training-time embedding space.
        print("Reading embedding model name from training...")
        with open(model_name_path, 'r') as f:
            model_name = f.read().strip()
        print(f"Loading sentence embedding model: {model_name}...")
        self.model = SentenceTransformer(model_name)
        self.confidence_threshold = confidence_threshold
        print(f"Models loaded! (Confidence threshold: {confidence_threshold})\n")

    def _class_probabilities(self, embedding):
        """Return one probability per class for a single embedding row."""
        if hasattr(self.classifier, 'predict_proba'):
            # CalibratedClassifierCV (or similar) — calibrated probabilities.
            return self.classifier.predict_proba(embedding)[0]
        # LinearSVC has no predict_proba; convert decision scores via softmax.
        scores = np.atleast_1d(self.classifier.decision_function(embedding)[0])
        if scores.shape[0] == 1:
            # BUGFIX: for binary classifiers decision_function yields a single
            # margin score; expand to two scores so the softmax produces one
            # probability per class (positive score favors classes_[1], per
            # the scikit-learn convention).
            scores = np.array([-scores[0], scores[0]])
        exp_scores = np.exp(scores - np.max(scores))  # shift for stability
        return exp_scores / exp_scores.sum()

    def predict(self, text):
        """Predict the programming paradigm of *text* with uncertainty handling.

        Returns:
            (prediction, prob_dict, max_prob): prediction is a class label,
            "<a> or <b>" when two classes are nearly tied, or "unclear" when
            confidence is too low; prob_dict maps class label -> probability;
            max_prob is the top class probability.
        """
        embedding = self.model.encode([text])
        probs = self._class_probabilities(embedding)
        prob_dict = dict(zip(self.classifier.classes_, probs))

        # Rank classes by descending probability for the margin calculation.
        order = np.argsort(probs)[::-1]
        ranked_probs = probs[order]
        ranked_classes = self.classifier.classes_[order]

        max_prob = ranked_probs[0]
        second_max = ranked_probs[1] if len(ranked_probs) > 1 else 0.0
        margin = max_prob - second_max
        top_class = ranked_classes[0]
        second_class = ranked_classes[1] if len(ranked_classes) > 1 else None

        if (max_prob > self.VIABLE_PROB and second_max > self.VIABLE_PROB
                and margin < self.AMBIGUOUS_MARGIN):
            # Both classes are viable and nearly tied — report both.
            prediction = f"{top_class} or {second_class}"
        elif max_prob < self.MIN_CONFIDENCE or margin < self.MIN_MARGIN:
            prediction = "unclear"
        else:
            prediction = top_class
        return prediction, prob_dict, max_prob

    def predict_batch(self, texts):
        """Predict paradigms for each text; returns a list of result dicts."""
        results = []
        for text in texts:
            prediction, probs, max_prob = self.predict(text)
            results.append({
                'text': text,
                'prediction': prediction,
                'probabilities': probs,
                'confidence': max_prob,
            })
        return results

    def display_prediction(self, text, prediction, probs, max_prob):
        """Pretty-print one prediction.

        Args:
            text: The classified input (truncated to 100 chars for display).
            prediction: Label string returned by predict().
            probs: Mapping of class label -> probability.
            max_prob: Unused; kept for a backward-compatible signature.
        """
        print(f"\nInput: {text[:100]}{'...' if len(text) > 100 else ''}")
        # Format output for dual, unclear, or single predictions.
        if " or " in str(prediction):
            print(f"Predicted Paradigm: {prediction} (ambiguous - close call!)")
        elif prediction == "unclear":
            print(f"Predicted Paradigm: {prediction} (too uncertain)")
        else:
            print(f"Predicted Paradigm: {prediction} (confident)")
        # Show the top-2 classes and their margin.
        sorted_items = sorted(probs.items(), key=lambda kv: kv[1], reverse=True)
        top_class, top_prob = sorted_items[0]
        second_class, second_prob = sorted_items[1] if len(sorted_items) > 1 else (None, 0.0)
        margin = top_prob - second_prob
        print(f"Max: {top_class} ({top_prob:.3f}), 2nd: {second_class} ({second_prob:.3f}), Margin: {margin:.3f}")
        print("Class Probabilities:")
        for label, prob in sorted_items:
            print(f" {label:12s}: {prob:7.3f}")
        print("-" * 70)
def main():
    """Run the demo inference pipeline on a fixed set of example texts."""
    banner = "=" * 70
    print(banner)
    print("Programming Paradigm Classification - Inference")
    print(banner)

    # Load the trained classifier and its embedding model.
    classifier = ProgrammingParadigmClassifier()

    # Example prompts spanning functional, OOP, and ambiguous phrasing.
    examples = [
        "How do I make this function pure without any side effects?",
        "Why does my class hierarchy have so many levels of inheritance?",
        "What's the best way to center a div in CSS?",
        "This function just loops through the array and updates each element in place.",
        "I'm using lambda functions to transform this list with map and filter.",
        "How do I properly encapsulate private variables in my class?",
        "What's the most efficient way to iterate through this data structure?",
        "Can I use functional composition to chain these operations?",
    ]

    # Classify and display each example.
    for sample in examples:
        label, distribution, confidence = classifier.predict(sample)
        classifier.display_prediction(sample, label, distribution, confidence)

    print("\n" + banner)
    print("Inference complete!")
    print(banner)
def interactive_mode():
    """Classify user-supplied text in a loop until 'quit' or Ctrl-C."""
    banner = "=" * 70
    print(banner)
    print("Programming Paradigm Classifier - Interactive Mode")
    print(banner)
    print("Type 'quit' to exit\n")

    # Load the trained classifier and its embedding model.
    classifier = ProgrammingParadigmClassifier()

    while True:
        try:
            user_text = input("\nEnter text to classify (or 'quit' to exit): ").strip()
            if user_text.lower() == 'quit':
                print("Exiting...")
                break
            if not user_text:
                print("Please enter some text.")
                continue
            label, distribution, confidence = classifier.predict(user_text)
            classifier.display_prediction(user_text, label, distribution, confidence)
        except KeyboardInterrupt:
            # Ctrl-C exits cleanly instead of dumping a traceback.
            print("\n\nExiting...")
            break
        except Exception as err:
            # Keep the loop alive on per-input failures.
            print(f"Error: {err}")
if __name__ == "__main__":
    # `--interactive` as the first argument selects the REPL-style mode;
    # anything else runs the batch demo.
    if "--interactive" in sys.argv[1:2]:
        interactive_mode()
    else:
        main()