# Source: AryanPrakhar — "Add inference.py" (commit 48c6574, verified)
"""
Programming Paradigm Classification - Inference Script
Uses trained SVM classifier and sentence embeddings for predictions
"""
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import sys
class ProgrammingParadigmClassifier:
    """Predict the programming paradigm discussed in a text snippet.

    Wraps a pre-trained scikit-learn SVM over sentence embeddings.
    A near-tie between the top two classes is reported as "<a> or <b>",
    and low-confidence predictions as "unclear".
    """

    # Decision thresholds used by predict(); named here instead of being
    # magic numbers inline so they can be tuned in one place.
    VIABLE_PROB = 0.25       # a class is "viable" above this probability
    AMBIGUOUS_MARGIN = 0.08  # two viable classes closer than this -> report both
    MIN_CONFIDENCE = 0.30    # top probability below this -> "unclear"
    MIN_MARGIN = 0.10        # top-two margin below this -> "unclear"

    def __init__(self, classifier_path='svm_classifier.pkl',
                 model_name_path='sentence_model_name.txt',
                 confidence_threshold=0.55):
        """Load the trained classifier and its matching embedding model.

        Args:
            classifier_path: Path to the pickled scikit-learn classifier.
            model_name_path: Text file holding the SentenceTransformer model
                name that was used during training.
            confidence_threshold: Stored and displayed only.
                NOTE(review): this value is currently NOT consulted by
                predict(), which uses the class-level thresholds above.
        """
        print("Loading trained SVM classifier...")
        # SECURITY: pickle.load executes arbitrary code on load — only use
        # classifier files from a trusted source.
        with open(classifier_path, 'rb') as f:
            self.classifier = pickle.load(f)
        # Load the model name that was used during training so embeddings
        # at inference time match the training-time embedding space.
        print("Reading embedding model name from training...")
        with open(model_name_path, 'r') as f:
            model_name = f.read().strip()
        print(f"Loading sentence embedding model: {model_name}...")
        self.model = SentenceTransformer(model_name)
        self.confidence_threshold = confidence_threshold
        print(f"Models loaded! (Confidence threshold: {confidence_threshold})\n")

    def _class_probabilities(self, embedding):
        """Return one probability per class for a single embedding row."""
        if hasattr(self.classifier, 'predict_proba'):
            # CalibratedClassifierCV (or similar) — calibrated probabilities.
            return self.classifier.predict_proba(embedding)[0]
        # LinearSVC has no predict_proba; convert decision scores via softmax.
        scores = np.atleast_1d(self.classifier.decision_function(embedding)[0])
        if scores.shape[0] == 1:
            # BUGFIX: for binary classifiers decision_function yields a single
            # margin score; expand to two scores so the softmax produces one
            # probability per class (positive score favors classes_[1], per
            # the scikit-learn convention).
            scores = np.array([-scores[0], scores[0]])
        exp_scores = np.exp(scores - np.max(scores))  # shift for stability
        return exp_scores / exp_scores.sum()

    def predict(self, text):
        """Predict the programming paradigm of *text* with uncertainty handling.

        Returns:
            (prediction, prob_dict, max_prob): prediction is a class label,
            "<a> or <b>" when two classes are nearly tied, or "unclear" when
            confidence is too low; prob_dict maps class label -> probability;
            max_prob is the top class probability.
        """
        embedding = self.model.encode([text])
        probs = self._class_probabilities(embedding)
        prob_dict = dict(zip(self.classifier.classes_, probs))

        # Rank classes by descending probability for the margin calculation.
        order = np.argsort(probs)[::-1]
        ranked_probs = probs[order]
        ranked_classes = self.classifier.classes_[order]

        max_prob = ranked_probs[0]
        second_max = ranked_probs[1] if len(ranked_probs) > 1 else 0.0
        margin = max_prob - second_max
        top_class = ranked_classes[0]
        second_class = ranked_classes[1] if len(ranked_classes) > 1 else None

        if (max_prob > self.VIABLE_PROB and second_max > self.VIABLE_PROB
                and margin < self.AMBIGUOUS_MARGIN):
            # Both classes are viable and nearly tied — report both.
            prediction = f"{top_class} or {second_class}"
        elif max_prob < self.MIN_CONFIDENCE or margin < self.MIN_MARGIN:
            prediction = "unclear"
        else:
            prediction = top_class
        return prediction, prob_dict, max_prob

    def predict_batch(self, texts):
        """Predict paradigms for each text; returns a list of result dicts."""
        results = []
        for text in texts:
            prediction, probs, max_prob = self.predict(text)
            results.append({
                'text': text,
                'prediction': prediction,
                'probabilities': probs,
                'confidence': max_prob,
            })
        return results

    def display_prediction(self, text, prediction, probs, max_prob):
        """Pretty-print one prediction.

        Args:
            text: The classified input (truncated to 100 chars for display).
            prediction: Label string returned by predict().
            probs: Mapping of class label -> probability.
            max_prob: Unused; kept for a backward-compatible signature.
        """
        print(f"\nInput: {text[:100]}{'...' if len(text) > 100 else ''}")
        # Format output for dual, unclear, or single predictions.
        if " or " in str(prediction):
            print(f"Predicted Paradigm: {prediction} (ambiguous - close call!)")
        elif prediction == "unclear":
            print(f"Predicted Paradigm: {prediction} (too uncertain)")
        else:
            print(f"Predicted Paradigm: {prediction} (confident)")
        # Show the top-2 classes and their margin.
        sorted_items = sorted(probs.items(), key=lambda kv: kv[1], reverse=True)
        top_class, top_prob = sorted_items[0]
        second_class, second_prob = sorted_items[1] if len(sorted_items) > 1 else (None, 0.0)
        margin = top_prob - second_prob
        print(f"Max: {top_class} ({top_prob:.3f}), 2nd: {second_class} ({second_prob:.3f}), Margin: {margin:.3f}")
        print("Class Probabilities:")
        for label, prob in sorted_items:
            print(f" {label:12s}: {prob:7.3f}")
        print("-" * 70)
def main():
    """Run the demo inference pipeline on a fixed set of example texts."""
    banner = "=" * 70
    print(banner)
    print("Programming Paradigm Classification - Inference")
    print(banner)

    # Load the trained classifier and its embedding model.
    classifier = ProgrammingParadigmClassifier()

    # Example prompts spanning functional, OOP, and ambiguous phrasing.
    examples = [
        "How do I make this function pure without any side effects?",
        "Why does my class hierarchy have so many levels of inheritance?",
        "What's the best way to center a div in CSS?",
        "This function just loops through the array and updates each element in place.",
        "I'm using lambda functions to transform this list with map and filter.",
        "How do I properly encapsulate private variables in my class?",
        "What's the most efficient way to iterate through this data structure?",
        "Can I use functional composition to chain these operations?",
    ]

    # Classify and display each example.
    for sample in examples:
        label, distribution, confidence = classifier.predict(sample)
        classifier.display_prediction(sample, label, distribution, confidence)

    print("\n" + banner)
    print("Inference complete!")
    print(banner)
def interactive_mode():
    """Classify user-supplied text in a loop until 'quit' or Ctrl-C."""
    banner = "=" * 70
    print(banner)
    print("Programming Paradigm Classifier - Interactive Mode")
    print(banner)
    print("Type 'quit' to exit\n")

    # Load the trained classifier and its embedding model.
    classifier = ProgrammingParadigmClassifier()

    while True:
        try:
            user_text = input("\nEnter text to classify (or 'quit' to exit): ").strip()
            if user_text.lower() == 'quit':
                print("Exiting...")
                break
            if not user_text:
                print("Please enter some text.")
                continue
            label, distribution, confidence = classifier.predict(user_text)
            classifier.display_prediction(user_text, label, distribution, confidence)
        except KeyboardInterrupt:
            # Ctrl-C exits cleanly instead of dumping a traceback.
            print("\n\nExiting...")
            break
        except Exception as err:
            # Keep the loop alive on per-input failures.
            print(f"Error: {err}")
if __name__ == "__main__":
    # `--interactive` as the first argument selects the REPL-style mode;
    # anything else runs the batch demo.
    if "--interactive" in sys.argv[1:2]:
        interactive_mode()
    else:
        main()