# evaluate.py
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

from ml_utils import ScamDetectionService

# Dataset labels -> labels used by the evaluation.
_LABEL_MAP = {'ham': 'Safe', 'spam': 'Scam'}

# Risk levels that count as a positive ("Scam") prediction in the binary eval.
_SCAM_RISK_LEVELS = frozenset({"Scam", "Suspicious"})


def _load_balanced_sample(csv_path: str, sample_size: int) -> pd.DataFrame:
    """Load the CSV and return up to ``sample_size // 2`` rows per class.

    The result has the columns ``label``, ``text`` and ``true_label`` and a
    fresh 0..n-1 index. Sampling uses a fixed seed so runs are repeatable.

    Raises:
        ValueError: if ``sample_size`` is below 2 (no rows per class) or the
            file contains no rows labelled ``ham`` or ``spam``.
    """
    per_class = sample_size // 2
    if per_class < 1:
        raise ValueError(
            f"sample_size must be at least 2 to sample both classes, "
            f"got {sample_size}"
        )

    df = pd.read_csv(csv_path, encoding='latin-1')[['v1', 'v2']]
    df.columns = ['label', 'text']

    # Map ham/spam -> Safe/Scam; any other label becomes NaN and is dropped.
    df['true_label'] = df['label'].map(_LABEL_MAP)
    df = df.dropna(subset=['true_label'])
    if df.empty:
        raise ValueError(f"No 'ham' or 'spam' rows found in {csv_path!r}")

    # Sample each class separately and concatenate. Iterating the groupby
    # keeps the grouping column in every group, whereas groupby(...).apply
    # is deprecated for this use and excludes it in newer pandas versions.
    per_class_samples = [
        group.sample(min(len(group), per_class), random_state=42)
        for _, group in df.groupby('true_label')
    ]
    return pd.concat(per_class_samples).reset_index(drop=True)


def _to_binary_label(risk_level: str) -> str:
    """Collapse the service's risk level to "Scam" or "Safe"."""
    return "Scam" if risk_level in _SCAM_RISK_LEVELS else "Safe"


def run_evaluation(csv_path: str = "spam.csv", sample_size: int = 200) -> dict:
    """
    Evaluates the scam detection model on the SMS Spam Collection dataset.
    Uses a sample to keep runtime reasonable (zero-shot is slow on CPU).

    Each sampled text is passed to ``ScamDetectionService.analyze_text_scam``
    and the ``"risk_level"`` of the result is reduced to a binary label
    ("Suspicious" counts as "Scam"). Metrics are printed to stdout.

    Args:
        csv_path: path to the downloaded spam.csv
        sample_size: how many rows to evaluate (200 is enough for metrics);
            half are drawn from each class, fewer if a class is smaller.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``; the last three treat "Scam" as the positive class.

    Raises:
        ValueError: if ``sample_size`` is below 2 or the dataset has no
            usable rows.
    """
    print("Loading dataset...")
    sample_df = _load_balanced_sample(csv_path, sample_size)
    total = len(sample_df)

    print(f"Evaluating on {total} samples...")

    service = ScamDetectionService()
    predictions = []

    # Count from 1 with enumerate so progress does not depend on index labels.
    for done, text in enumerate(sample_df['text'], start=1):
        result = service.analyze_text_scam(text)

        # Collapse Suspicious -> Scam for binary eval
        predictions.append(_to_binary_label(result["risk_level"]))

        if done % 20 == 0:
            print(f"  Progress: {done}/{total}")

    true_labels = sample_df['true_label'].tolist()

    # zero_division=0 yields the same 0.0 as the default but without a
    # warning when the model never predicts one of the classes.
    metrics = {
        "accuracy": accuracy_score(true_labels, predictions),
        "precision": precision_score(
            true_labels, predictions, pos_label='Scam', zero_division=0
        ),
        "recall": recall_score(
            true_labels, predictions, pos_label='Scam', zero_division=0
        ),
        "f1": f1_score(
            true_labels, predictions, pos_label='Scam', zero_division=0
        ),
    }

    print("\n========== EVALUATION RESULTS ==========")
    print(f"Accuracy  : {metrics['accuracy']:.4f}")
    print(f"Precision : {metrics['precision']:.4f}")
    print(f"Recall    : {metrics['recall']:.4f}")
    print(f"F1 Score  : {metrics['f1']:.4f}")
    print("\nFull Report:")
    print(classification_report(true_labels, predictions, zero_division=0))
    print("=========================================")
    print("\nCopy these numbers into your README and resume.")

    return metrics


if __name__ == "__main__":
    run_evaluation()