Add model validation script with Hugging Face Hub support

- Add validate.py to download and test model from HF Hub
- Add save_joblib_model.py to re-save the pickled model (sonar_model.pkl) in joblib format
- Add huggingface-hub and joblib dependencies
- Include sklearn_model.joblib (stored as a Git LFS pointer) for HF Hub compatibility
- Update .gitignore to stop ignoring *.joblib so that sklearn_model.joblib can be committed

Files changed (5) hide show

.gitignore +1 -2
pyproject.toml +2 -0
save_joblib_model.py +21 -0
sklearn_model.joblib +3 -0
validate.py +123 -0

.gitignore CHANGED Viewed

@@ -41,10 +41,9 @@ uv.lock
 .ipynb_checkpoints
 *.ipynb
-# Model files
 *.pkl
 *.pickle
-*.joblib
 *.h5
 *.hdf5
 *.pth

 .ipynb_checkpoints
 *.ipynb
+# Model files (keep sklearn_model.joblib for HF Hub)
 *.pkl
 *.pickle
 *.h5
 *.hdf5
 *.pth

pyproject.toml CHANGED Viewed

@@ -7,6 +7,8 @@ requires-python = ">=3.10"
 dependencies = [
     "scikit-learn>=1.3.0",
     "numpy>=1.24.0",
 ]
 [dependency-groups]

 dependencies = [
     "scikit-learn>=1.3.0",
     "numpy>=1.24.0",
+    "huggingface-hub>=0.35.0",
+    "joblib>=1.5.2",
 ]
 [dependency-groups]

save_joblib_model.py ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/usr/bin/env python3
+"""
+Script to save the trained model in joblib format for Hugging Face Hub.
+"""
+import joblib
+from model import SonarModel
+def main():
+    print("Loading trained model from pickle...")
+    model = SonarModel.load("sonar_model.pkl")
+    print("Saving model in joblib format...")
+    joblib.dump(model, "sklearn_model.joblib")
+    print("Model saved as sklearn_model.joblib")
+    print("\nThis file should be uploaded to Hugging Face Hub for the validate.py script to work.")
+if __name__ == "__main__":
+    main()

sklearn_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52ae232c454dc00511c23d8201c1287d4680c8949dee41a2f2ded4b8a6f7c8d2
+size 1322748

validate.py ADDED Viewed

	@@ -0,0 +1,123 @@

+#!/usr/bin/env python3
+"""
+Validation script for the Sonar model.
+Downloads the model from Hugging Face Hub and validates it on test data.
+"""
+import numpy as np
+from huggingface_hub import hf_hub_download
+import joblib
+from sklearn.datasets import make_classification
+def download_model():
+    """Download the trained model from Hugging Face Hub."""
+    print("Downloading model from Hugging Face Hub...")
+    try:
+        model_path = hf_hub_download(
+            repo_id="undertheseanlp/sonar_core_1",
+            filename="sklearn_model.joblib",
+        )
+        model = joblib.load(model_path)
+        print("Model downloaded successfully!")
+        return model
+    except Exception as e:
+        print(f"Error downloading model: {e}")
+        print("Model file might not exist yet on Hugging Face. Using local model instead.")
+        # Fallback to local pickle file
+        from model import SonarModel
+        model = SonarModel.load("sonar_model.pkl")
+        return model
+def generate_validation_data(n_samples=200, n_features=60):
+    """Generate validation data similar to the Sonar dataset."""
+    X, y = make_classification(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=40,
+        n_redundant=10,
+        n_repeated=5,
+        n_classes=2,
+        n_clusters_per_class=2,
+        weights=[0.5, 0.5],
+        flip_y=0.01,
+        random_state=123,  # Different seed from training
+    )
+    return X, y
+def validate_model(model, X, y):
+    """Validate the model on test data."""
+    print("\nValidating model...")
+    print(f"Validation data shape: X={X.shape}, y={y.shape}")
+    # Make predictions
+    y_pred = model.predict(X)
+    y_proba = model.predict_proba(X)
+    # Calculate accuracy
+    accuracy = (y_pred == y).mean()
+    print(f"\nValidation Accuracy: {accuracy:.4f}")
+    # Confusion matrix
+    from sklearn.metrics import confusion_matrix, classification_report
+    cm = confusion_matrix(y, y_pred)
+    print("\nConfusion Matrix:")
+    print(f"True Negatives:  {cm[0, 0]:3d} | False Positives: {cm[0, 1]:3d}")
+    print(f"False Negatives: {cm[1, 0]:3d} | True Positives:  {cm[1, 1]:3d}")
+    # Classification report
+    print("\nClassification Report:")
+    report = classification_report(
+        y, y_pred, target_names=["Rock (0)", "Mine (1)"], digits=3
+    )
+    print(report)
+    # Sample predictions
+    print("\nSample Predictions (first 5):")
+    for i in range(min(5, len(X))):
+        print(
+            f"  Sample {i + 1}: True={y[i]}, Predicted={y_pred[i]}, "
+            f"Probabilities=[{y_proba[i][0]:.3f}, {y_proba[i][1]:.3f}]"
+        )
+    # Feature importance (if available)
+    try:
+        importances = model.get_feature_importance()
+        top_features = np.argsort(importances)[-5:][::-1]
+        print("\nTop 5 Most Important Features:")
+        for i, feat_idx in enumerate(top_features, 1):
+            print(f"  {i}. Feature {feat_idx}: {importances[feat_idx]:.4f}")
+    except AttributeError:
+        pass
+    return accuracy
+def main():
+    print("=" * 60)
+    print("Sonar Model Validation Script")
+    print("=" * 60)
+    # Download model from Hugging Face
+    model = download_model()
+    # Generate validation data
+    print("\nGenerating validation data...")
+    X_val, y_val = generate_validation_data(n_samples=200)
+    # Validate model
+    accuracy = validate_model(model, X_val, y_val)
+    print("\n" + "=" * 60)
+    if accuracy > 0.8:
+        print("✅ Model validation successful!")
+    else:
+        print("⚠️  Model accuracy below expected threshold")
+    print("=" * 60)
+if __name__ == "__main__":
+    main()