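"""Train a RandomForest classifier on a CSV dataset and deploy a Gradio
inference Space to the Hugging Face Hub.

The script expects a local dataset_config.json containing a "file_url" key
and an HF_TOKEN environment variable with write access to the Hub.
"""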
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import json
from huggingface_hub import HfApi
import os

def main():
    print("Starting RandomForest training...")

    # Read the dataset location from the job's config file
    # (expected shape: {"file_url": "https://.../data.csv"})
    with open("dataset_config.json", "r") as f:
        config = json.load(f)

    file_url = config["file_url"]
    print(f"Downloading dataset from: {file_url}")

    df = pd.read_csv(file_url)
    print(f"Dataset shape: {df.shape}")

    # Separate features and target
    feature_columns = [col for col in df.columns if col != 'label']
    X = df[feature_columns]
    y = df['label']

    print(f"Features: {feature_columns}")
    print(f"Classes: {y.unique().tolist()}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train RandomForest
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        random_state=42
    )

    print("Training model...")
    rf.fit(X_train, y_train)

    # Evaluate
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Save model
    joblib.dump(rf, "model.pkl")

    # Save metadata
    metadata = {
        "job_id": "1a2df142-8854-4cd1-bf73-e9f10c993d15",
        "model_name": "test-model-123",
        "accuracy": float(accuracy),  # cast the numpy scalar so json.dump can serialize it
        "feature_names": feature_columns,
        "n_classes": len(y.unique()),
        "classes": y.unique().tolist()
    }

    with open("metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    print("Training completed successfully!")

    # Deploy inference Space
    deploy_inference_space()

def deploy_inference_space():
    print("Deploying inference Space...")

    token = os.getenv("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN environment variable is not set")
    api = HfApi(token=token)
    user_info = api.whoami()
    username = user_info["name"]

    inference_space_name = "test-model-123-inference"
    inference_repo_id = f"{username}/{inference_space_name}"

    try:
        # Create the inference Space (exist_ok so a redeploy doesn't abort the uploads)
        api.create_repo(
            repo_id=inference_repo_id,
            repo_type="space",
            space_sdk="gradio",
            exist_ok=True
        )

        # Upload inference app
        inference_app = generate_inference_app()
        api.upload_file(
            path_or_fileobj=inference_app.encode(),
            path_in_repo="app.py",
            repo_id=inference_repo_id,
            repo_type="space"
        )
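
        # The Space also needs a requirements.txt so the runtime can install
        # the app's dependencies; the package list below is an assumption
        # inferred from the imports in the generated app.
        requirements = "\n".join(
            ["scikit-learn", "joblib", "pandas", "fastapi", "uvicorn"]
        )
        api.upload_file(
            path_or_fileobj=requirements.encode(),
            path_in_repo="requirements.txt",
            repo_id=inference_repo_id,
            repo_type="space"
        )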

        # Upload model and metadata directly from their local paths
        api.upload_file(
            path_or_fileobj="model.pkl",
            path_in_repo="model.pkl",
            repo_id=inference_repo_id,
            repo_type="space"
        )

        api.upload_file(
            path_or_fileobj="metadata.json",
            path_in_repo="metadata.json",
            repo_id=inference_repo_id,
            repo_type="space"
        )

        print(f"Inference Space deployed: https://huggingface.co/spaces/{inference_repo_id}")

    except Exception as e:
        print(f"Failed to deploy inference Space: {e}")

def generate_inference_app():
    return '''
import gradio as gr
import joblib
import json
import pandas as pd
from fastapi import FastAPI
from fastapi.responses import JSONResponse
import uvicorn
import threading

# Load model and metadata
model = joblib.load("model.pkl")
with open("metadata.json", "r") as f:
    metadata = json.load(f)

feature_names = metadata["feature_names"]

def predict(*features):
    """Make prediction with the trained model"""

    # Create input DataFrame
    input_data = pd.DataFrame([list(features)], columns=feature_names)

    # Predict
    prediction = model.predict(input_data)[0]
    probabilities = model.predict_proba(input_data)[0]

    # Format results, keyed by the model's actual class labels
    prob_dict = {
        f"Class {cls}": float(prob)
        for cls, prob in zip(model.classes_, probabilities)
    }

    return f"Predicted Class: {prediction}", prob_dict

def predict_batch_from_url(file_url):
    """Make batch predictions from CSV URL"""
    try:
        # Download and process CSV
        df = pd.read_csv(file_url)

        # Check if columns match
        if not all(col in df.columns for col in feature_names):
            return {"error": f"CSV must contain columns: {feature_names}"}

        # Select only the feature columns
        X = df[feature_names]

        # Make predictions
        predictions = model.predict(X)
        probabilities = model.predict_proba(X)

        # Format results, keyed by the model's actual class labels
        results = []
        for pred, probs in zip(predictions, probabilities):
            prob_dict = {
                f"Class {cls}": float(prob)
                for cls, prob in zip(model.classes_, probs)
            }
            results.append({
                "prediction": int(pred),
                "probabilities": prob_dict
            })

        return {"predictions": results}

    except Exception as e:
        return {"error": str(e)}

# FastAPI for batch predictions
app = FastAPI()

@app.post("/api/predict_batch")
async def api_predict_batch(request: dict):
    file_url = request.get("file_url")
    if not file_url:
        return JSONResponse({"error": "file_url is required"}, status_code=400)

    result = predict_batch_from_url(file_url)
    return JSONResponse(result)

# Gradio interface for single predictions
inputs = [gr.Number(label=name) for name in feature_names]
outputs = [
    gr.Textbox(label="Prediction"),
    gr.Label(label="Probabilities")
]

interface = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    title=f"{metadata['model_name']} - ML Classifier",
    description=f"Accuracy: {metadata['accuracy']:.4f} | Features: {len(feature_names)}"
)

def run_fastapi():
    # Serve the batch API on a second port. A standard Gradio Space only
    # exposes port 7860, so this endpoint is reachable inside the container
    # rather than over the public Space URL.
    uvicorn.run(app, host="0.0.0.0", port=8000)

if __name__ == "__main__":
    # Start FastAPI in background
    fastapi_thread = threading.Thread(target=run_fastapi, daemon=True)
    fastapi_thread.start()

    # Start Gradio
    interface.launch(server_port=7860)
'''
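
# Example client call against the generated app's batch endpoint (a sketch
# with placeholder URLs; as noted in run_fastapi above, port 8000 is only
# reachable where the container's second port is exposed):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/api/predict_batch",
#       json={"file_url": "https://example.com/samples.csv"},
#   )
#   print(resp.json())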

if __name__ == "__main__":
    main()