import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import json
from huggingface_hub import HfApi
import os


def main():
    """Train a RandomForest classifier on a remote CSV dataset.

    Reads the dataset URL from ``dataset_config.json``, trains and
    evaluates the model, saves ``model.pkl`` and ``metadata.json``
    locally, then deploys a Gradio inference Space to the HF Hub.
    """
    print("Starting RandomForest training...")

    # Dataset location is supplied via a local config file.
    with open("dataset_config.json", "r") as f:
        config = json.load(f)

    file_url = config["file_url"]
    print(f"Downloading dataset from: {file_url}")
    df = pd.read_csv(file_url)
    print(f"Dataset shape: {df.shape}")

    # Every column except 'label' is treated as a feature.
    feature_columns = [col for col in df.columns if col != 'label']
    X = df[feature_columns]
    y = df['label']
    # Compute the class list once; it is reused for metadata below.
    classes = y.unique().tolist()

    print(f"Features: {feature_columns}")
    print(f"Classes: {classes}")

    # Stratified split preserves class balance in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        random_state=42
    )
    print("Training model...")
    rf.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Persist the fitted model for upload to the inference Space.
    joblib.dump(rf, "model.pkl")

    # Metadata consumed by the generated inference app
    # (model_name, accuracy and feature_names are all read there).
    metadata = {
        "job_id": "1a2df142-8854-4cd1-bf73-e9f10c993d15",
        "model_name": "test-model-123",
        "accuracy": accuracy,
        "feature_names": feature_columns,
        "n_classes": len(classes),
        "classes": classes
    }
    with open("metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    print("Training completed successfully!")

    # Deploy inference Space
    deploy_inference_space()


def deploy_inference_space():
    """Create (or reuse) a Gradio Space on the HF Hub and upload the
    generated app plus the trained model artifacts.

    Failures are logged, not raised: the training output is already
    saved locally, so deployment is best-effort.
    """
    print("Deploying inference Space...")

    # NOTE(review): if HF_TOKEN is unset this passes token=None and
    # HfApi falls back to any locally cached credentials; whoami()
    # raises otherwise (caught nowhere — intentional hard failure
    # before any repo is created).
    token = os.getenv("HF_TOKEN")
    api = HfApi(token=token)
    user_info = api.whoami()
    username = user_info["name"]

    inference_space_name = "test-model-123-inference"
    inference_repo_id = f"{username}/{inference_space_name}"

    try:
        # Create inference Space. exist_ok=True lets a re-deploy
        # update an existing Space instead of aborting with a
        # "repo already exists" error before any file is uploaded.
        api.create_repo(
            repo_id=inference_repo_id,
            repo_type="space",
            space_sdk="gradio",
            exist_ok=True
        )

        # Upload the generated app source.
        inference_app = generate_inference_app()
        api.upload_file(
            path_or_fileobj=inference_app.encode(),
            path_in_repo="app.py",
            repo_id=inference_repo_id,
            repo_type="space"
        )

        # Upload model and metadata alongside the app.
        with open("model.pkl", "rb") as f:
            api.upload_file(
                path_or_fileobj=f,
                path_in_repo="model.pkl",
                repo_id=inference_repo_id,
                repo_type="space"
            )

        with open("metadata.json", "rb") as f:
            api.upload_file(
                path_or_fileobj=f,
                path_in_repo="metadata.json",
                repo_id=inference_repo_id,
                repo_type="space"
            )

        print(f"Inference Space deployed: https://huggingface.co/spaces/{inference_repo_id}")

    except Exception as e:
        # Best-effort deploy: log and continue rather than crash training.
        print(f"Failed to deploy inference Space: {e}")


def generate_inference_app():
    """Return the source code (a string) of the Gradio + FastAPI app
    that serves single and batch predictions inside the Space."""
    return '''
import gradio as gr
import joblib
import json
import pandas as pd
from fastapi import FastAPI
from fastapi.responses import JSONResponse
import uvicorn
import threading

# Load model and metadata (uploaded next to this app by the trainer).
model = joblib.load("model.pkl")
with open("metadata.json", "r") as f:
    metadata = json.load(f)

feature_names = metadata["feature_names"]


def predict(*features):
    """Make prediction with the trained model"""
    # Create input DataFrame
    input_data = pd.DataFrame([list(features)], columns=feature_names)

    # Predict
    prediction = model.predict(input_data)[0]
    probabilities = model.predict_proba(input_data)[0]

    # Label each probability with the model's actual class label
    # (model.classes_), not the positional index.
    prob_dict = {f"Class {c}": float(prob) for c, prob in zip(model.classes_, probabilities)}

    return f"Predicted Class: {prediction}", prob_dict


def predict_batch_from_url(file_url):
    """Make batch predictions from CSV URL"""
    try:
        # Download and process CSV
        df = pd.read_csv(file_url)

        # Check if columns match
        if not all(col in df.columns for col in feature_names):
            return {"error": f"CSV must contain columns: {feature_names}"}

        # Select only the feature columns
        X = df[feature_names]

        # Make predictions
        predictions = model.predict(X)
        probabilities = model.predict_proba(X)

        # Format results
        results = []
        for i, (pred, probs) in enumerate(zip(predictions, probabilities)):
            prob_dict = {f"Class {c}": float(prob) for c, prob in zip(model.classes_, probs)}
            results.append({
                # NOTE: assumes numeric class labels; int() raises for strings.
                "prediction": int(pred),
                "probabilities": prob_dict
            })

        return {"predictions": results}

    except Exception as e:
        return {"error": str(e)}


# FastAPI for batch predictions
app = FastAPI()


@app.post("/api/predict_batch")
async def api_predict_batch(request: dict):
    file_url = request.get("file_url")
    if not file_url:
        return JSONResponse({"error": "file_url is required"}, status_code=400)

    result = predict_batch_from_url(file_url)
    return JSONResponse(result)


# Gradio interface for single predictions
inputs = [gr.Number(label=name) for name in feature_names]
outputs = [
    gr.Textbox(label="Prediction"),
    gr.Label(label="Probabilities")
]

interface = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    title=f"{metadata['model_name']} - ML Classifier",
    description=f"Accuracy: {metadata['accuracy']:.4f} | Features: {len(feature_names)}"
)


def run_fastapi():
    # NOTE(review): HF Spaces expose only the Gradio port (7860);
    # port 8000 is reachable inside the container only — confirm.
    uvicorn.run(app, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    # Start FastAPI in background
    fastapi_thread = threading.Thread(target=run_fastapi, daemon=True)
    fastapi_thread.start()

    # Start Gradio
    interface.launch(server_port=7860)
'''


if __name__ == "__main__":
    main()