# AutoML / app.py
# Author: Al1Abdullah — "Initial commit of AutoML project" (commit 39e56b0)
import matplotlib
matplotlib.use('Agg')
from flask import Flask, request, jsonify, send_from_directory
import pandas as pd
from rag.memory import store_dataset, get_dataset, store_model_results, get_model_results
from rag.rag_query import query_dataset_with_groq
from models.supervised import train_model as train_supervised
from models.unsupervised import train_unsupervised
from visuals.charts import (
plot_histogram, plot_bar, plot_scatter, plot_box, plot_pie, plot_heatmap,
plot_confusion_matrix, plot_roc_curve, plot_feature_importance,
plot_elbow_curve, plot_cluster_plot, plot_dendrogram, plot_tsne
)
import os
import logging
import json
import re
import matplotlib.pyplot as plt
import io
import base64
import numpy as np
# Configure root logging with timestamps so the route handlers below log uniformly.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Serve the SPA frontend out of the local 'frontend' directory as Flask static files.
app = Flask(__name__, static_folder='frontend')
# Key under which the single active dataset is stored/retrieved via rag.memory
# (the app works with exactly one dataset at a time).
dataset_name = "active_dataset"
@app.route('/')
def index():
    """Serve the frontend's entry page from the static folder."""
    entry_page = 'index.html'
    return send_from_directory(app.static_folder, entry_page)
@app.route('/<path:path>')
def static_proxy(path):
    """Serve any other frontend asset (JS/CSS/images) by its relative path."""
    asset_root = app.static_folder
    return send_from_directory(asset_root, path)
@app.route('/api/upload', methods=['POST'])
def upload_csv():
    """Accept a CSV upload, parse it with pandas, and store it as the active dataset.

    Returns a success message with the row/column counts, or a JSON error
    (400 for a missing/empty file part, 500 for parse/store failures).
    """
    if 'file' not in request.files:
        logging.warning("No file part in upload request.")
        return jsonify({"error": "No file part"}), 400
    upload = request.files['file']
    if upload.filename == '':
        logging.warning("No selected file in upload request.")
        return jsonify({"error": "No selected file"}), 400
    try:
        frame = pd.read_csv(upload)
        store_dataset(dataset_name, frame)
        rows, cols = frame.shape
        logging.info(f"Uploaded {rows} rows and {cols} columns.")
        return jsonify({"message": f"Uploaded {rows} rows and {cols} columns."})
    except Exception as exc:
        logging.error(f"Error uploading file: {exc}")
        return jsonify({"error": str(exc)}), 500
@app.route('/api/columns', methods=['GET'])
def get_columns():
    """Return the active dataset's column names; empty list if nothing is loaded."""
    frame = get_dataset(dataset_name)
    if frame is None:
        logging.info("No dataset loaded when requesting columns.")
        return jsonify({"columns": []})
    return jsonify({"columns": list(frame.columns)})
@app.route('/api/learning_type', methods=['GET'])
def get_learning_type():
    """Determine and return the learning type (supervised/unsupervised) and target column using LLM intelligence.

    Returns JSON: {"learning_type": ..., "target_column": str | None}.
    Falls back to ("Unsupervised", None) on any LLM or parsing failure so the
    endpoint never errors out after a dataset is loaded.
    """
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting learning type.")
        return jsonify({"error": "No dataset uploaded yet."}), 400
    dtypes_str = df.dtypes.to_string()
    prompt = (
        "You are an expert data scientist. Your task is to analyze a dataset and determine its learning type (supervised or unsupervised). "
        "If it's a supervised learning problem, you MUST identify the single target column that the other columns would predict. "
        "A target column is typically a label, outcome, or value that is being predicted (e.g., 'price', 'churn', 'diagnosis', 'category', 'sales'). "
        "If no such clear target column exists, it's an unsupervised problem. "
        "Respond ONLY with a JSON object, and nothing else. Do NOT include any introductory/concluding remarks, explanations, or markdown outside the JSON. "
        "The JSON must strictly follow this format: "
        "{\"learning_type\": \"Supervised\", \"target_column\": \"your_target_column_name\"} "
        "OR "
        "{\"learning_type\": \"Unsupervised\", \"target_column\": null}. "
        "\n\n"
        f"COLUMNS AND DATA TYPES:\n{dtypes_str}\n\n"
        f"DATA SAMPLE:\n{df.head().to_string()}"
    )
    try:
        response_text = query_dataset_with_groq(dataset_name, prompt).strip()
        logging.info(f"Raw LLM response for learning type: {response_text}")
        learning_type, target_column = _parse_learning_type_response(response_text, set(df.columns))
        return jsonify({"learning_type": learning_type, "target_column": target_column})
    except Exception as e:
        logging.error(f"An unexpected error occurred while determining learning type: {str(e)}", exc_info=True)
        # Fallback to a default in case of any error during Groq call or parsing
        return jsonify({"learning_type": "Unsupervised", "target_column": None})


def _parse_learning_type_response(response_text, valid_columns):
    """Parse an LLM learning-type reply into a validated (learning_type, target_column) pair.

    Args:
        response_text: Raw text returned by the LLM.
        valid_columns: Collection of dataset column names; a supervised target
            must be one of these.

    Returns:
        A (learning_type, target_column) tuple; target_column is None unless
        the reply names a valid supervised target.
    """
    # Fix: LLMs frequently wrap the JSON in markdown fences or surrounding
    # prose despite the prompt's instructions, which previously made
    # json.loads() fail on the raw text. Isolate the first {...} span first.
    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    candidate = json_match.group(0) if json_match else response_text
    try:
        data = json.loads(candidate)
        learning_type = data.get("learning_type", "Unsupervised")
        target_column = data.get("target_column")
        logging.info(f"Parsed LLM response - learning_type: {learning_type}, target_column: {target_column}")
        if learning_type == "Supervised":
            # Only accept a target that actually exists in the dataset.
            if target_column is None or target_column not in valid_columns:
                logging.warning(f"LLM suggested supervised learning but target column '{target_column}' is invalid or not found. Defaulting to Unsupervised.")
                return "Unsupervised", None
            return "Supervised", target_column
        # Non-supervised replies never carry a target column.
        return learning_type, None
    except json.JSONDecodeError:
        logging.error(f"LLM response is not a valid JSON: {response_text}. Attempting regex fallback.")
        # Fallback: extract the fields with a regex (less reliable than JSON).
        match = re.search(r'"learning_type"\s*:\s*"(Supervised|Unsupervised)"(?:,\s*"target_column"\s*:\s*"?([a-zA-Z0-9_]+)?"?)?', response_text)
        if not match:
            logging.error("Could not parse LLM response for learning type using regex fallback. Defaulting to Unsupervised.")
            return "Unsupervised", None
        learning_type = match.group(1)
        target_column = match.group(2) if match.group(2) else None
        logging.info(f"Regex fallback parsed - learning_type: {learning_type}, target_column: {target_column}")
        if learning_type == "Supervised" and (target_column is None or target_column not in valid_columns):
            logging.warning(f"Regex fallback: Invalid target column '{target_column}' for supervised. Defaulting to Unsupervised.")
            return "Unsupervised", None
        if learning_type == "Unsupervised":
            return "Unsupervised", None
        return learning_type, target_column
@app.route('/api/train', methods=['POST'])
def train_model_api():
    """Handle model training requests for both supervised and unsupervised learning.

    Expects JSON with 'model_name', 'learning_type', and (for supervised)
    'target_col'. Trains on the active dataset and returns metrics/results.
    """
    payload = request.json
    model_name = payload.get('model_name')
    target_col = payload.get('target_col')
    learning_type = payload.get('learning_type')
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting model training.")
        return jsonify({"error": "No dataset uploaded yet."}), 400
    if learning_type != "Supervised":
        # Unsupervised path: no target column and no stored test split.
        model, result = train_unsupervised(df, model_name)
        if not model:
            logging.error(f"Failed to train {model_name} for unsupervised learning. Reason: {result}")
            return jsonify({"error": f"Failed to train {model_name}. Reason: {result}"}), 500
        logging.info(f"{model_name} trained successfully for unsupervised learning.")
        serializable = result.tolist() if hasattr(result, 'tolist') else result
        return jsonify({"message": f"{model_name} trained successfully.", "result": serializable})
    # Supervised path: a real target column is mandatory.
    if not target_col or target_col == 'None':
        logging.warning("No target column provided for supervised training.")
        return jsonify({"error": "Please select a target column for supervised learning."}), 400
    model, metrics, y_test, y_pred, y_pred_proba, X_test = train_supervised(df, target_col, model_name)
    if not model:
        logging.error(f"Failed to train {model_name} for supervised learning. Reason: {metrics}")
        return jsonify({"error": f"Failed to train {model_name}. Reason: {metrics}"}), 500
    # Persist predictions so evaluation plots (confusion matrix, ROC, ...) can use them.
    store_model_results(dataset_name, model, y_test, y_pred, y_pred_proba, X_test)
    logging.info(f"{model_name} trained successfully for supervised learning.")
    return jsonify({"message": f"{model_name} trained successfully.", "metrics": metrics})
@app.route('/api/plot', methods=['POST'])
def generate_plot_api():
    """Generate and return a plot based on the requested type and columns.

    Expects a JSON body with 'plot_type' and, depending on the plot, 'col1',
    'col2', and 'color_col'. On success returns {'image': <base64-encoded PNG>};
    otherwise a JSON error with status 400 (bad request / missing prerequisites)
    or 500 (unexpected failure).
    """
    data = request.json
    plot_type = data.get('plot_type')
    col1 = data.get('col1')
    col2 = data.get('col2')
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset loaded when requesting plot generation.")
        return jsonify({"error": "No data loaded."}), 400
    # Dispatch table: client-facing plot name -> renderer from visuals.charts.
    # Each renderer is expected to return a (figure, error) pair (see usage below).
    plot_functions = {
        "Histogram": plot_histogram,
        "Bar": plot_bar,
        "Scatter": plot_scatter,
        "Box": plot_box,
        "Pie": plot_pie,
        "Heatmap": plot_heatmap,
        "Elbow Curve": plot_elbow_curve,
        "Cluster Plot": plot_cluster_plot,
        "Dendrogram": plot_dendrogram,
        "t-SNE": plot_tsne,
        "Confusion Matrix": plot_confusion_matrix,
        "ROC Curve": plot_roc_curve,
        "Feature Importance Plot": plot_feature_importance
    }
    if plot_type not in plot_functions:
        logging.warning(f"Unsupported plot type requested: {plot_type}")
        return jsonify({"error": "Plot not supported."}), 400
    fig, err = None, None
    try:
        # Branch on plot type because each renderer takes different arguments.
        if plot_type == "Scatter":
            # Scatter optionally colors points by a third column.
            fig, err = plot_functions[plot_type](df, col1, col2, data.get('color_col'))
        elif plot_type == "Box":
            fig, err = plot_functions[plot_type](df, col1, col2)
        elif plot_type == "Heatmap":
            # Heatmap operates on the whole frame; no column selection needed.
            fig, err = plot_functions[plot_type](df)
        elif plot_type == "Elbow Curve":
            # Clustering-oriented plots first clean/encode the data via prepare_data.
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type == "Cluster Plot":
            from utils.data_cleaner import prepare_data
            from sklearn.cluster import KMeans
            X_prepared, _ = prepare_data(df)
            if X_prepared.empty:
                return jsonify({"error": "Data is empty after cleaning for Cluster Plot."}), 400
            # Perform KMeans clustering (e.g., with 3 clusters)
            n_clusters = 3  # Default number of clusters
            if len(X_prepared) < n_clusters:
                n_clusters = len(X_prepared)  # Adjust n_clusters if data points are fewer
            if n_clusters == 0:
                return jsonify({"error": "Not enough data points to form clusters."}), 400
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            kmeans.fit(X_prepared)
            labels = kmeans.labels_
            fig, err = plot_functions[plot_type](X_prepared, labels=labels)
        elif plot_type == "Dendrogram":
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type == "t-SNE":
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type in ["Confusion Matrix", "ROC Curve", "Feature Importance Plot"]:
            # Evaluation plots require results persisted by a prior supervised training run.
            model_results = get_model_results(dataset_name)
            if not model_results:
                logging.warning(f"No trained model found for {plot_type} plot.")
                return jsonify({"error": "No trained model found. Please train a supervised model first."}), 400
            model = model_results['model']
            y_test = model_results['y_test']
            y_pred = model_results['y_pred']
            y_pred_proba = model_results['y_pred_proba']
            X_test = model_results['X_test']
            if plot_type == "Confusion Matrix":
                # Need to get class names. For simplicity, using unique values from y_test.
                class_names = [str(c) for c in sorted(pd.Series(y_test).unique())]
                fig, err = plot_functions[plot_type](y_test, y_pred, class_names)
            elif plot_type == "ROC Curve":
                if y_pred_proba is None:
                    logging.warning("ROC Curve requested but model does not provide probability predictions.")
                    return jsonify({"error": "ROC Curve requires probability predictions, which this model does not provide."}), 400
                fig, err = plot_functions[plot_type](y_test, y_pred_proba)
            elif plot_type == "Feature Importance Plot":
                if not hasattr(model, 'feature_importances_'):
                    logging.warning("Feature Importance Plot requested but model does not have feature importances.")
                    return jsonify({"error": "Model does not have feature importances to plot."}), 400
                # Feature names are from X_test columns
                feature_names = X_test.columns.tolist()
                fig, err = plot_functions[plot_type](model, feature_names)
        else:
            # Default case for plots that only need one column (e.g., Histogram, Bar, Pie)
            fig, err = plot_functions[plot_type](df, col1)
        if err:
            logging.error(f"Plot generation error for {plot_type}: {err}")
            return jsonify({"error": err}), 400
        # Save plot to a BytesIO object and encode to base64
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        # Close the figure explicitly to free matplotlib memory (Agg backend, no GUI).
        plt.close(fig)
        buf.seek(0)
        img_str = base64.b64encode(buf.read()).decode('utf-8')
        return jsonify({'image': img_str})
    except Exception as e:
        logging.error(f"An unexpected error occurred during plot generation for {plot_type}: {e}", exc_info=True)
        return jsonify({"error": f"An internal server error occurred: {str(e)}"}), 500
@app.route('/api/plot_options', methods=['GET'])
def plot_options():
    """Return a list of available plot options based on the dataset's learning type."""
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting plot options.")
        return jsonify({"error": "No dataset uploaded yet."}), 400
    # Reuse the learning-type endpoint to decide which plot families apply.
    learning_type = get_learning_type().get_json().get('learning_type', 'Unsupervised')
    common = ["Histogram", "Bar", "Scatter", "Box", "Pie", "Heatmap"]
    if learning_type == "Supervised":
        plots = common + ["Confusion Matrix", "ROC Curve", "Feature Importance Plot"]
    else:
        plots = common + ["Cluster Plot", "Elbow Curve", "Dendrogram", "t-SNE"]
    # Safeguard: keep Scatter available whenever two numeric columns exist
    # (both lists already contain it, so this is a defensive no-op today).
    numeric_cols = df.select_dtypes(include=np.number).columns
    if len(numeric_cols) >= 2 and "Scatter" not in plots:
        plots.insert(2, "Scatter")
    return jsonify({"plots": plots})
@app.route('/api/ask', methods=['POST'])
def ask_question_api():
    """Forward a user's free-text question about the dataset to the RAG/LLM layer."""
    payload = request.json
    user_query = payload.get('user_query')
    if not user_query:
        logging.warning("Empty user query received for AI assistant.")
        return jsonify({"error": "Please ask a question."}), 400
    return jsonify({"answer": query_dataset_with_groq(dataset_name, user_query)})
if __name__ == '__main__':
    logging.info("Starting Flask application...")
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader — fine for
    # local development, but must not be used in a production deployment.
    app.run(debug=True, port=5001)