Spaces:
Sleeping
Sleeping
File size: 16,179 Bytes
aa68823 39e56b0 aa68823 39e56b0 aa68823 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 |
import matplotlib
matplotlib.use('Agg')
from flask import Flask, request, jsonify, send_from_directory
import pandas as pd
from rag.memory import store_dataset, get_dataset, store_model_results, get_model_results
from rag.rag_query import query_dataset_with_groq
from models.supervised import train_model as train_supervised
from models.unsupervised import train_unsupervised
from visuals.charts import (
plot_histogram, plot_bar, plot_scatter, plot_box, plot_pie, plot_heatmap,
plot_confusion_matrix, plot_roc_curve, plot_feature_importance,
plot_elbow_curve, plot_cluster_plot, plot_dendrogram, plot_tsne
)
import os
import logging
import json
import re
import matplotlib.pyplot as plt
import io
import base64
import numpy as np
# Configure application-wide logging with timestamps and severity levels.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Flask app; static assets (the SPA frontend) are served from ./frontend.
app = Flask(__name__, static_folder='frontend')
# Storage key for the single active dataset — the app handles one dataset at a time.
dataset_name = "active_dataset"
@app.route('/')
def index():
    """Serve the single-page application's entry point (index.html)."""
    entry_page = 'index.html'
    return send_from_directory(app.static_folder, entry_page)
@app.route('/<path:path>')
def static_proxy(path):
    """Serve any other static asset (JS/CSS/images) from the frontend directory."""
    return send_from_directory(app.static_folder, path)
@app.route('/api/upload', methods=['POST'])
def upload_csv():
    """Accept a CSV upload, parse it with pandas, and store it as the active dataset.

    Returns a JSON success message with the row/column counts, or an error
    with a 400 (bad request) / 500 (parse or storage failure) status.
    """
    if 'file' not in request.files:
        logging.warning("No file part in upload request.")
        return jsonify({"error": "No file part"}), 400
    uploaded = request.files['file']
    if not uploaded.filename:
        logging.warning("No selected file in upload request.")
        return jsonify({"error": "No selected file"}), 400
    try:
        df = pd.read_csv(uploaded)
        store_dataset(dataset_name, df)
        rows, cols = df.shape
        summary = f"Uploaded {rows} rows and {cols} columns."
        logging.info(summary)
        return jsonify({"message": summary})
    except Exception as e:
        logging.error(f"Error uploading file: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/columns', methods=['GET'])
def get_columns():
    """Return the column names of the active dataset (empty list if none loaded)."""
    df = get_dataset(dataset_name)
    if df is None:
        logging.info("No dataset loaded when requesting columns.")
        return jsonify({"columns": []})
    return jsonify({"columns": list(df.columns)})
def _coerce_learning_type(learning_type, target_column, df):
    """Validate an LLM-proposed (learning_type, target_column) pair against the dataset.

    Returns a consistent pair: "Supervised" is kept only when the proposed
    target column actually exists in ``df``; otherwise the pair degrades to
    ("Unsupervised", None). Non-supervised types never report a target.
    """
    if learning_type == "Supervised":
        if target_column is None or target_column not in df.columns:
            logging.warning(f"LLM suggested supervised learning but target column '{target_column}' is invalid or not found. Defaulting to Unsupervised.")
            return "Unsupervised", None
        return "Supervised", target_column
    return learning_type, None


@app.route('/api/learning_type', methods=['GET'])
def get_learning_type():
    """Determine and return the learning type (supervised/unsupervised) and target column using LLM intelligence.

    Asks the LLM to classify the dataset, parses the JSON it returns (with a
    regex fallback for malformed output), validates any proposed target
    column against the real dataframe columns, and always responds with
    {"learning_type": ..., "target_column": ...}.
    """
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting learning type.")
        return jsonify({"error": "No dataset uploaded yet."}), 400
    dtypes_str = df.dtypes.to_string()
    prompt = (
        "You are an expert data scientist. Your task is to analyze a dataset and determine its learning type (supervised or unsupervised). "
        "If it's a supervised learning problem, you MUST identify the single target column that the other columns would predict. "
        "A target column is typically a label, outcome, or value that is being predicted (e.g., 'price', 'churn', 'diagnosis', 'category', 'sales'). "
        "If no such clear target column exists, it's an unsupervised problem. "
        "Respond ONLY with a JSON object, and nothing else. Do NOT include any introductory/concluding remarks, explanations, or markdown outside the JSON. "
        "The JSON must strictly follow this format: "
        "{\"learning_type\": \"Supervised\", \"target_column\": \"your_target_column_name\"} "
        "OR "
        "{\"learning_type\": \"Unsupervised\", \"target_column\": null}. "
        "\n\n"
        f"COLUMNS AND DATA TYPES:\n{dtypes_str}\n\n"
        f"DATA SAMPLE:\n{df.head().to_string()}"
    )
    try:
        response_text = query_dataset_with_groq(dataset_name, prompt).strip()
        logging.info(f"Raw LLM response for learning type: {response_text}")
        # LLMs often wrap JSON in markdown fences despite instructions; unwrap
        # the payload first so json.loads gets a fair chance before the
        # lossier regex fallback.
        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
        json_text = fenced.group(1) if fenced else response_text
        try:
            data = json.loads(json_text)
            learning_type = data.get("learning_type", "Unsupervised")
            target_column = data.get("target_column")
            logging.info(f"Parsed LLM response - learning_type: {learning_type}, target_column: {target_column}")
            learning_type, target_column = _coerce_learning_type(learning_type, target_column, df)
        except json.JSONDecodeError:
            logging.error(f"LLM response is not a valid JSON: {response_text}. Attempting regex fallback.")
            # Fallback: scrape the two fields out of free-form text (less reliable).
            match = re.search(r'"learning_type"\s*:\s*"(Supervised|Unsupervised)"(?:,\s*"target_column"\s*:\s*"?([a-zA-Z0-9_]+)?"?)?', response_text)
            if match:
                learning_type = match.group(1)
                target_column = match.group(2) if match.group(2) else None
                logging.info(f"Regex fallback parsed - learning_type: {learning_type}, target_column: {target_column}")
                learning_type, target_column = _coerce_learning_type(learning_type, target_column, df)
            else:
                logging.error("Could not parse LLM response for learning type using regex fallback. Defaulting to Unsupervised.")
                learning_type = "Unsupervised"
                target_column = None
        return jsonify({"learning_type": learning_type, "target_column": target_column})
    except Exception as e:
        logging.error(f"An unexpected error occurred while determining learning type: {str(e)}", exc_info=True)
        # Fallback to a safe default in case of any error during the Groq call
        # or initial processing — callers always get a usable answer.
        return jsonify({"learning_type": "Unsupervised", "target_column": None})
@app.route('/api/train', methods=['POST'])
def train_model_api():
    """Train a supervised or unsupervised model on the active dataset.

    Expects JSON with 'model_name', 'learning_type', and (for supervised)
    'target_col'. Stores supervised results for later evaluation plots.
    """
    payload = request.json
    model_name = payload.get('model_name')
    target_col = payload.get('target_col')
    learning_type = payload.get('learning_type')
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting model training.")
        return jsonify({"error": "No dataset uploaded yet."}), 400
    if learning_type == "Supervised":
        # A usable target column is mandatory for supervised training.
        if not target_col or target_col == 'None':
            logging.warning("No target column provided for supervised training.")
            return jsonify({"error": "Please select a target column for supervised learning."}), 400
        model, metrics, y_test, y_pred, y_pred_proba, X_test = train_supervised(df, target_col, model_name)
        if not model:
            logging.error(f"Failed to train {model_name} for supervised learning. Reason: {metrics}")
            return jsonify({"error": f"Failed to train {model_name}. Reason: {metrics}"}), 500
        # Persist artifacts so /api/plot can render evaluation charts later.
        store_model_results(dataset_name, model, y_test, y_pred, y_pred_proba, X_test)
        logging.info(f"{model_name} trained successfully for supervised learning.")
        return jsonify({"message": f"{model_name} trained successfully.", "metrics": metrics})
    # Unsupervised path: no target column, result is e.g. cluster labels.
    model, result = train_unsupervised(df, model_name)
    if not model:
        logging.error(f"Failed to train {model_name} for unsupervised learning. Reason: {result}")
        return jsonify({"error": f"Failed to train {model_name}. Reason: {result}"}), 500
    logging.info(f"{model_name} trained successfully for unsupervised learning.")
    serialisable = result.tolist() if hasattr(result, 'tolist') else result
    return jsonify({"message": f"{model_name} trained successfully.", "result": serialisable})
@app.route('/api/plot', methods=['POST'])
def generate_plot_api():
    """Generate and return a plot based on the requested type and columns.

    Expects a JSON body with 'plot_type' and, depending on the plot,
    'col1'/'col2'/'color_col'. On success returns {"image": <base64 PNG>};
    on failure returns {"error": ...} with a 400 or 500 status.
    """
    data = request.json
    plot_type = data.get('plot_type')
    col1 = data.get('col1')
    col2 = data.get('col2')
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset loaded when requesting plot generation.")
        return jsonify({"error": "No data loaded."}), 400
    # Dispatch table: client-facing plot name -> chart helper.
    # Each helper returns a (figure, error_message) pair — see uses below.
    plot_functions = {
        "Histogram": plot_histogram,
        "Bar": plot_bar,
        "Scatter": plot_scatter,
        "Box": plot_box,
        "Pie": plot_pie,
        "Heatmap": plot_heatmap,
        "Elbow Curve": plot_elbow_curve,
        "Cluster Plot": plot_cluster_plot,
        "Dendrogram": plot_dendrogram,
        "t-SNE": plot_tsne,
        "Confusion Matrix": plot_confusion_matrix,
        "ROC Curve": plot_roc_curve,
        "Feature Importance Plot": plot_feature_importance
    }
    if plot_type not in plot_functions:
        logging.warning(f"Unsupported plot type requested: {plot_type}")
        return jsonify({"error": "Plot not supported."}), 400
    fig, err = None, None
    try:
        if plot_type == "Scatter":
            # Scatter additionally accepts an optional column to color points by.
            fig, err = plot_functions[plot_type](df, col1, col2, data.get('color_col'))
        elif plot_type == "Box":
            fig, err = plot_functions[plot_type](df, col1, col2)
        elif plot_type == "Heatmap":
            # Heatmap operates on the whole dataframe; no column selection needed.
            fig, err = plot_functions[plot_type](df)
        elif plot_type == "Elbow Curve":
            # Imported lazily: prepare_data is only needed for clustering plots.
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type == "Cluster Plot":
            from utils.data_cleaner import prepare_data
            from sklearn.cluster import KMeans
            X_prepared, _ = prepare_data(df)
            if X_prepared.empty:
                return jsonify({"error": "Data is empty after cleaning for Cluster Plot."}), 400
            # Perform KMeans clustering with a fixed default of 3 clusters,
            # shrinking to the number of available rows when data is tiny.
            n_clusters = 3  # Default number of clusters
            if len(X_prepared) < n_clusters:
                n_clusters = len(X_prepared)  # Adjust n_clusters if data points are fewer
            if n_clusters == 0:
                return jsonify({"error": "Not enough data points to form clusters."}), 400
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            kmeans.fit(X_prepared)
            labels = kmeans.labels_
            fig, err = plot_functions[plot_type](X_prepared, labels=labels)
        elif plot_type == "Dendrogram":
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type == "t-SNE":
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type in ["Confusion Matrix", "ROC Curve", "Feature Importance Plot"]:
            # Evaluation plots need artifacts from a previously trained
            # supervised model, stored by the /api/train endpoint.
            model_results = get_model_results(dataset_name)
            if not model_results:
                logging.warning(f"No trained model found for {plot_type} plot.")
                return jsonify({"error": "No trained model found. Please train a supervised model first."}), 400
            model = model_results['model']
            y_test = model_results['y_test']
            y_pred = model_results['y_pred']
            y_pred_proba = model_results['y_pred_proba']
            X_test = model_results['X_test']
            if plot_type == "Confusion Matrix":
                # Need to get class names. For simplicity, using unique values from y_test.
                class_names = [str(c) for c in sorted(pd.Series(y_test).unique())]
                fig, err = plot_functions[plot_type](y_test, y_pred, class_names)
            elif plot_type == "ROC Curve":
                # ROC needs probability scores; models without predict_proba
                # leave y_pred_proba as None.
                if y_pred_proba is None:
                    logging.warning("ROC Curve requested but model does not provide probability predictions.")
                    return jsonify({"error": "ROC Curve requires probability predictions, which this model does not provide."}), 400
                fig, err = plot_functions[plot_type](y_test, y_pred_proba)
            elif plot_type == "Feature Importance Plot":
                # Only tree-style models expose feature_importances_.
                if not hasattr(model, 'feature_importances_'):
                    logging.warning("Feature Importance Plot requested but model does not have feature importances.")
                    return jsonify({"error": "Model does not have feature importances to plot."}), 400
                # Feature names are from X_test columns
                feature_names = X_test.columns.tolist()
                fig, err = plot_functions[plot_type](model, feature_names)
        else:
            # Default case for plots that only need one column (e.g., Histogram, Bar, Pie)
            fig, err = plot_functions[plot_type](df, col1)
        if err:
            logging.error(f"Plot generation error for {plot_type}: {err}")
            return jsonify({"error": err}), 400
        # Save plot to a BytesIO object and encode to base64
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)  # release the figure to avoid matplotlib memory buildup
        buf.seek(0)
        img_str = base64.b64encode(buf.read()).decode('utf-8')
        return jsonify({'image': img_str})
    except Exception as e:
        logging.error(f"An unexpected error occurred during plot generation for {plot_type}: {e}", exc_info=True)
        return jsonify({"error": f"An internal server error occurred: {str(e)}"}), 500
@app.route('/api/plot_options', methods=['GET'])
def plot_options():
    """Return the plot types applicable to the active dataset's learning type."""
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting plot options.")
        return jsonify({"error": "No dataset uploaded yet."}), 400
    # Reuse the learning-type endpoint (which queries the LLM) for classification.
    detected = get_learning_type().get_json().get('learning_type', 'Unsupervised')
    plots = ["Histogram", "Bar", "Scatter", "Box", "Pie", "Heatmap"]
    if detected == "Supervised":
        plots += ["Confusion Matrix", "ROC Curve", "Feature Importance Plot"]
    else:
        plots += ["Cluster Plot", "Elbow Curve", "Dendrogram", "t-SNE"]
    # Ensure Scatter is offered whenever at least two numeric columns exist.
    numeric_cols = df.select_dtypes(include=np.number).columns
    if len(numeric_cols) >= 2 and "Scatter" not in plots:
        plots.insert(2, "Scatter")  # keep it near the other basic charts
    return jsonify({"plots": plots})
@app.route('/api/ask', methods=['POST'])
def ask_question_api():
    """Forward a user's natural-language question about the dataset to the LLM."""
    user_query = request.json.get('user_query')
    if not user_query:
        logging.warning("Empty user query received for AI assistant.")
        return jsonify({"error": "Please ask a question."}), 400
    answer = query_dataset_with_groq(dataset_name, user_query)
    return jsonify({"answer": answer})
if __name__ == '__main__':
    # Development entry point: debug=True enables the reloader/debugger,
    # so this configuration is not suitable for production deployment.
    logging.info("Starting Flask application...")
    app.run(debug=True, port=5001)
|