Spaces:
Sleeping
Sleeping
| import matplotlib | |
| matplotlib.use('Agg') | |
| from flask import Flask, request, jsonify, send_from_directory | |
| import pandas as pd | |
| from rag.memory import store_dataset, get_dataset, store_model_results, get_model_results | |
| from rag.rag_query import query_dataset_with_groq | |
| from models.supervised import train_model as train_supervised | |
| from models.unsupervised import train_unsupervised | |
| from visuals.charts import ( | |
| plot_histogram, plot_bar, plot_scatter, plot_box, plot_pie, plot_heatmap, | |
| plot_confusion_matrix, plot_roc_curve, plot_feature_importance, | |
| plot_elbow_curve, plot_cluster_plot, plot_dendrogram, plot_tsne | |
| ) | |
| import os | |
| import logging | |
| import json | |
| import re | |
| import matplotlib.pyplot as plt | |
| import io | |
| import base64 | |
| import numpy as np | |
# Logging setup: timestamped, level-tagged records at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Flask application object; static assets are served from the "frontend" folder.
app = Flask(__name__, static_folder='frontend')

# Key under which the single active dataset (and its model results) is stored
# and looked up through the rag.memory helpers.
# NOTE(review): no @app.route decorators are visible on the view functions in
# this copy of the file — confirm the routes are registered.
dataset_name = "active_dataset"
def index():
    """Return ``index.html`` from the application's static (frontend) folder."""
    frontend_dir = app.static_folder
    return send_from_directory(frontend_dir, 'index.html')
def static_proxy(path):
    """Return the file at ``path`` from the application's static (frontend) folder.

    Args:
        path: Path of the requested asset, relative to the static folder.
    """
    frontend_dir = app.static_folder
    return send_from_directory(frontend_dir, path)
def upload_csv():
    """Parse an uploaded CSV, store it as the active dataset, and report its shape.

    Expects a multipart upload with the CSV under the ``file`` field.

    Returns:
        200 with ``{"message": ...}`` describing the row/column counts;
        400 with ``{"error": ...}`` when the file part is missing, no file was
        selected, or the content cannot be parsed as CSV;
        500 with ``{"error": ...}`` for any other failure.
    """
    uploaded = request.files.get('file')
    if uploaded is None:
        logging.warning("No file part in upload request.")
        return jsonify({"error": "No file part"}), 400
    if uploaded.filename == '':
        logging.warning("No selected file in upload request.")
        return jsonify({"error": "No selected file"}), 400

    try:
        df = pd.read_csv(uploaded)
        store_dataset(dataset_name, df)
        summary = f"Uploaded {df.shape[0]} rows and {df.shape[1]} columns."
        logging.info(summary)
        return jsonify({"message": summary})
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
        # An empty, malformed, or wrongly encoded file is a problem with the
        # request, not with the server, so report it as a client error.
        logging.warning(f"Rejected unreadable CSV upload: {e}")
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        # Keep the traceback so unexpected failures can be diagnosed.
        logging.error(f"Error uploading file: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
def get_columns():
    """Return the active dataset's column names as ``{"columns": [...]}``.

    When no dataset has been stored yet, an empty list is returned (still with
    a 200 status) and the situation is logged at INFO level.
    """
    df = get_dataset(dataset_name)
    if df is None:
        logging.info("No dataset loaded when requesting columns.")
        return jsonify({"columns": []})
    column_names = list(df.columns)
    return jsonify({"columns": column_names})
# Fallback pattern for replies that cannot be parsed as JSON. The target value
# may be a quoted string (any characters except a double quote, so names with
# spaces or hyphens survive) or a bare token such as ``null``.
_LEARNING_TYPE_FALLBACK_RE = re.compile(
    r'"learning_type"\s*:\s*"(Supervised|Unsupervised)"'
    r'(?:\s*,\s*"target_column"\s*:\s*(?:"([^"]*)"|([A-Za-z0-9_]+)))?'
)


def _parse_learning_type_reply(response_text):
    """Extract the learning type and target column from the raw LLM reply.

    Tries, in order: the whole reply as JSON, the outermost ``{...}`` span as
    JSON (covers replies wrapped in markdown fences or extra prose), and
    finally a regex search.

    Returns:
        ``(learning_type, target_column, from_regex)`` or ``None`` when
        nothing could be extracted.
    """
    candidates = [response_text]
    start = response_text.find('{')
    end = response_text.rfind('}')
    if 0 <= start < end and (start > 0 or end < len(response_text) - 1):
        candidates.append(response_text[start:end + 1])

    for candidate in candidates:
        try:
            payload = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(payload, dict):
            learning_type = payload.get("learning_type", "Unsupervised")
            target_column = payload.get("target_column")
            logging.info(f"Parsed LLM response - learning_type: {learning_type}, target_column: {target_column}")
            return learning_type, target_column, False

    logging.error(f"LLM response is not a valid JSON: {response_text}. Attempting regex fallback.")
    match = _LEARNING_TYPE_FALLBACK_RE.search(response_text)
    if match is None:
        return None

    learning_type = match.group(1)
    quoted, bare = match.group(2), match.group(3)
    if quoted:
        target_column = quoted
    elif bare and bare != "null":
        target_column = bare
    else:
        # Missing, empty, or a bare JSON null all mean "no target column".
        target_column = None
    logging.info(f"Regex fallback parsed - learning_type: {learning_type}, target_column: {target_column}")
    return learning_type, target_column, True


def _resolve_learning_type(learning_type, target_column, columns, from_regex=False):
    """Normalize the parsed values into a canonical ``(learning_type, target)`` pair.

    Anything other than a (case-insensitive) "Supervised" with a target column
    that exists in ``columns`` resolves to ``("Unsupervised", None)``.
    """
    is_supervised = (
        isinstance(learning_type, str)
        and learning_type.strip().lower() == "supervised"
    )
    if not is_supervised:
        return "Unsupervised", None
    if target_column is None or target_column not in columns:
        if from_regex:
            logging.warning(f"Regex fallback: Invalid target column '{target_column}' for supervised. Defaulting to Unsupervised.")
        else:
            logging.warning(f"LLM suggested supervised learning but target column '{target_column}' is invalid or not found. Defaulting to Unsupervised.")
        return "Unsupervised", None
    return "Supervised", target_column


def get_learning_type():
    """Ask the LLM whether the active dataset is supervised or unsupervised.

    The prompt contains the dataset's column dtypes and its first rows, and
    asks for a strict JSON reply naming the learning type and target column.

    Returns:
        400 with ``{"error": ...}`` when no dataset is loaded; otherwise a JSON
        response ``{"learning_type": ..., "target_column": ...}``. Any parsing
        problem, invalid target column, or unexpected error (including a
        failed LLM call) falls back to ``"Unsupervised"`` with a null target.
    """
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting learning type.")
        return jsonify({"error": "No dataset uploaded yet."}), 400

    dtypes_str = df.dtypes.to_string()
    prompt = (
        "You are an expert data scientist. Your task is to analyze a dataset and determine its learning type (supervised or unsupervised). "
        "If it's a supervised learning problem, you MUST identify the single target column that the other columns would predict. "
        "A target column is typically a label, outcome, or value that is being predicted (e.g., 'price', 'churn', 'diagnosis', 'category', 'sales'). "
        "If no such clear target column exists, it's an unsupervised problem. "
        "Respond ONLY with a JSON object, and nothing else. Do NOT include any introductory/concluding remarks, explanations, or markdown outside the JSON. "
        "The JSON must strictly follow this format: "
        "{\"learning_type\": \"Supervised\", \"target_column\": \"your_target_column_name\"} "
        "OR "
        "{\"learning_type\": \"Unsupervised\", \"target_column\": null}. "
        "\n\n"
        f"COLUMNS AND DATA TYPES:\n{dtypes_str}\n\n"
        f"DATA SAMPLE:\n{df.head().to_string()}"
    )

    try:
        response_text = query_dataset_with_groq(dataset_name, prompt).strip()
        logging.info(f"Raw LLM response for learning type: {response_text}")

        parsed = _parse_learning_type_reply(response_text)
        if parsed is None:
            logging.error("Could not parse LLM response for learning type using regex fallback. Defaulting to Unsupervised.")
            learning_type, target_column = "Unsupervised", None
        else:
            raw_type, raw_target, from_regex = parsed
            learning_type, target_column = _resolve_learning_type(
                raw_type, raw_target, df.columns, from_regex=from_regex
            )
        return jsonify({"learning_type": learning_type, "target_column": target_column})
    except Exception as e:
        logging.error(f"An unexpected error occurred while determining learning type: {str(e)}", exc_info=True)
        # Fall back to a safe default if the LLM call or the processing fails.
        return jsonify({"learning_type": "Unsupervised", "target_column": None})
def train_model_api():
    """Train a supervised or unsupervised model on the active dataset.

    Reads ``model_name``, ``target_col`` and ``learning_type`` from the JSON
    request body. A ``learning_type`` of ``"Supervised"`` trains via
    ``train_supervised`` and stores the results for later plots; any other
    value trains via ``train_unsupervised``.

    Returns:
        200 with a success message plus ``metrics`` (supervised) or ``result``
        (unsupervised);
        400 when the body is not a JSON object, no dataset is loaded, or a
        supervised request has no target column;
        500 when training fails or raises.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing body, wrong content type, or a JSON value that is not an object.
        logging.warning("Model training request did not contain a JSON object.")
        return jsonify({"error": "Request body must be a JSON object."}), 400

    model_name = data.get('model_name')
    target_col = data.get('target_col')
    learning_type = data.get('learning_type')

    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting model training.")
        return jsonify({"error": "No dataset uploaded yet."}), 400

    try:
        if learning_type == "Supervised":
            # The literal string 'None' is also treated as "no column selected".
            if not target_col or target_col == 'None':
                logging.warning("No target column provided for supervised training.")
                return jsonify({"error": "Please select a target column for supervised learning."}), 400

            model, metrics, y_test, y_pred, y_pred_proba, X_test = train_supervised(df, target_col, model_name)
            if model:
                store_model_results(dataset_name, model, y_test, y_pred, y_pred_proba, X_test)
                logging.info(f"{model_name} trained successfully for supervised learning.")
                return jsonify({"message": f"{model_name} trained successfully.", "metrics": metrics})
            # On failure the second return value carries the reason.
            logging.error(f"Failed to train {model_name} for supervised learning. Reason: {metrics}")
            return jsonify({"error": f"Failed to train {model_name}. Reason: {metrics}"}), 500

        # Anything other than "Supervised" is handled as unsupervised.
        model, result = train_unsupervised(df, model_name)
        if model:
            logging.info(f"{model_name} trained successfully for unsupervised learning.")
            # Array-like results are converted to plain lists so they serialize as JSON.
            serializable = result.tolist() if hasattr(result, 'tolist') else result
            return jsonify({"message": f"{model_name} trained successfully.", "result": serializable})
        logging.error(f"Failed to train {model_name} for unsupervised learning. Reason: {result}")
        return jsonify({"error": f"Failed to train {model_name}. Reason: {result}"}), 500
    except Exception as e:
        # Return a JSON error, like the other endpoints, instead of letting
        # the exception propagate out of the view.
        logging.error(f"An unexpected error occurred while training {model_name}: {e}", exc_info=True)
        return jsonify({"error": f"Failed to train {model_name}. Reason: {e}"}), 500
def _figure_to_base64_png(fig):
    """Render ``fig`` as PNG and return it base64-encoded; always closes ``fig``."""
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', bbox_inches='tight')
    finally:
        # Close even when rendering fails so figures do not accumulate in pyplot.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode('utf-8')


def generate_plot_api():
    """Generate the requested plot and return it as a base64-encoded PNG.

    Reads ``plot_type``, ``col1``, ``col2`` (and ``color_col`` for scatter
    plots) from the JSON request body. Clustering-style plots run on the output
    of ``prepare_data``; model-evaluation plots use the results stored by the
    last supervised training run.

    Returns:
        200 with ``{"image": <base64 PNG>}``;
        400 when the body is not a JSON object, no dataset is loaded, the plot
        type is unsupported, the plot's prerequisites are missing, or the plot
        function reports an error;
        500 for unexpected failures.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing body, wrong content type, or a JSON value that is not an object.
        logging.warning("Plot request did not contain a JSON object.")
        return jsonify({"error": "Request body must be a JSON object."}), 400

    plot_type = data.get('plot_type')
    col1 = data.get('col1')
    col2 = data.get('col2')

    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset loaded when requesting plot generation.")
        return jsonify({"error": "No data loaded."}), 400

    plot_functions = {
        "Histogram": plot_histogram,
        "Bar": plot_bar,
        "Scatter": plot_scatter,
        "Box": plot_box,
        "Pie": plot_pie,
        "Heatmap": plot_heatmap,
        "Elbow Curve": plot_elbow_curve,
        "Cluster Plot": plot_cluster_plot,
        "Dendrogram": plot_dendrogram,
        "t-SNE": plot_tsne,
        "Confusion Matrix": plot_confusion_matrix,
        "ROC Curve": plot_roc_curve,
        "Feature Importance Plot": plot_feature_importance,
    }
    # The isinstance check keeps an unhashable value (list/dict) from raising
    # during the dictionary lookup.
    if not isinstance(plot_type, str) or plot_type not in plot_functions:
        logging.warning(f"Unsupported plot type requested: {plot_type}")
        return jsonify({"error": "Plot not supported."}), 400

    plot_fn = plot_functions[plot_type]
    fig, err = None, None
    try:
        if plot_type == "Scatter":
            fig, err = plot_fn(df, col1, col2, data.get('color_col'))
        elif plot_type == "Box":
            fig, err = plot_fn(df, col1, col2)
        elif plot_type == "Heatmap":
            fig, err = plot_fn(df)
        elif plot_type in ("Elbow Curve", "Dendrogram", "t-SNE"):
            # These plots all take only the prepared feature matrix.
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_fn(X_prepared)
        elif plot_type == "Cluster Plot":
            from utils.data_cleaner import prepare_data
            from sklearn.cluster import KMeans
            X_prepared, _ = prepare_data(df)
            if X_prepared.empty:
                return jsonify({"error": "Data is empty after cleaning for Cluster Plot."}), 400
            # Default to 3 clusters, capped by the number of rows. The data is
            # known to be non-empty here, so this is always at least 1.
            n_clusters = min(3, len(X_prepared))
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            kmeans.fit(X_prepared)
            fig, err = plot_fn(X_prepared, labels=kmeans.labels_)
        elif plot_type in ("Confusion Matrix", "ROC Curve", "Feature Importance Plot"):
            model_results = get_model_results(dataset_name)
            if not model_results:
                logging.warning(f"No trained model found for {plot_type} plot.")
                return jsonify({"error": "No trained model found. Please train a supervised model first."}), 400

            model = model_results['model']
            y_test = model_results['y_test']
            y_pred = model_results['y_pred']
            y_pred_proba = model_results['y_pred_proba']
            X_test = model_results['X_test']

            if plot_type == "Confusion Matrix":
                # Class names are taken from the sorted unique values of y_test.
                class_names = [str(c) for c in sorted(pd.Series(y_test).unique())]
                fig, err = plot_fn(y_test, y_pred, class_names)
            elif plot_type == "ROC Curve":
                if y_pred_proba is None:
                    logging.warning("ROC Curve requested but model does not provide probability predictions.")
                    return jsonify({"error": "ROC Curve requires probability predictions, which this model does not provide."}), 400
                fig, err = plot_fn(y_test, y_pred_proba)
            else:  # Feature Importance Plot
                if not hasattr(model, 'feature_importances_'):
                    logging.warning("Feature Importance Plot requested but model does not have feature importances.")
                    return jsonify({"error": "Model does not have feature importances to plot."}), 400
                # Feature names come from the columns of X_test.
                feature_names = X_test.columns.tolist()
                fig, err = plot_fn(model, feature_names)
        else:
            # Single-column plots: Histogram, Bar, Pie.
            fig, err = plot_fn(df, col1)

        if err:
            if fig is not None:
                # Do not leak a figure that came back alongside an error.
                plt.close(fig)
            logging.error(f"Plot generation error for {plot_type}: {err}")
            return jsonify({"error": err}), 400
        if fig is None:
            logging.error(f"Plot function for {plot_type} returned neither a figure nor an error.")
            return jsonify({"error": f"An internal server error occurred: no figure was produced for {plot_type}."}), 500

        return jsonify({'image': _figure_to_base64_png(fig)})
    except Exception as e:
        logging.error(f"An unexpected error occurred during plot generation for {plot_type}: {e}", exc_info=True)
        return jsonify({"error": f"An internal server error occurred: {str(e)}"}), 500
def plot_options():
    """Return the plot types offered for the active dataset's learning type.

    The learning type is obtained by calling ``get_learning_type`` directly,
    which issues an LLM query each time this endpoint is hit.

    Returns:
        400 with ``{"error": ...}`` when no dataset is loaded; otherwise
        ``{"plots": [...]}`` containing the general-purpose plots followed by
        either the model-evaluation plots (supervised) or the clustering plots
        (unsupervised).
    """
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting plot options.")
        return jsonify({"error": "No dataset uploaded yet."}), 400

    learning_type_response = get_learning_type()
    if isinstance(learning_type_response, tuple):
        # get_learning_type returns (response, status) on its error path.
        learning_type_response = learning_type_response[0]
    learning_type_data = learning_type_response.get_json() or {}
    learning_type = learning_type_data.get('learning_type', 'Unsupervised')

    # "Scatter" is part of the common set, so it is always offered and no
    # separate numeric-column check is needed.
    common_plots = ["Histogram", "Bar", "Scatter", "Box", "Pie", "Heatmap"]
    if learning_type == "Supervised":
        plots = common_plots + ["Confusion Matrix", "ROC Curve", "Feature Importance Plot"]
    else:
        plots = common_plots + ["Cluster Plot", "Elbow Curve", "Dendrogram", "t-SNE"]
    return jsonify({"plots": plots})
def ask_question_api():
    """Forward a user question about the active dataset to the LLM.

    Reads ``user_query`` from the JSON request body.

    Returns:
        200 with ``{"answer": ...}``;
        400 with ``{"error": ...}`` when the body is not a JSON object or the
        query is missing, empty, or only whitespace;
        500 with ``{"error": ...}`` when the LLM call raises.
    """
    data = request.get_json(silent=True)
    # A missing body or a non-object JSON value is treated like a missing query.
    user_query = data.get('user_query') if isinstance(data, dict) else None
    if isinstance(user_query, str) and not user_query.strip():
        user_query = None
    if not user_query:
        logging.warning("Empty user query received for AI assistant.")
        return jsonify({"error": "Please ask a question."}), 400

    try:
        answer = query_dataset_with_groq(dataset_name, user_query)
    except Exception as e:
        # Return a JSON error, like the other endpoints, instead of letting
        # the exception propagate out of the view.
        logging.error(f"An unexpected error occurred while querying the AI assistant: {e}", exc_info=True)
        return jsonify({"error": f"An internal server error occurred: {str(e)}"}), 500
    return jsonify({"answer": answer})
if __name__ == '__main__':
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute code. It is therefore opt-in through
    # the FLASK_DEBUG environment variable instead of being always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    logging.info("Starting Flask application...")
    app.run(debug=debug_enabled, port=5001)