File size: 16,179 Bytes
aa68823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e56b0
aa68823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e56b0
aa68823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
import matplotlib
matplotlib.use('Agg')
from flask import Flask, request, jsonify, send_from_directory
import pandas as pd
from rag.memory import store_dataset, get_dataset, store_model_results, get_model_results
from rag.rag_query import query_dataset_with_groq
from models.supervised import train_model as train_supervised
from models.unsupervised import train_unsupervised
from visuals.charts import (
    plot_histogram, plot_bar, plot_scatter, plot_box, plot_pie, plot_heatmap,
    plot_confusion_matrix, plot_roc_curve, plot_feature_importance,
    plot_elbow_curve, plot_cluster_plot, plot_dendrogram, plot_tsne
)
import os
import logging
import json
import re
import matplotlib.pyplot as plt
import io
import base64
import numpy as np

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Flask serves the SPA assets from ./frontend via the static routes below.
app = Flask(__name__, static_folder='frontend')
# Single in-memory dataset slot: every upload replaces the "active" dataset,
# so the app effectively supports one working dataset at a time.
dataset_name = "active_dataset"

@app.route('/')
def index():
    """Serve the application's entry-point HTML from the static folder."""
    return send_from_directory(app.static_folder, 'index.html')

@app.route('/<path:path>')
def static_proxy(path):
    """Serve any other static asset (JS, CSS, images) out of the frontend directory."""
    return send_from_directory(app.static_folder, path)

@app.route('/api/upload', methods=['POST'])
def upload_csv():
    """Accept a CSV upload, parse it into a DataFrame, and store it as the active dataset.

    Returns 400 when the multipart payload is missing or the filename is empty,
    500 when parsing/storing fails, and a summary message on success.
    """
    # Guard clauses: reject malformed multipart requests up front.
    uploaded = request.files.get('file')
    if uploaded is None:
        logging.warning("No file part in upload request.")
        return jsonify({"error": "No file part"}), 400
    if uploaded.filename == '':
        logging.warning("No selected file in upload request.")
        return jsonify({"error": "No selected file"}), 400
    try:
        frame = pd.read_csv(uploaded)
        store_dataset(dataset_name, frame)
        summary = f"Uploaded {frame.shape[0]} rows and {frame.shape[1]} columns."
        logging.info(summary)
        return jsonify({"message": summary})
    except Exception as exc:
        logging.error(f"Error uploading file: {exc}")
        return jsonify({"error": str(exc)}), 500

@app.route('/api/columns', methods=['GET'])
def get_columns():
    """Return the active dataset's column names; an empty list when nothing is loaded."""
    df = get_dataset(dataset_name)
    if df is None:
        # No dataset yet — not an error for this endpoint, just nothing to list.
        logging.info("No dataset loaded when requesting columns.")
        return jsonify({"columns": []})
    return jsonify({"columns": list(df.columns)})

def _coerce_learning_type(learning_type, target_column, df):
    """Validate an LLM-proposed (learning_type, target_column) pair against *df*.

    Returns a consistent pair: "Supervised" only when the proposed target column
    actually exists in the dataframe; otherwise falls back to ("Unsupervised", None).
    """
    if learning_type == "Supervised":
        if target_column is None or target_column not in df.columns:
            logging.warning(f"LLM suggested supervised learning but target column '{target_column}' is invalid or not found. Defaulting to Unsupervised.")
            return "Unsupervised", None
        return "Supervised", target_column
    # Unsupervised: never report a target column.
    return "Unsupervised", None

@app.route('/api/learning_type', methods=['GET'])
def get_learning_type():
    """Determine and return the learning type (supervised/unsupervised) and target column using LLM intelligence.

    Asks the Groq-backed RAG pipeline to classify the dataset; parses the JSON
    reply, with a regex fallback, and validates any proposed target column
    against the actual dataframe columns before returning it.
    """
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting learning type.")
        return jsonify({"error": "No dataset uploaded yet."}), 400

    dtypes_str = df.dtypes.to_string()
    prompt = (
        "You are an expert data scientist. Your task is to analyze a dataset and determine its learning type (supervised or unsupervised). "
        "If it's a supervised learning problem, you MUST identify the single target column that the other columns would predict. "
        "A target column is typically a label, outcome, or value that is being predicted (e.g., 'price', 'churn', 'diagnosis', 'category', 'sales'). "
        "If no such clear target column exists, it's an unsupervised problem. "
        "Respond ONLY with a JSON object, and nothing else. Do NOT include any introductory/concluding remarks, explanations, or markdown outside the JSON. "
        "The JSON must strictly follow this format: "
        "{\"learning_type\": \"Supervised\", \"target_column\": \"your_target_column_name\"} "
        "OR "
        "{\"learning_type\": \"Unsupervised\", \"target_column\": null}. "
        "\n\n"
        f"COLUMNS AND DATA TYPES:\n{dtypes_str}\n\n"
        f"DATA SAMPLE:\n{df.head().to_string()}"
    )

    try:
        response_text = query_dataset_with_groq(dataset_name, prompt).strip()
        logging.info(f"Raw LLM response for learning type: {response_text}")

        try:
            # FIX: LLMs frequently wrap JSON in markdown fences or surrounding prose
            # despite the prompt's instructions, which made json.loads fail and forced
            # the fragile regex fallback. Extract the first {...} span before parsing.
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            data = json.loads(json_match.group(0) if json_match else response_text)
            learning_type, target_column = _coerce_learning_type(
                data.get("learning_type", "Unsupervised"), data.get("target_column"), df
            )
            logging.info(f"Parsed LLM response - learning_type: {learning_type}, target_column: {target_column}")

        except json.JSONDecodeError:
            logging.error(f"LLM response is not a valid JSON: {response_text}. Attempting regex fallback.")
            # Fallback: scrape the two fields out of free-form text (less reliable).
            match = re.search(r'"learning_type"\s*:\s*"(Supervised|Unsupervised)"(?:,\s*"target_column"\s*:\s*"?([a-zA-Z0-9_]+)?"?)?', response_text)
            if match:
                learning_type, target_column = _coerce_learning_type(
                    match.group(1), match.group(2) if match.group(2) else None, df
                )
                logging.info(f"Regex fallback parsed - learning_type: {learning_type}, target_column: {target_column}")
            else:
                logging.error("Could not parse LLM response for learning type using regex fallback. Defaulting to Unsupervised.")
                learning_type = "Unsupervised"
                target_column = None

        return jsonify({"learning_type": learning_type, "target_column": target_column})

    except Exception as e:
        logging.error(f"An unexpected error occurred while determining learning type: {str(e)}", exc_info=True)
        # Fail safe: any error during the Groq call or processing yields the
        # conservative default rather than a 5xx, so the UI stays usable.
        return jsonify({"learning_type": "Unsupervised", "target_column": None})

@app.route('/api/train', methods=['POST'])
def train_model_api():
    """Train a supervised or unsupervised model on the active dataset.

    Expects JSON with 'model_name', 'learning_type', and (for supervised runs)
    'target_col'. Stores supervised results for later plotting endpoints.
    """
    payload = request.json
    model_name = payload.get('model_name')
    target_col = payload.get('target_col')
    learning_type = payload.get('learning_type')

    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting model training.")
        return jsonify({"error": "No dataset uploaded yet."}), 400

    if learning_type != "Supervised":
        # Unsupervised path: no target column is involved.
        model, result = train_unsupervised(df, model_name)
        if not model:
            logging.error(f"Failed to train {model_name} for unsupervised learning. Reason: {result}")
            return jsonify({"error": f"Failed to train {model_name}. Reason: {result}"}), 500
        logging.info(f"{model_name} trained successfully for unsupervised learning.")
        serializable = result.tolist() if hasattr(result, 'tolist') else result
        return jsonify({"message": f"{model_name} trained successfully.", "result": serializable})

    # Supervised path: a real target column is mandatory.
    if not target_col or target_col == 'None':
        logging.warning("No target column provided for supervised training.")
        return jsonify({"error": "Please select a target column for supervised learning."}), 400

    model, metrics, y_test, y_pred, y_pred_proba, X_test = train_supervised(df, target_col, model_name)
    if not model:
        logging.error(f"Failed to train {model_name} for supervised learning. Reason: {metrics}")
        return jsonify({"error": f"Failed to train {model_name}. Reason: {metrics}"}), 500
    # Persist artifacts so the confusion-matrix / ROC / importance plots can use them.
    store_model_results(dataset_name, model, y_test, y_pred, y_pred_proba, X_test)
    logging.info(f"{model_name} trained successfully for supervised learning.")
    return jsonify({"message": f"{model_name} trained successfully.", "metrics": metrics})

@app.route('/api/plot', methods=['POST'])
def generate_plot_api():
    """Generate and return a plot based on the requested type and columns.

    Expects JSON with 'plot_type' and, depending on the plot, 'col1'/'col2'/
    'color_col'. Returns {"image": <base64 PNG>} on success, or a JSON error
    with a 400/500 status.
    """
    data = request.json
    plot_type = data.get('plot_type')
    col1 = data.get('col1')
    col2 = data.get('col2')

    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset loaded when requesting plot generation.")
        return jsonify({"error": "No data loaded."}), 400

    # Dispatch table: UI plot label -> chart-drawing function. Each function is
    # expected to return a (figure, error) pair.
    plot_functions = {
        "Histogram": plot_histogram,
        "Bar": plot_bar,
        "Scatter": plot_scatter,
        "Box": plot_box,
        "Pie": plot_pie,
        "Heatmap": plot_heatmap,
        "Elbow Curve": plot_elbow_curve,
        "Cluster Plot": plot_cluster_plot,
        "Dendrogram": plot_dendrogram,
        "t-SNE": plot_tsne,
        "Confusion Matrix": plot_confusion_matrix,
        "ROC Curve": plot_roc_curve,
        "Feature Importance Plot": plot_feature_importance
    }

    if plot_type not in plot_functions:
        logging.warning(f"Unsupported plot type requested: {plot_type}")
        return jsonify({"error": "Plot not supported."}), 400

    fig, err = None, None
    try:
        # Each branch calls the plot function with the argument shape it needs.
        if plot_type == "Scatter":
            fig, err = plot_functions[plot_type](df, col1, col2, data.get('color_col'))
        elif plot_type == "Box":
            fig, err = plot_functions[plot_type](df, col1, col2)
        elif plot_type == "Heatmap":
            fig, err = plot_functions[plot_type](df)
        elif plot_type == "Elbow Curve":
            # Lazy import: keeps heavier preprocessing deps out of app startup.
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type == "Cluster Plot":
            from utils.data_cleaner import prepare_data
            from sklearn.cluster import KMeans
            X_prepared, _ = prepare_data(df)
            if X_prepared.empty:
                return jsonify({"error": "Data is empty after cleaning for Cluster Plot."}), 400
            # Perform KMeans clustering (e.g., with 3 clusters)
            n_clusters = 3 # Default number of clusters
            if len(X_prepared) < n_clusters:
                n_clusters = len(X_prepared) # Adjust n_clusters if data points are fewer
            if n_clusters == 0:
                return jsonify({"error": "Not enough data points to form clusters."}), 400
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            kmeans.fit(X_prepared)
            labels = kmeans.labels_
            fig, err = plot_functions[plot_type](X_prepared, labels=labels)
        elif plot_type == "Dendrogram":
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type == "t-SNE":
            from utils.data_cleaner import prepare_data
            X_prepared, _ = prepare_data(df)
            fig, err = plot_functions[plot_type](X_prepared)
        elif plot_type in ["Confusion Matrix", "ROC Curve", "Feature Importance Plot"]:
            # Model-evaluation plots require artifacts stored by a prior
            # supervised training run (see /api/train).
            model_results = get_model_results(dataset_name)
            if not model_results:
                logging.warning(f"No trained model found for {plot_type} plot.")
                return jsonify({"error": "No trained model found. Please train a supervised model first."}), 400
            
            model = model_results['model']
            y_test = model_results['y_test']
            y_pred = model_results['y_pred']
            y_pred_proba = model_results['y_pred_proba']
            X_test = model_results['X_test']

            if plot_type == "Confusion Matrix":
                # Need to get class names. For simplicity, using unique values from y_test.
                class_names = [str(c) for c in sorted(pd.Series(y_test).unique())]
                fig, err = plot_functions[plot_type](y_test, y_pred, class_names)
            elif plot_type == "ROC Curve":
                if y_pred_proba is None:
                    logging.warning("ROC Curve requested but model does not provide probability predictions.")
                    return jsonify({"error": "ROC Curve requires probability predictions, which this model does not provide."}), 400
                fig, err = plot_functions[plot_type](y_test, y_pred_proba)
            elif plot_type == "Feature Importance Plot":
                if not hasattr(model, 'feature_importances_'):
                    logging.warning("Feature Importance Plot requested but model does not have feature importances.")
                    return jsonify({"error": "Model does not have feature importances to plot."}), 400
                # Feature names are from X_test columns
                feature_names = X_test.columns.tolist()
                fig, err = plot_functions[plot_type](model, feature_names)
        else:
            # Default case for plots that only need one column (e.g., Histogram, Bar, Pie)
            fig, err = plot_functions[plot_type](df, col1)

        if err:
            logging.error(f"Plot generation error for {plot_type}: {err}")
            return jsonify({"error": err}), 400
        
        # Save plot to a BytesIO object and encode to base64
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)  # free the figure to avoid matplotlib memory growth
        buf.seek(0)
        img_str = base64.b64encode(buf.read()).decode('utf-8')
        return jsonify({'image': img_str})
    except Exception as e:
        logging.error(f"An unexpected error occurred during plot generation for {plot_type}: {e}", exc_info=True)
        return jsonify({"error": f"An internal server error occurred: {str(e)}"}), 500

@app.route('/api/plot_options', methods=['GET'])
def plot_options():
    """Return the list of plot types applicable to the active dataset.

    The list depends on the dataset's learning type (supervised datasets get
    model-evaluation plots; unsupervised ones get clustering plots). Scatter is
    only offered when the dataset has at least two numeric columns.
    """
    df = get_dataset(dataset_name)
    if df is None:
        logging.warning("No dataset uploaded when requesting plot options.")
        return jsonify({"error": "No dataset uploaded yet."}), 400

    # Reuse the learning-type endpoint's logic directly (it returns a Flask
    # response, so unwrap the JSON payload).
    learning_type_response = get_learning_type()
    learning_type_data = learning_type_response.get_json()
    learning_type = learning_type_data.get('learning_type', 'Unsupervised')

    if learning_type == "Supervised":
        plots = ["Histogram", "Bar", "Scatter", "Box", "Pie", "Heatmap", "Confusion Matrix", "ROC Curve", "Feature Importance Plot"]
    else:
        plots = ["Histogram", "Bar", "Scatter", "Box", "Pie", "Heatmap", "Cluster Plot", "Elbow Curve", "Dendrogram", "t-SNE"]

    # FIX: the original guard ('insert Scatter if >= 2 numeric columns and not
    # already present') was dead code — "Scatter" is unconditionally in both
    # lists, so nothing ever happened. Enforce the stated intent instead: a
    # scatter plot needs two numeric columns, so drop it when the dataset
    # cannot support one.
    numeric_cols = df.select_dtypes(include=np.number).columns
    if len(numeric_cols) < 2 and "Scatter" in plots:
        plots.remove("Scatter")

    return jsonify({"plots": plots})

@app.route('/api/ask', methods=['POST'])
def ask_question_api():
    """Forward a free-text user question about the dataset to the RAG pipeline."""
    question = request.json.get('user_query')
    if not question:
        # Reject blank/missing questions before spending an LLM call.
        logging.warning("Empty user query received for AI assistant.")
        return jsonify({"error": "Please ask a question."}), 400
    return jsonify({"answer": query_dataset_with_groq(dataset_name, question)})

if __name__ == '__main__':
    logging.info("Starting Flask application...")
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader — fine for
    # local development, but it must not be used in production deployments.
    app.run(debug=True, port=5001)