Spaces:

Pulastya0
/

Data-Science-Agent

Running

App Files Files Community

Pulastya B committed on Jan 31

Commit

08646ab

1 Parent(s): 293e0b4

Fixed Scalability issues

Browse files

Files changed (3) hide show

src/api/app.py +172 -40
src/orchestrator.py +28 -12
src/tools/model_training.py +3 -4

src/api/app.py CHANGED Viewed

@@ -151,8 +151,77 @@ class ProgressEventManager:
         if session_id in self.session_status:
             del self.session_status[session_id]
-# Global event manager
-event_manager = ProgressEventManager()
 # Mount static files for React frontend
 frontend_path = Path(__file__).parent.parent.parent / "FRRONTEEEND" / "dist"
@@ -166,18 +235,19 @@ async def startup_event():
     """Initialize DataScienceCopilot on service startup."""
     global agent
     try:
-        logger.info("Initializing DataScienceCopilot...")
         provider = os.getenv("LLM_PROVIDER", "mistral")
-        # Disable compact prompts to enable multi-agent architecture
-        # Multi-agent system has focused prompts per specialist (~3K tokens each)
         use_compact = False  # Always use multi-agent routing
         agent = DataScienceCopilot(
             reasoning_effort="medium",
             provider=provider,
             use_compact_prompts=use_compact
         )
-        logger.info(f"✅ Agent initialized with provider: {agent.provider}")
         logger.info("🤖 Multi-agent architecture enabled with 5 specialists")
     except Exception as e:
         logger.error(f"❌ Failed to initialize agent: {e}")
@@ -311,34 +381,50 @@ class AnalysisRequest(BaseModel):
 def run_analysis_background(file_path: str, task_description: str, target_col: Optional[str],
                             use_cache: bool, max_iterations: int, session_id: str):
     """Background task to run analysis and emit events."""
     try:
-        logger.info(f"[BACKGROUND] Starting analysis for session {session_id}")
-        result = agent.analyze(
-            file_path=file_path,
-            task_description=task_description,
-            target_col=target_col,
-            use_cache=use_cache,
-            max_iterations=max_iterations
-        )
-        logger.info(f"[BACKGROUND] Analysis completed for session {session_id}")
-        # Send completion event
-        progress_manager.emit(session_id, {
-            "type": "analysis_complete",
-            "status": result.get("status"),
-            "message": "✅ Analysis completed successfully!",
-            "result": result
-        })
-    except Exception as e:
-        logger.error(f"[BACKGROUND] Analysis failed for session {session_id}: {e}")
-        progress_manager.emit(session_id, {
-            "type": "analysis_failed",
-            "error": str(e),
-            "message": f"❌ Analysis failed: {str(e)}"
-        })
 @app.post("/run-async")
@@ -357,9 +443,10 @@ async def run_analysis_async(
     if agent is None:
         raise HTTPException(status_code=503, detail="Agent not initialized")
-    # Get session UUID immediately
-    session_id = agent.session.session_id if hasattr(agent, 'session') and agent.session else "default"
-    logger.info(f"[ASYNC] Created session: {session_id}")
     # Handle file upload
     temp_file_path = None
@@ -372,6 +459,28 @@ async def run_analysis_async(
             shutil.copyfileobj(file.file, buffer)
         logger.info(f"[ASYNC] File saved: {file.filename}")
     # Start background analysis
     background_tasks.add_task(
@@ -427,20 +536,43 @@ async def run_analysis(
     if agent is None:
         raise HTTPException(status_code=503, detail="Agent not initialized")
     # Handle follow-up requests (no file, using session memory)
     if file is None:
         logger.info(f"Follow-up request without file, using session memory")
         logger.info(f"Task: {task_description}")
         # Get the agent's actual session UUID for SSE routing
-        actual_session_id = agent.session.session_id if hasattr(agent, 'session') and agent.session else "default"
         print(f"[SSE] Follow-up using agent session UUID: {actual_session_id}")
         # NO progress_callback - orchestrator emits directly to UUID
         try:
             # Agent's session memory should resolve file_path from context
-            result = agent.analyze(
                 file_path="",  # Empty - will be resolved by session memory
                 task_description=task_description,
                 target_col=target_col,
@@ -526,14 +658,14 @@ async def run_analysis(
         logger.info(f"File saved successfully: {file.filename} ({os.path.getsize(temp_file_path)} bytes)")
         # Get the agent's actual session UUID for SSE routing (BEFORE analyze())
-        actual_session_id = agent.session.session_id if hasattr(agent, 'session') and agent.session else "default"
         print(f"[SSE] File upload using agent session UUID: {actual_session_id}")
         # NO progress_callback - orchestrator emits directly to UUID
         # Call existing agent logic
         logger.info(f"Starting analysis with task: {task_description}")
-        result = agent.analyze(
             file_path=str(temp_file_path),
             task_description=task_description,
             target_col=target_col,

         if session_id in self.session_status:
             del self.session_status[session_id]
+# 👥 MULTI-USER SUPPORT: Per-session agent instances
+# Instead of one global agent, create isolated instances per session
+# This prevents users from interfering with each other's workflows
+agent_cache: Dict[str, DataScienceCopilot] = {}  # session_id -> agent instance
+agent_cache_lock = asyncio.Lock()
+MAX_CACHED_AGENTS = 10  # Limit memory usage
+logger.info("👥 Multi-user agent cache initialized")
+# Legacy global agent for backward compatibility (will be deprecated)
+agent = None
+# 👥 MULTI-USER SUPPORT: Per-session agent instances
+# Instead of one global agent, create isolated instances per session
+# This prevents users from interfering with each other's workflows
+agent_cache: Dict[str, DataScienceCopilot] = {}  # session_id -> agent instance
+agent_cache_lock = asyncio.Lock()
+MAX_CACHED_AGENTS = 10  # Limit memory usage
+logger.info("👥 Multi-user agent cache initialized")
+# Legacy global agent for backward compatibility (will be deprecated)
+agent = None
+async def get_agent_for_session(session_id: str) -> DataScienceCopilot:
+    """
+    Get or create an isolated agent instance for a session.
+    This ensures each user gets their own agent with isolated state,
+    preventing session collisions and race conditions.
+    Args:
+        session_id: Unique session identifier
+    Returns:
+        DataScienceCopilot instance for this session
+    """
+    async with agent_cache_lock:
+        # Return existing agent if cached
+        if session_id in agent_cache:
+            logger.info(f"[♻️] Reusing cached agent for session {session_id[:8]}...")
+            return agent_cache[session_id]
+        # Create new agent instance
+        logger.info(f"[🆕] Creating new agent for session {session_id[:8]}...")
+        provider = os.getenv("LLM_PROVIDER", "mistral")
+        new_agent = DataScienceCopilot(
+            reasoning_effort="medium",
+            provider=provider,
+            use_compact_prompts=False,  # Multi-agent architecture
+            session_id=session_id  # Pass session_id for isolation
+        )
+        # Cache management: Remove oldest if cache is full
+        if len(agent_cache) >= MAX_CACHED_AGENTS:
+            oldest_session = next(iter(agent_cache))
+            logger.info(f"[🗑️] Cache full, removing session {oldest_session[:8]}...")
+            del agent_cache[oldest_session]
+        agent_cache[session_id] = new_agent
+        logger.info(f"✅ Agent created for session {session_id[:8]} (cache: {len(agent_cache)}/{MAX_CACHED_AGENTS})")
+        return new_agent
+# 🔒 REQUEST QUEUING: Global lock to prevent concurrent workflows
+# This ensures only one analysis runs at a time, preventing:
+# - Race conditions on file writes
+# - Memory exhaustion from parallel model training
+# - Session state corruption
+workflow_lock = asyncio.Lock()
+logger.info("🔒 Workflow lock initialized for request queuing")
 # Mount static files for React frontend
 frontend_path = Path(__file__).parent.parent.parent / "FRRONTEEEND" / "dist"
     """Initialize DataScienceCopilot on service startup."""
     global agent
     try:
+        logger.info("Initializing legacy global agent for health checks...")
         provider = os.getenv("LLM_PROVIDER", "mistral")
         use_compact = False  # Always use multi-agent routing
+        # Create one agent for health checks only
+        # Real requests will use get_agent_for_session() for isolation
         agent = DataScienceCopilot(
             reasoning_effort="medium",
             provider=provider,
             use_compact_prompts=use_compact
         )
+        logger.info(f"✅ Health check agent initialized with provider: {agent.provider}")
+        logger.info("👥 Per-session agents enabled - each user gets isolated instance")
         logger.info("🤖 Multi-agent architecture enabled with 5 specialists")
     except Exception as e:
         logger.error(f"❌ Failed to initialize agent: {e}")
 def run_analysis_background(file_path: str, task_description: str, target_col: Optional[str],
                             use_cache: bool, max_iterations: int, session_id: str):
     """Background task to run analysis and emit events."""
+    async def _run_with_lock():
+        """Wrap analysis in lock to ensure sequential execution."""
+        async with workflow_lock:
+            try:
+                logger.info(f"[BACKGROUND] Starting analysis for session {session_id[:8]}...")
+                # 👥 Get isolated agent for this session
+                session_agent = await get_agent_for_session(session_id)
+                result = session_agent.analyze(
+                    file_path=file_path,
+                    task_description=task_description,
+                    target_col=target_col,
+                    use_cache=use_cache,
+                    max_iterations=max_iterations
+                )
+                logger.info(f"[BACKGROUND] Analysis completed for session {session_id[:8]}...")
+                # Send completion event
+                progress_manager.emit(session_id, {
+                    "type": "analysis_complete",
+                    "status": result.get("status"),
+                    "message": "✅ Analysis completed successfully!",
+                    "result": result
+                })
+            except Exception as e:
+                logger.error(f"[BACKGROUND] Analysis failed for session {session_id[:8]}...: {e}")
+                progress_manager.emit(session_id, {
+                    "type": "analysis_failed",
+                    "error": str(e),
+                    "message": f"❌ Analysis failed: {str(e)}"
+                })
+    # Run async function in event loop
+    import asyncio
     try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+    loop.run_until_complete(_run_with_lock())
 @app.post("/run-async")
     if agent is None:
         raise HTTPException(status_code=503, detail="Agent not initialized")
+    # 🆔 Generate unique session ID for this request
+    import uuid
+    session_id = str(uuid.uuid4())
+    logger.info(f"[ASYNC] Created session: {session_id[:8]}...")
     # Handle file upload
     temp_file_path = None
             shutil.copyfileobj(file.file, buffer)
         logger.info(f"[ASYNC] File saved: {file.filename}")
+    else:
+        # 🛡️ VALIDATION: For follow-up queries, check if any cached agent has dataset
+        # Note: In true multi-user setup, you'd need session_id from frontend to match exact session
+        has_dataset = False
+        async with agent_cache_lock:
+            for cached_agent in agent_cache.values():
+                if hasattr(cached_agent, 'session') and cached_agent.session and cached_agent.session.last_dataset:
+                    has_dataset = True
+                    logger.info(f"[ASYNC] Follow-up query using cached session data")
+                    break
+        if not has_dataset:
+            logger.warning("[ASYNC] No file uploaded and no session dataset available")
+            return JSONResponse(
+                content={
+                    "success": False,
+                    "error": "No dataset available",
+                    "message": "Please upload a CSV, Excel, or Parquet file first.",
+                    "session_id": session_id
+                },
+                status_code=400
+            )
     # Start background analysis
     background_tasks.add_task(
     if agent is None:
         raise HTTPException(status_code=503, detail="Agent not initialized")
+    # 🆔 Generate or use provided session ID
+    if not session_id:
+        import uuid
+        session_id = str(uuid.uuid4())
+        logger.info(f"[SYNC] Created new session: {session_id[:8]}...")
+    else:
+        logger.info(f"[SYNC] Using provided session: {session_id[:8]}...")
+    # 👥 Get isolated agent for this session
+    session_agent = await get_agent_for_session(session_id)
     # Handle follow-up requests (no file, using session memory)
     if file is None:
         logger.info(f"Follow-up request without file, using session memory")
         logger.info(f"Task: {task_description}")
+        # 🛡️ VALIDATION: Check if session has a dataset
+        if not (hasattr(session_agent, 'session') and session_agent.session and session_agent.session.last_dataset):
+            logger.warning("No file uploaded and no session dataset available")
+            return JSONResponse(
+                content={
+                    "success": False,
+                    "error": "No dataset available",
+                    "message": "Please upload a CSV, Excel, or Parquet file first before asking questions."
+                },
+                status_code=400
+            )
         # Get the agent's actual session UUID for SSE routing
+        actual_session_id = session_agent.session.session_id if hasattr(session_agent, 'session') and session_agent.session else session_id
         print(f"[SSE] Follow-up using agent session UUID: {actual_session_id}")
         # NO progress_callback - orchestrator emits directly to UUID
         try:
             # Agent's session memory should resolve file_path from context
+            result = session_agent.analyze(
                 file_path="",  # Empty - will be resolved by session memory
                 task_description=task_description,
                 target_col=target_col,
         logger.info(f"File saved successfully: {file.filename} ({os.path.getsize(temp_file_path)} bytes)")
         # Get the agent's actual session UUID for SSE routing (BEFORE analyze())
+        actual_session_id = session_agent.session.session_id if hasattr(session_agent, 'session') and session_agent.session else session_id
         print(f"[SSE] File upload using agent session UUID: {actual_session_id}")
         # NO progress_callback - orchestrator emits directly to UUID
         # Call existing agent logic
         logger.info(f"Starting analysis with task: {task_description}")
+        result = session_agent.analyze(
             file_path=str(temp_file_path),
             task_description=task_description,
             target_col=target_col,

src/orchestrator.py CHANGED Viewed

@@ -402,7 +402,7 @@ class DataScienceCopilot:
             "split_data_strategically": split_data_strategically,
             # Advanced Training (3)
             "hyperparameter_tuning": hyperparameter_tuning,
-            "train_ensemble_models": train_ensemble_models,
             "perform_cross_validation": perform_cross_validation,
             # Business Intelligence (4)
             "perform_cohort_analysis": perform_cohort_analysis,
@@ -554,7 +554,8 @@ When you need to use a tool, respond with a JSON block like this:
 - Keywords: "train model", "predict", "classify", "build model", "forecast"
 - User wants: cleaning + feature engineering + model training
 - **ACTION**: Run full ML workflow (steps 1-15 below)
-- **Example**: "Train a model to predict earthquake magnitude" → Full pipeline
 **E. UNCLEAR/AMBIGUOUS REQUESTS** - Intent is not obvious:
 - User says: "analyze", "look at", "check", "review" (without specifics)
@@ -657,16 +658,16 @@ structure, variable relationships, and expected insights - not hardcoded domain
 8. encode_categorical(latest, method="auto", output="./outputs/data/encoded.csv")
 9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
 10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_baseline_models(encoded, target_col, task_type="auto")
-11. **HYPERPARAMETER TUNING (OPTIONAL - Smart Decision)**:
-    - ⚠️ **WARNING: This tool is VERY expensive and takes 5-10 minutes!**
-    - **When to use**:
-      * User explicitly says "optimize", "tune", "improve", "best model possible" → ALWAYS tune
-      * Best model score < 0.90 → Tune to improve (user expects good accuracy)
-      * Best model score > 0.95 → Skip tuning (already excellent)
     - **How**: hyperparameter_tuning(file_path=encoded, target_col=target_col, model_type="xgboost", n_trials=50)
     - **Large datasets (>100K rows)**: n_trials automatically reduced to 20 to prevent timeout
     - **Only tune the WINNING model** (don't waste time on others)
-    - **Map model names**: XGBoost→"xgboost", RandomForest→"random_forest", Ridge→"ridge", Lasso→use Ridge
     - **Note**: Time features should already be extracted in step 7 (create_time_features)
 12. **CROSS-VALIDATION (OPTIONAL - Production Models)**:
     - IF user says "validate", "production", "robust", "deploy" → ALWAYS cross-validate
@@ -836,7 +837,7 @@ Use specialized tools FIRST. Only use execute_python_code for:
 - train_baseline_models: Trains multiple models automatically
 - **⭐ execute_python_code**: Write and run custom Python code for ANY task not covered by tools (TRUE AI AGENT capability)
 - **execute_code_from_file**: Run existing Python scripts
-- Advanced: hyperparameter_tuning, train_ensemble_models, perform_eda_analysis, handle_imbalanced_data, perform_feature_scaling, detect_anomalies, detect_and_handle_multicollinearity, auto_feature_engineering, forecast_time_series, explain_predictions, generate_business_insights, perform_topic_modeling, extract_image_features, monitor_model_drift
 - NEW Advanced Insights: analyze_root_cause, detect_trends_and_seasonality, detect_anomalies_advanced, perform_hypothesis_testing, analyze_distribution, perform_segment_analysis
 - NEW Automation: auto_ml_pipeline (zero-config full pipeline), auto_feature_selection
 - NEW Visualization: generate_all_plots, generate_data_quality_plots, generate_eda_plots, generate_model_performance_plots, generate_feature_importance_plot
@@ -1020,7 +1021,7 @@ BEFORE calling any training tools, you MUST:
 **Your Tools (6 modeling-focused):**
 - train_baseline_models, hyperparameter_tuning
-- train_ensemble_models, perform_cross_validation
 - generate_model_report, detect_model_issues
 **Your Approach:**
@@ -2746,6 +2747,19 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
         # 🚀 LOCAL SCHEMA EXTRACTION (NO LLM) - Extract metadata before any LLM calls
         # Now that file_path is resolved from session if needed
         print("🔍 Extracting dataset schema locally (no LLM)...")
         schema_info = extract_schema_local(file_path, sample_rows=3)
@@ -3366,7 +3380,9 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
                     messages.append(response_message)
                 # 🚀 PARALLEL EXECUTION: Detect multiple independent tool calls
-                if len(tool_calls) > 1:
                     print(f"🚀 Detected {len(tool_calls)} tool calls - attempting parallel execution")
                     # Extract tool executions with proper weight classification

             "split_data_strategically": split_data_strategically,
             # Advanced Training (3)
             "hyperparameter_tuning": hyperparameter_tuning,
+            # "train_ensemble_models": train_ensemble_models,  # DISABLED - Too resource intensive for scale
             "perform_cross_validation": perform_cross_validation,
             # Business Intelligence (4)
             "perform_cohort_analysis": perform_cohort_analysis,
 - Keywords: "train model", "predict", "classify", "build model", "forecast"
 - User wants: cleaning + feature engineering + model training
 - **ACTION**: Run full ML workflow (steps 1-15 below)
+- **🎯 IMPORTANT**: ALWAYS generate ydata_profiling_report at the END of workflow for comprehensive final analysis
+- **Example**: "Train a model to predict earthquake magnitude" → Full pipeline + ydata_profiling_report at end
 **E. UNCLEAR/AMBIGUOUS REQUESTS** - Intent is not obvious:
 - User says: "analyze", "look at", "check", "review" (without specifics)
 8. encode_categorical(latest, method="auto", output="./outputs/data/encoded.csv")
 9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
 10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_baseline_models(encoded, target_col, task_type="auto")
+10b. **ALWAYS AFTER MODEL TRAINING**: generate_ydata_profiling_report(encoded, output_path="./outputs/reports/ydata_profile.html") - Comprehensive data analysis report
+11. **HYPERPARAMETER TUNING (⚠️ ONLY WHEN EXPLICITLY REQUESTED)**:
+    - ⚠️ **CRITICAL WARNING**: This is EXTREMELY expensive (5-10 minutes) and resource-intensive!
+    - ⚠️ **DO NOT USE UNLESS USER EXPLICITLY ASKS FOR IT**
+    - **ONLY use when user says**: "tune", "optimize", "hyperparameter", "improve model", "best parameters"
+    - **NEVER auto-trigger** based on scores - user must explicitly request it
     - **How**: hyperparameter_tuning(file_path=encoded, target_col=target_col, model_type="xgboost", n_trials=50)
     - **Large datasets (>100K rows)**: n_trials automatically reduced to 20 to prevent timeout
     - **Only tune the WINNING model** (don't waste time on others)
+    - **Map model names**: XGBoost→"xgboost", Ridge→"ridge", Lasso→use Ridge
     - **Note**: Time features should already be extracted in step 7 (create_time_features)
 12. **CROSS-VALIDATION (OPTIONAL - Production Models)**:
     - IF user says "validate", "production", "robust", "deploy" → ALWAYS cross-validate
 - train_baseline_models: Trains multiple models automatically
 - **⭐ execute_python_code**: Write and run custom Python code for ANY task not covered by tools (TRUE AI AGENT capability)
 - **execute_code_from_file**: Run existing Python scripts
+- Advanced: hyperparameter_tuning, perform_eda_analysis, handle_imbalanced_data, perform_feature_scaling, detect_anomalies, detect_and_handle_multicollinearity, auto_feature_engineering, forecast_time_series, explain_predictions, generate_business_insights, perform_topic_modeling, extract_image_features, monitor_model_drift
 - NEW Advanced Insights: analyze_root_cause, detect_trends_and_seasonality, detect_anomalies_advanced, perform_hypothesis_testing, analyze_distribution, perform_segment_analysis
 - NEW Automation: auto_ml_pipeline (zero-config full pipeline), auto_feature_selection
 - NEW Visualization: generate_all_plots, generate_data_quality_plots, generate_eda_plots, generate_model_performance_plots, generate_feature_importance_plot
 **Your Tools (6 modeling-focused):**
 - train_baseline_models, hyperparameter_tuning
+- perform_cross_validation
 - generate_model_report, detect_model_issues
 **Your Approach:**
         # 🚀 LOCAL SCHEMA EXTRACTION (NO LLM) - Extract metadata before any LLM calls
         # Now that file_path is resolved from session if needed
+        # 🛡️ VALIDATION: Ensure we have a valid file path
+        if not file_path or file_path == "":
+            error_msg = "No dataset file provided. Please upload a CSV, Excel, or Parquet file."
+            print(f"❌ {error_msg}")
+            return {
+                "status": "error",
+                "error": error_msg,
+                "summary": "Cannot proceed without a dataset file.",
+                "workflow_history": [],
+                "execution_time": 0.0
+            }
         print("🔍 Extracting dataset schema locally (no LLM)...")
         schema_info = extract_schema_local(file_path, sample_rows=3)
                     messages.append(response_message)
                 # 🚀 PARALLEL EXECUTION: Detect multiple independent tool calls
+                # ⚠️ DISABLED FOR STABILITY - Parallel execution causes race conditions and OOM errors
+                # Re-enable only after implementing proper request isolation per user
+                if len(tool_calls) > 1 and False:  # Disabled with "and False"
                     print(f"🚀 Detected {len(tool_calls)} tool calls - attempting parallel execution")
                     # Extract tool executions with proper weight classification

src/tools/model_training.py CHANGED Viewed

@@ -129,15 +129,15 @@ def train_baseline_models(file_path: str, target_col: str,
     # Train models based on task type
     import sys
-    print(f"\n🚀 Training {6 if task_type == 'classification' else 6} baseline models...", flush=True)
     print(f"   📊 Training set: {len(X_train):,} samples × {X_train.shape[1]} features", flush=True)
     print(f"   📊 Test set: {len(X_test):,} samples", flush=True)
     sys.stdout.flush()
     if task_type == "classification":
         models = {
             "logistic_regression": LogisticRegression(max_iter=1000, random_state=random_state),
-            "random_forest": RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1),
             "xgboost": XGBClassifier(n_estimators=100, random_state=random_state, n_jobs=-1),
             "lightgbm": LGBMClassifier(n_estimators=100, random_state=random_state, n_jobs=-1, verbose=-1),
             "catboost": CatBoostClassifier(iterations=100, random_state=random_state, verbose=0, allow_writing_files=False)
@@ -213,7 +213,6 @@ def train_baseline_models(file_path: str, target_col: str,
         models = {
             "ridge": Ridge(random_state=random_state),
             "lasso": Lasso(random_state=random_state),
-            "random_forest": RandomForestRegressor(n_estimators=100, random_state=random_state, n_jobs=-1),
             "xgboost": XGBRegressor(n_estimators=100, random_state=random_state, n_jobs=-1),
             "lightgbm": LGBMRegressor(n_estimators=100, random_state=random_state, n_jobs=-1, verbose=-1),
             "catboost": CatBoostRegressor(iterations=100, random_state=random_state, verbose=0, allow_writing_files=False)
@@ -316,7 +315,7 @@ def train_baseline_models(file_path: str, target_col: str,
                 "suggested_model": best_model_name,
                 "reason": f"{best_model_name} is optimal for large datasets - fast training and good performance"
             }
-        elif best_model_name == "random_forest":
             # Find next best fast model
             fast_model_scores = {name: results["models"][name]["test_metrics"].get("r2" if task_type == "regression" else "f1", 0)
                                for name in fast_models if name in results["models"]}

     # Train models based on task type
     import sys
+    print(f"\n🚀 Training {5 if task_type == 'classification' else 5} baseline models...", flush=True)
     print(f"   📊 Training set: {len(X_train):,} samples × {X_train.shape[1]} features", flush=True)
     print(f"   📊 Test set: {len(X_test):,} samples", flush=True)
+    print(f"   ⚡ Note: Random Forest excluded to optimize compute resources", flush=True)
     sys.stdout.flush()
     if task_type == "classification":
         models = {
             "logistic_regression": LogisticRegression(max_iter=1000, random_state=random_state),
             "xgboost": XGBClassifier(n_estimators=100, random_state=random_state, n_jobs=-1),
             "lightgbm": LGBMClassifier(n_estimators=100, random_state=random_state, n_jobs=-1, verbose=-1),
             "catboost": CatBoostClassifier(iterations=100, random_state=random_state, verbose=0, allow_writing_files=False)
         models = {
             "ridge": Ridge(random_state=random_state),
             "lasso": Lasso(random_state=random_state),
             "xgboost": XGBRegressor(n_estimators=100, random_state=random_state, n_jobs=-1),
             "lightgbm": LGBMRegressor(n_estimators=100, random_state=random_state, n_jobs=-1, verbose=-1),
             "catboost": CatBoostRegressor(iterations=100, random_state=random_state, verbose=0, allow_writing_files=False)
                 "suggested_model": best_model_name,
                 "reason": f"{best_model_name} is optimal for large datasets - fast training and good performance"
             }
+        elif best_model_name == "random_forest_legacy":  # Disabled for compute optimization
             # Find next best fast model
             fast_model_scores = {name: results["models"][name]["test_metrics"].get("r2" if task_type == "regression" else "f1", 0)
                                for name in fast_models if name in results["models"]}