user-churn / version-history /project_context_extended_v2_2_roadmap.json
VasithaTilakumara
Version 2.0 - added LFS tracking for lsapp.tsv and updated features
53b92fc
{
"project_name": "AI-Driven Churn Prediction and Simulation Dashboard",
"goal": "To create an interactive dashboard that predicts user churn using ML models and simulates 'what-if' business scenarios using a local LLM (Ollama + LangChain).",
"tech_stack": {
"frontend": "Gradio + Plotly",
"ml": "Scikit-learn (Random Forest, Logistic Regression)",
"ai_layer": "LangChain + Ollama (Mistral or Gemma3)",
"language": "Python 3.12",
"data": "App usage logs (session_count, recency, avg_session_duration)"
},
"architecture": {
"data_layer": "Loads and preprocesses raw app logs into feature-engineered dataset",
"model_layer": "Trained ML model saved as random_forest_model.pkl",
"dashboard_layer": "Gradio multi-tab app with Plotly charts",
"ai_layer": "LangChain + Ollama chatbot for natural-language what-if simulation",
"simulation_engine": "Generic 'plan'-based scenario engine (simulate_plan)"
},
"folders": {
"app.py": "Entry point that builds and runs the dashboard",
"tabs/business.py": "Main business insights tab + AI chatbot integration",
"tabs/shared_ai.py": "LangChain LLM logic (parses user text into JSON plan)",
"utils/insights.py": "Plots churn visuals and performance metrics",
"utils/models.py": "Loads models, provides feature importances and metrics",
"utils/modelling.py": "Model training and evaluation logic",
"utils/data_prep.py": "Feature engineering from raw app usage logs",
"utils/scenario_engine_ng.py": "Latest plan-based simulation engine",
"data/data_randomforest.csv": "Feature-engineered dataset",
"models/random_forest_model.pkl": "Saved ML model used for churn prediction"
},
"key_features": [
"Interactive Gradio dashboard with churn insights",
"LLM-powered chatbot for 'what-if' simulations",
"Dynamic plan-based scenario engine (supports scale, shift, set, clip)",
"Model-aware feature validation using feature_names_in_",
"Modular architecture supporting future model or dataset updates"
],
"limitations": [
"LLM may generate invalid JSON or non-numeric expressions",
"Simulation only supports numeric features",
"Single model, single dataset workflow (no multi-model comparison yet)",
"No memory of past scenarios",
"Charts static post-simulation (text output only)"
],
"next_steps": [
"Add interactive chart refresh after each simulation",
"Enable multiple concurrent model versions (Random Forest, XGBoost, etc.)",
"Integrate scenario history comparison",
"Extend ops: normalize, bucket, optimize",
"Add retraining loop for simulated data"
],
"progress_summary": {
"version_2_0": {
"focus": "LLM Integration",
"achievements": [
"Integrated LangChain + Ollama (Mistral) to interpret natural 'what-if' queries.",
"Implemented plan-based scenario simulation via `simulate_plan`.",
"Enabled AI to generate structured JSON outputs to modify dataset features.",
"Created prompt design and Pydantic schema for reliable JSON validation.",
"Built stable baseline churn prediction using Random Forest."
],
"challenges_and_fixes": [
"Resolved 'invalid JSON' outputs via strict schema validation.",
"Fixed cross-file function mismatch (simulate_scenario vs simulate_plan).",
"Debugged PromptTemplate brace parsing errors by switching to plain string prompts.",
"Standardized simulation plan parsing and metadata validation."
]
},
"version_2_1": {
"focus": "Accuracy, Validation, and Robustness",
"achievements": [
"Created `feature_metadata.json` for feature-specific type, min, max validation.",
"Implemented metadata-aware validation and clipping in `scenario_engine_ng.py`.",
"Added Pydantic schema enforcement in `shared_ai.py`.",
"Validated correct LLM \u2192 JSON \u2192 Simulation \u2192 Result flow.",
"Developed full unit test suite (`test_simulate_plan_v2_1.py`) verifying all operation types.",
"Refined Gradio front-end (headers, accordions, layout polish)."
],
"key_learnings": [
"Discovered Gradio ChatInterface only supports one output \u2192 simplified design.",
"Reinforced modular design separation between AI layer (Pydantic) and simulation layer (dicts).",
"Ensured reproducibility with metadata validation and clipping.",
"Verified average churn deltas using controlled plan testing."
],
"validation_status": "Version 2.1 unit tests fully passed \u2705"
},
"version_2_2": {
"focus": "Context-Aware Reasoning & Memory",
"completed": [
"Integrated simulation history logging to `data/sim_history.csv`.",
"Added `utils/history.py` for logging, loading, and clearing simulation history.",
"Displayed simulation history in `performance.py` (formerly business.py).",
"Introduced manual refresh button for table updates (stable solution)."
],
"attempted_but_not_pursued": [
"Attempted real-time event-driven refresh using gr.State (flag).",
"Tried gr.Button-based triggers and .click() propagation (unsupported).",
"Explored ChatInterface multi-output for auto-refresh (not supported).",
"Experimented with LangChain reactivity, eventually replaced with manual refresh."
],
"next_phase": "Implement Mini-RAG knowledge retrieval system."
}
},
"planned_versions": {
"version_2_2_remaining": {
"goals": [
"Implement Mini-RAG Context Layer \u2014 feature_docs + vector store retrieval.",
"Integrate retriever context into LLM prompt before simulation.",
"Add limited simulation history summarization (RAG-based insights).",
"Enable multi-feature plan reasoning ('Increase sessions and reduce recency')."
],
"implementation_outline": [
"1. Create `data/feature_docs/` with plain text files describing each feature.",
"2. Implement `utils/retriever.py` using Chroma + OllamaEmbeddings.",
"3. Modify `shared_ai.py` to retrieve and inject relevant context into prompts.",
"4. Add function `build_history_docs()` to summarize last 50 simulations.",
"5. Extend LLM prompt to include context from both feature_docs and history summaries."
]
},
"version_2_3": {
"focus": "Enhanced Simulation Intelligence",
"planned_features": [
"Add explainability mode ('Why did churn drop?').",
"Support categorical features and encoding simulation.",
"Allow multiple concurrent models (RF, XGBoost, Logistic Regression).",
"Enable dynamic model selection from dashboard."
]
},
"version_2_4": {
"focus": "Optimization & Learning Loop",
"planned_features": [
"Introduce simulation optimizer \u2014 find feature combinations minimizing churn.",
"Implement retraining loop using simulated data to fine-tune model.",
"Add continuous evaluation of model drift."
]
}
},
"project_structure_notes": {
"frontend": [
"Gradio multi-tab interface (Business Insights, Model Performance, Simulation History).",
"Performance tab uses refresh button to load sim_history dynamically.",
"AI assistant integrated via `shared_ai.py` uses local LLM (Ollama)."
],
"backend": [
"Scenario engine supports 'scale', 'shift', 'set', 'clip' operations.",
"Simulation validated by metadata and schema guards.",
"Data and model persistence via CSV and Pickle respectively."
],
"tests_and_validation": [
"Automated test script `test_simulate_plan_v2_1.py` verifies correctness.",
"Console validation confirmed baseline churn predictions consistent."
]
},
"future_roadmap": [
"Version 2.2: Complete Mini-RAG context retrieval (feature_docs + sim_history).",
"Version 2.3: Add explain intent, multi-model, and categorical handling.",
"Version 2.4: Add simulation optimizer and retraining feedback loop.",
"Version 3.0: Deploy dashboard as lightweight local web app with persistent memory."
],
"version_progression": [
{
"version": "2.1",
"focus": "Accuracy & Validation",
"key_deliverables": [
"Strict JSON schema validation (Pydantic)",
"Feature metadata + value clipping",
"Validated simulation engine with automated test suite",
"Ensured reproducibility through metadata-based clipping"
]
},
{
"version": "2.2",
"focus": "Contextual Understanding",
"key_deliverables": [
"Mini-RAG retriever for feature documentation and simulation history",
"Contextual LLM prompting with retrieved feature insights",
"Multi-feature simulation and reasoning capability",
"Model Performance & Simulation History tab integration with refresh workflow"
]
},
{
"version": "2.3",
"focus": "Rich Simulation UX & Explainability",
"key_deliverables": [
"Natural explanations for churn outcomes ('why churn dropped')",
"Scenario comparison and history analytics",
"Categorical feature simulation (encoding awareness)",
"Interactive visual comparison of churn deltas between runs"
]
},
{
"version": "2.4+",
"focus": "Optimization & Continuous Learning",
"key_deliverables": [
"Monte Carlo simulation for uncertainty modeling",
"Automated scenario optimizer to minimize churn",
"Retraining feedback loop using simulated outcomes",
"Continuous evaluation for model drift and data shifts"
]
},
{
"version": "3.0",
"focus": "Deployment & Persistence",
"key_deliverables": [
"Convert dashboard into persistent local web app",
"Enable embedded vector database for RAG memory",
"Support user accounts and saved scenario sessions",
"Integrate feedback from RAG retraining and live data ingestion"
]
}
]
}