ACA050 committed on
Commit
a309487
·
verified ·
1 Parent(s): b425234

Upload 79 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +12 -0
  2. README.md +62 -12
  3. backend/__init__.py +0 -0
  4. backend/__pycache__/__init__.cpython-310.pyc +0 -0
  5. backend/__pycache__/__init__.cpython-313.pyc +0 -0
  6. backend/api/__init__.py +0 -0
  7. backend/api/__pycache__/__init__.cpython-310.pyc +0 -0
  8. backend/api/__pycache__/__init__.cpython-313.pyc +0 -0
  9. backend/api/__pycache__/main.cpython-310.pyc +0 -0
  10. backend/api/__pycache__/main.cpython-313.pyc +0 -0
  11. backend/api/main.py +80 -0
  12. backend/config/__init__.py +0 -0
  13. backend/core/__init__.py +0 -0
  14. backend/core/__pycache__/__init__.cpython-310.pyc +0 -0
  15. backend/core/__pycache__/__init__.cpython-313.pyc +0 -0
  16. backend/core/__pycache__/dataset_analyzer.cpython-310.pyc +0 -0
  17. backend/core/__pycache__/dataset_analyzer.cpython-313.pyc +0 -0
  18. backend/core/__pycache__/deployment_generator.cpython-310.pyc +0 -0
  19. backend/core/__pycache__/deployment_generator.cpython-313.pyc +0 -0
  20. backend/core/__pycache__/explainability.cpython-310.pyc +0 -0
  21. backend/core/__pycache__/explainability.cpython-313.pyc +0 -0
  22. backend/core/__pycache__/model_factory.cpython-310.pyc +0 -0
  23. backend/core/__pycache__/model_factory.cpython-313.pyc +0 -0
  24. backend/core/__pycache__/monitoring.cpython-310.pyc +0 -0
  25. backend/core/__pycache__/monitoring.cpython-313.pyc +0 -0
  26. backend/core/__pycache__/orchestrator.cpython-310.pyc +0 -0
  27. backend/core/__pycache__/orchestrator.cpython-313.pyc +0 -0
  28. backend/core/__pycache__/problem_inference.cpython-310.pyc +0 -0
  29. backend/core/__pycache__/problem_inference.cpython-313.pyc +0 -0
  30. backend/core/__pycache__/strategy_reasoner.cpython-310.pyc +0 -0
  31. backend/core/__pycache__/strategy_reasoner.cpython-313.pyc +0 -0
  32. backend/core/dataset_analyzer.py +99 -0
  33. backend/core/deployment_generator.py +30 -0
  34. backend/core/explainability.py +33 -0
  35. backend/core/model_factory.py +40 -0
  36. backend/core/monitoring.py +13 -0
  37. backend/core/orchestrator.py +88 -0
  38. backend/core/problem_inference.py +13 -0
  39. backend/core/strategy_reasoner.py +68 -0
  40. backend/experiments/__init__.py +0 -0
  41. backend/experiments/__pycache__/benchmark_runner.cpython-313.pyc +0 -0
  42. backend/experiments/benchmark_runner.py +32 -0
  43. backend/experiments/run_benchmarks.py +32 -0
  44. backend/nlp/__pycache__/evaluators.cpython-310.pyc +0 -0
  45. backend/nlp/__pycache__/evaluators.cpython-313.pyc +0 -0
  46. backend/nlp/__pycache__/preprocess.cpython-310.pyc +0 -0
  47. backend/nlp/__pycache__/preprocess.cpython-313.pyc +0 -0
  48. backend/nlp/__pycache__/trainers.cpython-310.pyc +0 -0
  49. backend/nlp/__pycache__/trainers.cpython-313.pyc +0 -0
  50. backend/nlp/embeddings.py +12 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim

WORKDIR /app

# Install dependencies first so Docker layer caching skips the reinstall
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860

CMD ["uvicorn", "backend.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,62 @@
1
- ---
2
- title: ModelSmith AI
3
- emoji: 🐨
4
- colorFrom: pink
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- short_description: Intelligent system that designs, explains and deploys ML sol
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ModelSmith AI
2
+
3
+ An intelligent ML platform that automates tabular classification and regression tasks. It analyzes datasets, recommends optimal strategies, trains models, and provides explanations.
4
+
5
+ ## Features
6
+
7
+ - **Dataset Analysis**: Automatic detection of data types, missing values, and potential issues
8
+ - **Strategy Reasoning**: Intelligent model selection based on dataset characteristics
9
+ - **Automated Training**: End-to-end model training with preprocessing pipelines
10
+ - **Explainability**: SHAP-based feature importance explanations
11
+ - **FastAPI Backend**: RESTful API for seamless integration
12
+
13
+ ## Supported Scope
14
+
15
+ - **Task**: Tabular classification and regression
16
+ - **Input**: CSV files with ≥1200 rows
17
+ - **Target**: Binary or multiclass classification, regression
18
+ - **Features**: At least 2 usable features after preprocessing
19
+
20
+ ## API Endpoints
21
+
22
+ - `POST /analyze`: Analyze dataset and get strategy recommendations
23
+ - `POST /train`: Train a model on the dataset
24
+ - `POST /explain`: Get model explanations and feature importance
25
+ - `POST /predict`: Make predictions with trained model
26
+ - `GET /health`: Health check
27
+
28
+ ## Deployment
29
+
30
+ This project is designed for deployment on Hugging Face Spaces using Docker.
31
+
32
+ ### Files for Deployment
33
+
34
+ - `Dockerfile`
35
+ - `requirements.txt`
36
+ - `backend/` (entire directory)
37
+
38
+ ### Running Locally
39
+
40
+ ```bash
41
+ pip install -r requirements.txt
42
+ uvicorn backend.api.main:app --host 0.0.0.0 --port 7860
43
+ ```
44
+
45
+ ## Limitations
46
+
47
+ - NLP functionality is disabled
48
+ - Requires datasets with ≥1200 rows
49
+ - CPU-only, no GPU support
50
+ - Stateless API (models saved temporarily)
51
+
52
+ ## Architecture
53
+
54
+ - **Orchestrator**: Main workflow coordinator
55
+ - **Dataset Analyzer**: Data profiling and preprocessing
56
+ - **Strategy Reasoner**: Model selection logic
57
+ - **Model Factory**: Training and evaluation
58
+ - **Explainability Engine**: SHAP explanations
59
+
60
+ ## License
61
+
62
+ MIT License
backend/__init__.py ADDED
File without changes
backend/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (166 Bytes). View file
 
backend/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (142 Bytes). View file
 
backend/api/__init__.py ADDED
File without changes
backend/api/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (170 Bytes). View file
 
backend/api/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (146 Bytes). View file
 
backend/api/__pycache__/main.cpython-310.pyc ADDED
Binary file (1.82 kB). View file
 
backend/api/__pycache__/main.cpython-313.pyc ADDED
Binary file (4.43 kB). View file
 
backend/api/main.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI surface for the ModelSmith orchestrator.

Each endpoint accepts a CSV upload plus a target column name and delegates
all analysis/training work to a single module-level Orchestrator instance.
"""
from fastapi import FastAPI, UploadFile, File, HTTPException
import pandas as pd
from backend.core.orchestrator import Orchestrator

app = FastAPI()
# Shared orchestrator; endpoints are stateless apart from the model
# artifacts the orchestrator writes under exports/.
orchestrator = Orchestrator()

@app.post("/analyze")
async def analyze_dataset(file: UploadFile = File(...), target_column: str = "target"):
    """Profile the uploaded CSV and return strategy recommendations (no training)."""
    try:
        df = pd.read_csv(file.file)
        result = orchestrator.run(df, target_column)

        # Format response for frontend
        dataset_info = result.get("dataset_info", {})
        strategy = result.get("strategy", {})

        response = {
            "columns": list(df.columns),
            # NOTE(review): the analyzer does not appear to emit "data_types"
            # or "risks" keys, so these likely always hit the defaults —
            # confirm against DatasetAnalyzer's output.
            "dataTypes": dataset_info.get("data_types", {}),
            "risks": dataset_info.get("risks", []),
            "problemType": result.get("problem_type"),
            "confidence": strategy.get("confidence", 0),
            "strategy": strategy
        }
        return response
    except Exception as e:
        # Any failure (bad CSV, validation, analysis) maps to HTTP 400.
        raise HTTPException(status_code=400, detail=str(e))

@app.post("/train")
async def train_model(file: UploadFile = File(...), target_column: str = "target"):
    """Run the full pipeline with training enabled and return metrics."""
    try:
        df = pd.read_csv(file.file)
        result = orchestrator.run(df, target_column, train=True)

        # Ensure strategy is included in the response
        strategy = result.get("strategy", {})
        response = {
            "strategy": strategy,
            "metrics": result.get("metrics", {}),
            # NOTE(review): these fallbacks are placeholder literals returned
            # verbatim when the orchestrator omits the keys — confirm the
            # frontend tolerates them.
            "model_path": result.get("model_path", "/path/to/model.pkl"),
            "training_time": result.get("training_time", 0),
            "model_id": result.get("model_id", "trained_model_123")
        }
        return response
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

@app.post("/explain")
async def explain_model(file: UploadFile = File(...), target_column: str = "target"):
    """Retrain on the uploaded CSV and return explanation artifacts.

    NOTE(review): this retrains from scratch on every call rather than
    reusing the model produced by /train.
    """
    try:
        df = pd.read_csv(file.file)
        result = orchestrator.run(df, target_column, train=True)
        return {
            "strategy_explanation": result.get("strategy_explanation"),
            "metrics": result.get("metrics", {}),
            "feature_importance": result.get("feature_importance", [])
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

@app.post("/predict")
async def predict(data: dict):
    """Predict with the most recently trained model (fixed on-disk path)."""
    try:
        # Load the trained model
        model = orchestrator.model_io.load("exports/models/trained_model.pkl")
        # Prepare data for prediction
        df = pd.DataFrame([data])
        preds = model.predict(df)
        return {"prediction": preds.tolist()}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

@app.get("/health")
def health():
    """Liveness probe."""
    return {"status": "ok"}
backend/config/__init__.py ADDED
File without changes
backend/core/__init__.py ADDED
File without changes
backend/core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (171 Bytes). View file
 
backend/core/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (147 Bytes). View file
 
backend/core/__pycache__/dataset_analyzer.cpython-310.pyc ADDED
Binary file (3.39 kB). View file
 
backend/core/__pycache__/dataset_analyzer.cpython-313.pyc ADDED
Binary file (6.55 kB). View file
 
backend/core/__pycache__/deployment_generator.cpython-310.pyc ADDED
Binary file (1.13 kB). View file
 
backend/core/__pycache__/deployment_generator.cpython-313.pyc ADDED
Binary file (1.21 kB). View file
 
backend/core/__pycache__/explainability.cpython-310.pyc ADDED
Binary file (912 Bytes). View file
 
backend/core/__pycache__/explainability.cpython-313.pyc ADDED
Binary file (1.72 kB). View file
 
backend/core/__pycache__/model_factory.cpython-310.pyc ADDED
Binary file (1.81 kB). View file
 
backend/core/__pycache__/model_factory.cpython-313.pyc ADDED
Binary file (2.07 kB). View file
 
backend/core/__pycache__/monitoring.cpython-310.pyc ADDED
Binary file (645 Bytes). View file
 
backend/core/__pycache__/monitoring.cpython-313.pyc ADDED
Binary file (815 Bytes). View file
 
backend/core/__pycache__/orchestrator.cpython-310.pyc ADDED
Binary file (2.86 kB). View file
 
backend/core/__pycache__/orchestrator.cpython-313.pyc ADDED
Binary file (4.74 kB). View file
 
backend/core/__pycache__/problem_inference.cpython-310.pyc ADDED
Binary file (611 Bytes). View file
 
backend/core/__pycache__/problem_inference.cpython-313.pyc ADDED
Binary file (786 Bytes). View file
 
backend/core/__pycache__/strategy_reasoner.cpython-310.pyc ADDED
Binary file (2.04 kB). View file
 
backend/core/__pycache__/strategy_reasoner.cpython-313.pyc ADDED
Binary file (3.02 kB). View file
 
backend/core/dataset_analyzer.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from backend.utils.logger import logger
4
+
def convert_numpy_types(obj):
    """Recursively coerce numpy scalars/arrays into plain Python values.

    Makes analysis results JSON-serializable before they reach the API
    layer. Non-numpy values pass through unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    if isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, dict):
        return {k: convert_numpy_types(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(v) for v in obj]
    return obj
21
+
class DatasetAnalyzer:
    """Cleans and profiles a dataframe for downstream strategy reasoning."""

    def analyze(self, df: pd.DataFrame, target_column: str = None):
        """Return a JSON-serializable profile of *df*.

        Cleaning steps (drop/drop_duplicates return new frames, so the
        caller's dataframe is not mutated): drop all-null columns,
        duplicate rows, and constant columns.

        Raises:
            ValueError: if fewer than 2 usable non-target features remain.
        """
        logger.info("Starting dataset analysis...")

        # Drop columns that are entirely null — they carry no signal.
        null_columns = df.columns[df.isnull().all()]
        if len(null_columns) > 0:
            logger.warning(f"Removing all-null columns: {list(null_columns)}")
            df = df.drop(columns=null_columns)

        # Drop exact duplicate rows.
        duplicate_rows = df.duplicated().sum()
        if duplicate_rows > 0:
            logger.warning(f"Removing {duplicate_rows} duplicate rows")
            df = df.drop_duplicates()

        # Drop constant (single-unique-value) columns.
        constant_columns = [col for col in df.columns if df[col].nunique() == 1]
        if len(constant_columns) > 0:
            logger.warning(f"Removing constant columns: {constant_columns}")
            df = df.drop(columns=constant_columns)

        # Require at least 2 usable features besides the target.
        usable_features = [col for col in df.columns if col != target_column]
        if len(usable_features) < 2:
            raise ValueError(f"Insufficient features: only {len(usable_features)} usable features after preprocessing, need at least 2")

        info = {}
        info["num_rows"] = df.shape[0]
        info["num_columns"] = df.shape[1]
        info["missing_ratio"] = df.isnull().mean().mean()
        info["row_count"] = df.shape[0]
        # Heuristic thresholds consumed by StrategyReasoner.
        info["high_dimensional"] = bool(df.shape[1] > 50)
        info["small_data"] = bool(df.shape[0] < 1200)
        info["sparse_data"] = bool(df.isnull().mean().mean() > 0.4)
        all_numeric_cols = df.select_dtypes(include="number").columns.tolist()
        all_categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
        info["numeric_cols"] = [col for col in all_numeric_cols if col != target_column]
        info["categorical_cols"] = [col for col in all_categorical_cols if col != target_column]

        if len(info["numeric_cols"]) + len(info["categorical_cols"]) < 2:
            raise ValueError("Dataset must have at least 2 usable features after preprocessing")

        # Unique-value count per remaining column (target included).
        cardinality = {col: df[col].nunique() for col in df.columns}
        info["cardinality"] = cardinality

        # Target-specific checks
        if target_column and target_column in df.columns:
            target = df[target_column]
            unique_vals = target.nunique()
            # Fix: use is_numeric_dtype instead of comparing dtype names to
            # ['int64', 'float64'], which misclassified int32/float32
            # numeric targets as classification.
            if pd.api.types.is_numeric_dtype(target) and unique_vals > 10:
                info["target_type"] = "regression"
                info["class_distribution"] = None
                info["imbalance"] = None
            else:
                info["target_type"] = "classification"
                value_counts = target.value_counts(normalize=True)
                info["class_distribution"] = value_counts.to_dict()
                # Flag imbalance when one class dominates (> 80%).
                info["imbalance"] = bool(value_counts.max() > 0.8)
        else:
            info["target_type"] = None
            info["class_distribution"] = None
            info["imbalance"] = None

        # NLP detection heuristic: a long average string length suggests the
        # column holds free text. (Removed an unused `avg_text_len` local.)
        text_columns = []
        for col in info["categorical_cols"]:
            if df[col].astype(str).str.len().mean() > 30:
                text_columns.append(col)
        info["text_columns"] = text_columns
        info["possible_nlp"] = len(text_columns) > 0

        return convert_numpy_types(info)
backend/core/deployment_generator.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class DeploymentGenerator:
2
+ def generate_fastapi_app(self, model_path):
3
+ template = f'''
4
+ from fastapi import FastAPI
5
+ import joblib
6
+ import pandas as pd
7
+
8
+ app = FastAPI()
9
+ model = joblib.load("{model_path}")
10
+
11
+ @app.post("/predict")
12
+ async def predict(data: dict):
13
+ df = pd.DataFrame([data])
14
+ preds = model.predict(df)
15
+ return {{"prediction": preds.tolist()}}
16
+ '''
17
+ return template
18
+
19
+ def generate_dockerfile(self):
20
+ return '''
21
+ FROM python:3.9
22
+ WORKDIR /app
23
+ COPY . /app
24
+ RUN pip install fastapi uvicorn pandas scikit-learn joblib
25
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
26
+ '''
27
+
28
+
29
+
30
+
backend/core/explainability.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shap
2
+ import numpy as np
3
+
class ExplainabilityEngine:
    """Computes global feature-importance scores via SHAP."""

    def explain_tabular(self, model_pipeline, X_sample):
        """Return mean |SHAP value| per transformed feature.

        Expects *model_pipeline* to be a fitted sklearn-style Pipeline with
        "preprocessor" and "model" steps.

        Raises:
            ValueError: if the sample is empty (before or after
                preprocessing) or SHAP yields no output.
        """
        if X_sample.empty:
            raise ValueError("Sample data is empty, cannot compute explanations")

        # Pull the fitted stages out of the pipeline.
        steps = model_pipeline.named_steps
        transformed = steps["preprocessor"].transform(X_sample)

        if transformed.shape[0] == 0:
            raise ValueError("Transformed sample data is empty after preprocessing")

        explainer = shap.Explainer(steps["model"], transformed)
        shap_values = explainer(transformed, check_additivity=False)

        if shap_values is None or shap_values.values is None:
            raise ValueError("SHAP computation failed")

        # Global importance = mean absolute SHAP value across the sample.
        importance = np.abs(shap_values.values).mean(axis=0).tolist()

        if len(importance) == 0:
            raise ValueError("No feature importance computed")

        return importance
backend/core/model_factory.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from sklearn.model_selection import train_test_split
3
+ from ..tabular.pipelines import build_preprocessing_pipeline
4
+ from ..tabular.trainers import train_model
5
+ from ..tabular.evaluators import evaluate_model
6
+ from ..nlp.trainers import TextClassifier
7
+ from ..nlp.evaluators import evaluate_nlp_model
8
+ from ..utils.model_io import ModelIO
9
+
class ModelFactory:
    """Builds, trains, evaluates, and persists models for tabular problems."""

    def __init__(self):
        # Handles model (de)serialization to disk.
        self.model_io = ModelIO()

    def build_and_train(self, df, target_column, dataset_info, problem_type, strategy):
        """Train a model on *df* and return ``(fitted_model, metrics)``.

        Also persists the model to exports/models/trained_model.pkl.

        Raises:
            ValueError: if the dataset is flagged small (< 1200 rows) or the
                problem type is "nlp" (disabled in this build).
        """
        if dataset_info["small_data"]:
            raise ValueError("Dataset is too small for training. Minimum 1200 rows required.")

        if problem_type == "nlp":
            raise ValueError("NLP functionality is not supported in this version.")
        else:
            # Tabular
            X = df.drop(columns=[target_column])
            y = df[target_column]

            # Fixed 80/20 split, seeded for reproducibility.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            pipeline = build_preprocessing_pipeline(dataset_info["numeric_cols"], dataset_info["categorical_cols"])
            # NOTE(review): the preprocessing pipeline is fitted here and then
            # handed to train_model, which presumably fits the final estimator
            # — confirm the preprocessor is not refit downstream.
            pipeline.fit(X_train, y_train)

            model = train_model(pipeline, X_train, y_train, problem_type, strategy)
            metrics = evaluate_model(model, X_test, y_test, problem_type)

            # Save model
            self.model_io.save(model, "exports/models/trained_model.pkl")

            return model, metrics
backend/core/monitoring.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
class MonitoringEngine:
    """Simple drift detector comparing per-feature summary statistics."""

    def detect_drift(self, train_stats, new_data_stats, threshold=0.2):
        """Return ``{feature: True}`` for features whose statistic moved more
        than *threshold* between training and new data.

        Features missing from *new_data_stats* fall back to their training
        value and therefore never flag as drifted.
        """
        flagged = {}
        for name, baseline in train_stats.items():
            current = new_data_stats.get(name, baseline)
            if abs(baseline - current) > threshold:
                flagged[name] = True
        return flagged
backend/core/orchestrator.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .dataset_analyzer import DatasetAnalyzer
2
+ from .problem_inference import ProblemInference
3
+ from .strategy_reasoner import StrategyReasoner
4
+ from .model_factory import ModelFactory
5
+ from .explainability import ExplainabilityEngine
6
+ from .deployment_generator import DeploymentGenerator
7
+ from .monitoring import MonitoringEngine
8
+ from ..utils.logger import logger
9
+ from ..utils.validators import DataValidator
10
+ from ..utils.model_io import ModelIO
11
+ import json
12
+ import os
13
+
class Orchestrator:
    """Coordinates the pipeline: validate -> analyze -> infer -> strategize,
    and optionally -> train -> explain -> export deployment artifacts."""

    def __init__(self):
        self.validator = DataValidator()
        self.analyzer = DatasetAnalyzer()
        self.inferencer = ProblemInference()
        self.reasoner = StrategyReasoner()
        self.model_factory = ModelFactory()
        self.explainer = ExplainabilityEngine()
        self.deployer = DeploymentGenerator()
        self.monitor = MonitoringEngine()
        self.model_io = ModelIO()

    def run(self, df, target_column, train=False):
        """Run the pipeline on *df*.

        Args:
            df: input dataframe.
            target_column: name of the target column; also used to name the
                strategy log file under experiments/logs/.
            train: when True, additionally trains a model, computes SHAP
                feature importance, and writes model + deployment artifacts
                under exports/.

        Returns:
            A dict with dataset_info, problem_type, strategy,
            strategy_tradeoffs, and (when train=True) metrics,
            strategy_explanation, and feature_importance.
        """
        self.validator.validate_dataframe(df, target_column)
        logger.info("Validation passed")
        dataset_info = self.analyzer.analyze(df, target_column)
        problem_type = self.inferencer.infer(dataset_info, target_column)
        strategy = self.reasoner.decide(dataset_info, problem_type)

        tradeoff_explanation = self.reasoner.explain_tradeoffs(strategy)

        # Log strategy behavior
        log_data = {
            "dataset_characteristics": dataset_info,
            "chosen_model_family": strategy.get("model_family"),
            "detected_risks": strategy.get("risks", []),
            "confidence_score": strategy.get("confidence", 0)
        }
        os.makedirs("experiments/logs", exist_ok=True)
        # default=str keeps the dump from failing on non-JSON values.
        with open(f"experiments/logs/{target_column}_strategy.json", "w") as f:
            json.dump(log_data, f, indent=4, default=str)

        response = {
            "dataset_info": dataset_info,
            "problem_type": problem_type,
            "strategy": strategy,
            "strategy_tradeoffs": tradeoff_explanation
        }

        if problem_type == "nlp":
            response["nlp_mode"] = "activated"

        if train:
            model, metrics = self.model_factory.build_and_train(
                df, target_column, dataset_info, problem_type, strategy
            )
            response["metrics"] = metrics

            explanation = self.reasoner.explain_strategy(strategy)
            response["strategy_explanation"] = explanation

            X_sample = df.drop(columns=[target_column]).head(100)  # Sample for SHAP
            feature_importance = self.explainer.explain_tabular(model, X_sample)
            response["feature_importance"] = feature_importance

            # Save the trained model
            # NOTE(review): ModelFactory.build_and_train already saves to this
            # same path — the second save looks redundant but harmless.
            os.makedirs("exports/models", exist_ok=True)
            os.makedirs("exports/deployment", exist_ok=True)
            model_path = "exports/models/trained_model.pkl"
            self.model_io.save(model, model_path)

            # Generate deployment artifacts
            fastapi_app = self.deployer.generate_fastapi_app(model_path)
            dockerfile = self.deployer.generate_dockerfile()

            with open("exports/deployment/main.py", "w") as f:
                f.write(fastapi_app)
            with open("exports/deployment/Dockerfile", "w") as f:
                f.write(dockerfile)

        return response
backend/core/problem_inference.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
class ProblemInference:
    """Maps dataset characteristics to a high-level problem type."""

    def infer(self, dataset_info, target_column):
        """Return "nlp", "classification", "regression", or "unknown"."""
        # Detected free-text columns take precedence over everything else.
        if dataset_info.get("possible_nlp"):
            return "nlp"

        if not target_column:
            return "unknown"

        # The analyzer records a class distribution only for classification
        # targets, so its presence distinguishes the two tabular cases.
        has_classes = bool(dataset_info.get("class_distribution"))
        return "classification" if has_classes else "regression"
backend/core/strategy_reasoner.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
class StrategyReasoner:
    """Chooses a model family from dataset risks and explains the choice."""

    def decide(self, dataset_info, problem_type):
        """Return a strategy dict for *problem_type* given *dataset_info*.

        Returns:
            dict with keys "model_family", "reason", "risks" (list of
            detected risk labels), and "confidence" (1.0 minus a risk
            penalty, floored at 0.1).
        """
        risks = []
        score = 0.0

        if dataset_info.get("small_data"):
            risks.append("small_dataset")
            score += 0.1

        if dataset_info.get("high_dimensional"):
            risks.append("high_dimensionality")
            score += 0.1

        if dataset_info.get("imbalance"):
            risks.append("class_imbalance")
            score += 0.2

        if dataset_info.get("sparse_data"):
            risks.append("high_missingness")
            score += 0.2

        # Fix: the original left model_family/reason unbound for any
        # problem_type outside the three handled branches (e.g. the
        # "unknown" that ProblemInference can return), raising NameError.
        # Provide a safe default instead.
        model_family = "baseline"
        reason = "No specific strategy for this problem type; defaulting to a simple baseline"

        if problem_type == "classification":
            if "small_dataset" in risks:
                model_family = "tree_ensemble"
                reason = "Small datasets benefit from simpler models"
            elif "high_dimensionality" in risks:
                model_family = "tree_ensemble"
                reason = "Tree ensembles handle high-dimensional data better"
            else:
                model_family = "tree_ensemble"
                reason = "Tree ensembles handle complexity well"

        elif problem_type == "regression":
            if "high_dimensionality" in risks:
                model_family = "tree_ensemble"
                reason = "Tree ensembles handle high-dimensional data better"
            else:
                model_family = "linear_or_tree"
                reason = "Balances interpretability and accuracy"

        elif problem_type == "nlp":
            model_family = "transformer"
            reason = "Transformers best capture language semantics"

        strategy = {}
        strategy["model_family"] = model_family
        strategy["reason"] = reason
        strategy["risks"] = risks
        # Confidence shrinks with accumulated risk; capped so it never
        # drops below 0.1.
        strategy["confidence"] = round(1 - min(score, 0.9), 2)

        return strategy

    def explain_strategy(self, strategy):
        """Return a one/two-sentence explanation of the chosen strategy."""
        explanation = f"Selected {strategy['model_family']} models because: {strategy['reason']}."
        if strategy.get("risks"):
            explanation += f" Identified risks: {', '.join(strategy['risks'])}."
        return explanation

    def explain_tradeoffs(self, strategy):
        """Like explain_strategy, but also reports the confidence score."""
        explanation = f"Chose {strategy['model_family']} due to: {strategy['reason']}."
        if strategy.get("risks"):
            explanation += f" Risks detected: {', '.join(strategy['risks'])}."
        explanation += f" Confidence score: {strategy.get('confidence')}."
        return explanation
backend/experiments/__init__.py ADDED
File without changes
backend/experiments/__pycache__/benchmark_runner.cpython-313.pyc ADDED
Binary file (1.47 kB). View file
 
backend/experiments/benchmark_runner.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
class BenchmarkRunner:
    """Runs an orchestrator across several datasets, recording timing and errors."""

    def run(self, orchestrator, datasets):
        """Benchmark *orchestrator* on *datasets* (``{name: (df, target)}``).

        Returns one result dict per dataset with keys "dataset", "strategy",
        "metrics", "time" (seconds, 2 dp), and "error". Failures are captured
        in the "error" field rather than aborting the sweep.
        """
        results = []
        for name, (df, target) in datasets.items():
            started = time.time()
            strategy = metrics = error = None
            try:
                output = orchestrator.run(df, target, train=True)
                strategy = output.get("strategy")
                metrics = output.get("metrics")
            except Exception as exc:
                error = str(exc)
            elapsed = round(time.time() - started, 2)
            results.append({
                "dataset": name,
                "strategy": strategy,
                "metrics": metrics,
                "time": elapsed,
                "error": error,
            })
        return results
backend/experiments/run_benchmarks.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Benchmark driver: runs the Orchestrator over several real-world CSVs."""
import pandas as pd
# NOTE(review): this flat import only resolves when the script is executed
# directly (the script's own directory is then on sys.path) — confirm it is
# never imported as a package module.
from benchmark_runner import BenchmarkRunner
import sys
import os
# Make the repository root importable so backend.* absolute imports resolve.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from backend.core.orchestrator import Orchestrator

# Load datasets
# Each entry maps a benchmark name to (dataframe, target column name).
datasets = {
    "titanic": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'titanic.csv')), "Survived"),
    "credit_default": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'credit_default.csv')), "default.payment.next.month"),
    "house_prices": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'house_prices.csv')), "Price"),
    "telecom_churn": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'telecom_churn.csv')), "Churn"),
    "news_classification": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'news_classification.csv')), "label"),
}

orchestrator = Orchestrator()
runner = BenchmarkRunner()
results = runner.run(orchestrator, datasets)

print("Benchmark Results:")
for result in results:
    print(result)

# Save results to file
# NOTE(review): assumes an "experiments/" directory already exists in the
# CWD and that all metric values are JSON-serializable — confirm both.
with open("experiments/benchmark_results.json", "w") as f:
    import json
    json.dump(results, f, indent=4)
backend/nlp/__pycache__/evaluators.cpython-310.pyc ADDED
Binary file (747 Bytes). View file
 
backend/nlp/__pycache__/evaluators.cpython-313.pyc ADDED
Binary file (1.1 kB). View file
 
backend/nlp/__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (584 Bytes). View file
 
backend/nlp/__pycache__/preprocess.cpython-313.pyc ADDED
Binary file (816 Bytes). View file
 
backend/nlp/__pycache__/trainers.cpython-310.pyc ADDED
Binary file (2.56 kB). View file
 
backend/nlp/__pycache__/trainers.cpython-313.pyc ADDED
Binary file (3.12 kB). View file
 
backend/nlp/embeddings.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+
class EmbeddingEngine:
    """Thin wrapper around a SentenceTransformer model for text embeddings."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Instantiating SentenceTransformer downloads the model on first
        # use — requires network access in that case.
        self.model = SentenceTransformer(model_name)

    def encode(self, texts):
        # Returns whatever SentenceTransformer.encode produces — presumably
        # a numpy array of embeddings; confirm against the library version.
        return self.model.encode(texts)