Spaces:
Sleeping
Sleeping
Upload 79 files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +12 -0
- README.md +62 -12
- backend/__init__.py +0 -0
- backend/__pycache__/__init__.cpython-310.pyc +0 -0
- backend/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/api/__init__.py +0 -0
- backend/api/__pycache__/__init__.cpython-310.pyc +0 -0
- backend/api/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/api/__pycache__/main.cpython-310.pyc +0 -0
- backend/api/__pycache__/main.cpython-313.pyc +0 -0
- backend/api/main.py +80 -0
- backend/config/__init__.py +0 -0
- backend/core/__init__.py +0 -0
- backend/core/__pycache__/__init__.cpython-310.pyc +0 -0
- backend/core/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/core/__pycache__/dataset_analyzer.cpython-310.pyc +0 -0
- backend/core/__pycache__/dataset_analyzer.cpython-313.pyc +0 -0
- backend/core/__pycache__/deployment_generator.cpython-310.pyc +0 -0
- backend/core/__pycache__/deployment_generator.cpython-313.pyc +0 -0
- backend/core/__pycache__/explainability.cpython-310.pyc +0 -0
- backend/core/__pycache__/explainability.cpython-313.pyc +0 -0
- backend/core/__pycache__/model_factory.cpython-310.pyc +0 -0
- backend/core/__pycache__/model_factory.cpython-313.pyc +0 -0
- backend/core/__pycache__/monitoring.cpython-310.pyc +0 -0
- backend/core/__pycache__/monitoring.cpython-313.pyc +0 -0
- backend/core/__pycache__/orchestrator.cpython-310.pyc +0 -0
- backend/core/__pycache__/orchestrator.cpython-313.pyc +0 -0
- backend/core/__pycache__/problem_inference.cpython-310.pyc +0 -0
- backend/core/__pycache__/problem_inference.cpython-313.pyc +0 -0
- backend/core/__pycache__/strategy_reasoner.cpython-310.pyc +0 -0
- backend/core/__pycache__/strategy_reasoner.cpython-313.pyc +0 -0
- backend/core/dataset_analyzer.py +99 -0
- backend/core/deployment_generator.py +30 -0
- backend/core/explainability.py +33 -0
- backend/core/model_factory.py +40 -0
- backend/core/monitoring.py +13 -0
- backend/core/orchestrator.py +88 -0
- backend/core/problem_inference.py +13 -0
- backend/core/strategy_reasoner.py +68 -0
- backend/experiments/__init__.py +0 -0
- backend/experiments/__pycache__/benchmark_runner.cpython-313.pyc +0 -0
- backend/experiments/benchmark_runner.py +32 -0
- backend/experiments/run_benchmarks.py +32 -0
- backend/nlp/__pycache__/evaluators.cpython-310.pyc +0 -0
- backend/nlp/__pycache__/evaluators.cpython-313.pyc +0 -0
- backend/nlp/__pycache__/preprocess.cpython-310.pyc +0 -0
- backend/nlp/__pycache__/preprocess.cpython-313.pyc +0 -0
- backend/nlp/__pycache__/trainers.cpython-310.pyc +0 -0
- backend/nlp/__pycache__/trainers.cpython-313.pyc +0 -0
- backend/nlp/embeddings.py +12 -0
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image pinned to the Python version the project targets (3.10).
FROM python:3.10-slim

# All subsequent paths are relative to /app.
WORKDIR /app

# Install dependencies first so this layer is cached when only code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source.
COPY . .

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860

# Serve the FastAPI backend with uvicorn.
CMD ["uvicorn", "backend.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,12 +1,62 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ModelSmith AI
|
| 2 |
+
|
| 3 |
+
An intelligent ML platform that automates tabular classification and regression tasks. It analyzes datasets, recommends optimal strategies, trains models, and provides explanations.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Dataset Analysis**: Automatic detection of data types, missing values, and potential issues
|
| 8 |
+
- **Strategy Reasoning**: Intelligent model selection based on dataset characteristics
|
| 9 |
+
- **Automated Training**: End-to-end model training with preprocessing pipelines
|
| 10 |
+
- **Explainability**: SHAP-based feature importance explanations
|
| 11 |
+
- **FastAPI Backend**: RESTful API for seamless integration
|
| 12 |
+
|
| 13 |
+
## Supported Scope
|
| 14 |
+
|
| 15 |
+
- **Task**: Tabular classification and regression
|
| 16 |
+
- **Input**: CSV files with ≥1200 rows
|
| 17 |
+
- **Target**: Binary or multiclass classification, regression
|
| 18 |
+
- **Features**: At least 2 usable features after preprocessing
|
| 19 |
+
|
| 20 |
+
## API Endpoints
|
| 21 |
+
|
| 22 |
+
- `POST /analyze`: Analyze dataset and get strategy recommendations
|
| 23 |
+
- `POST /train`: Train a model on the dataset
|
| 24 |
+
- `POST /explain`: Get model explanations and feature importance
|
| 25 |
+
- `POST /predict`: Make predictions with trained model
|
| 26 |
+
- `GET /health`: Health check
|
| 27 |
+
|
| 28 |
+
## Deployment
|
| 29 |
+
|
| 30 |
+
This project is designed for deployment on Hugging Face Spaces using Docker.
|
| 31 |
+
|
| 32 |
+
### Files for Deployment
|
| 33 |
+
|
| 34 |
+
- `Dockerfile`
|
| 35 |
+
- `requirements.txt`
|
| 36 |
+
- `backend/` (entire directory)
|
| 37 |
+
|
| 38 |
+
### Running Locally
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
pip install -r requirements.txt
|
| 42 |
+
uvicorn backend.api.main:app --host 0.0.0.0 --port 7860
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
## Limitations
|
| 46 |
+
|
| 47 |
+
- NLP functionality is disabled
|
| 48 |
+
- Requires datasets with ≥1200 rows
|
| 49 |
+
- CPU-only, no GPU support
|
| 50 |
+
- Stateless API (models saved temporarily)
|
| 51 |
+
|
| 52 |
+
## Architecture
|
| 53 |
+
|
| 54 |
+
- **Orchestrator**: Main workflow coordinator
|
| 55 |
+
- **Dataset Analyzer**: Data profiling and preprocessing
|
| 56 |
+
- **Strategy Reasoner**: Model selection logic
|
| 57 |
+
- **Model Factory**: Training and evaluation
|
| 58 |
+
- **Explainability Engine**: SHAP explanations
|
| 59 |
+
|
| 60 |
+
## License
|
| 61 |
+
|
| 62 |
+
MIT License
|
backend/__init__.py
ADDED
|
File without changes
|
backend/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (166 Bytes). View file
|
|
|
backend/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (142 Bytes). View file
|
|
|
backend/api/__init__.py
ADDED
|
File without changes
|
backend/api/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (170 Bytes). View file
|
|
|
backend/api/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (146 Bytes). View file
|
|
|
backend/api/__pycache__/main.cpython-310.pyc
ADDED
|
Binary file (1.82 kB). View file
|
|
|
backend/api/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (4.43 kB). View file
|
|
|
backend/api/main.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from backend.core.orchestrator import Orchestrator
|
| 4 |
+
|
| 5 |
+
app = FastAPI()
|
| 6 |
+
orchestrator = Orchestrator()
|
| 7 |
+
|
| 8 |
+
@app.post("/analyze")
async def analyze_dataset(file: UploadFile = File(...), target_column: str = "target"):
    """Profile an uploaded CSV and return the recommended strategy (no training)."""
    try:
        frame = pd.read_csv(file.file)
        analysis = orchestrator.run(frame, target_column)

        # Reshape the orchestrator output into the fields the frontend expects.
        info = analysis.get("dataset_info", {})
        chosen = analysis.get("strategy", {})

        return {
            "columns": list(frame.columns),
            "dataTypes": info.get("data_types", {}),
            "risks": info.get("risks", []),
            "problemType": analysis.get("problem_type"),
            "confidence": chosen.get("confidence", 0),
            "strategy": chosen,
        }
    except Exception as e:
        # Any failure (unreadable CSV, validation error, ...) becomes a 400.
        raise HTTPException(status_code=400, detail=str(e))
|
| 29 |
+
|
| 30 |
+
@app.post("/train")
async def train_model(file: UploadFile = File(...), target_column: str = "target"):
    """Train a model on the uploaded CSV and return strategy, metrics and model info."""
    try:
        df = pd.read_csv(file.file)
        result = orchestrator.run(df, target_column, train=True)

        # The orchestrator persists the trained model at a fixed location
        # (exports/models/trained_model.pkl); fall back to that real path
        # instead of the previous fabricated "/path/to/model.pkl" placeholder.
        response = {
            "strategy": result.get("strategy", {}),
            "metrics": result.get("metrics", {}),
            "model_path": result.get("model_path", "exports/models/trained_model.pkl"),
            "training_time": result.get("training_time", 0),
            # None rather than a fake id when the orchestrator supplies no id.
            "model_id": result.get("model_id"),
        }
        return response
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 48 |
+
|
| 49 |
+
@app.post("/explain")
async def explain_model(file: UploadFile = File(...), target_column: str = "target"):
    """Train on the uploaded CSV and return strategy/feature-importance explanations.

    NOTE: this re-runs the full pipeline with ``train=True`` on every call;
    no model is reused between endpoints.
    """
    try:
        frame = pd.read_csv(file.file)
        outcome = orchestrator.run(frame, target_column, train=True)
        payload = {
            "strategy_explanation": outcome.get("strategy_explanation"),
            "metrics": outcome.get("metrics", {}),
            "feature_importance": outcome.get("feature_importance", []),
        }
        return payload
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
| 61 |
+
|
| 62 |
+
@app.post("/predict")
async def predict(data: dict):
    """Predict with the last trained model for a single JSON record."""
    try:
        # The model is reloaded from disk on every request, so the most
        # recently trained artifact is always the one used.
        model = orchestrator.model_io.load("exports/models/trained_model.pkl")
        row = pd.DataFrame([data])
        outputs = model.predict(row)
        return {"prediction": outputs.tolist()}
    except Exception as e:
        # Covers both a missing model file and malformed input records.
        raise HTTPException(status_code=400, detail=str(e))
|
| 73 |
+
|
| 74 |
+
@app.get("/health")
def health():
    """Liveness probe used by the hosting platform."""
    return {"status": "ok"}
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
backend/config/__init__.py
ADDED
|
File without changes
|
backend/core/__init__.py
ADDED
|
File without changes
|
backend/core/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (171 Bytes). View file
|
|
|
backend/core/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (147 Bytes). View file
|
|
|
backend/core/__pycache__/dataset_analyzer.cpython-310.pyc
ADDED
|
Binary file (3.39 kB). View file
|
|
|
backend/core/__pycache__/dataset_analyzer.cpython-313.pyc
ADDED
|
Binary file (6.55 kB). View file
|
|
|
backend/core/__pycache__/deployment_generator.cpython-310.pyc
ADDED
|
Binary file (1.13 kB). View file
|
|
|
backend/core/__pycache__/deployment_generator.cpython-313.pyc
ADDED
|
Binary file (1.21 kB). View file
|
|
|
backend/core/__pycache__/explainability.cpython-310.pyc
ADDED
|
Binary file (912 Bytes). View file
|
|
|
backend/core/__pycache__/explainability.cpython-313.pyc
ADDED
|
Binary file (1.72 kB). View file
|
|
|
backend/core/__pycache__/model_factory.cpython-310.pyc
ADDED
|
Binary file (1.81 kB). View file
|
|
|
backend/core/__pycache__/model_factory.cpython-313.pyc
ADDED
|
Binary file (2.07 kB). View file
|
|
|
backend/core/__pycache__/monitoring.cpython-310.pyc
ADDED
|
Binary file (645 Bytes). View file
|
|
|
backend/core/__pycache__/monitoring.cpython-313.pyc
ADDED
|
Binary file (815 Bytes). View file
|
|
|
backend/core/__pycache__/orchestrator.cpython-310.pyc
ADDED
|
Binary file (2.86 kB). View file
|
|
|
backend/core/__pycache__/orchestrator.cpython-313.pyc
ADDED
|
Binary file (4.74 kB). View file
|
|
|
backend/core/__pycache__/problem_inference.cpython-310.pyc
ADDED
|
Binary file (611 Bytes). View file
|
|
|
backend/core/__pycache__/problem_inference.cpython-313.pyc
ADDED
|
Binary file (786 Bytes). View file
|
|
|
backend/core/__pycache__/strategy_reasoner.cpython-310.pyc
ADDED
|
Binary file (2.04 kB). View file
|
|
|
backend/core/__pycache__/strategy_reasoner.cpython-313.pyc
ADDED
|
Binary file (3.02 kB). View file
|
|
|
backend/core/dataset_analyzer.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from backend.utils.logger import logger
|
| 4 |
+
|
| 5 |
+
def convert_numpy_types(obj):
    """Recursively convert numpy types to Python types for JSON serialization.

    Handles numpy arrays, any integer/float/bool numpy scalar (``np.integer``
    and ``np.floating`` cover all widths, not just int32/int64/float32/float64),
    and recurses into dicts, lists and tuples (tuples come back as lists,
    which JSON serializes identically).  Anything else is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_types(item) for item in obj]
    return obj
|
| 21 |
+
|
| 22 |
+
class DatasetAnalyzer:
    """Profiles a dataframe: drops trivially-useless columns/rows and extracts
    the dataset characteristics used for strategy selection."""

    def analyze(self, df: pd.DataFrame, target_column: str = None):
        """Return a JSON-serializable dict describing *df*.

        Drops all-null columns, duplicate rows and constant columns (on a
        local rebinding -- the caller's dataframe is not mutated), then
        computes size/sparsity flags, numeric/categorical splits,
        per-column cardinality, target info and a text-column heuristic.

        Raises:
            ValueError: if fewer than 2 usable features remain.
        """
        logger.info("Starting dataset analysis...")
        # Remove all-null columns
        null_columns = df.columns[df.isnull().all()]
        if len(null_columns) > 0:
            logger.warning(f"Removing all-null columns: {list(null_columns)}")
            df = df.drop(columns=null_columns)

        # Remove duplicate rows
        duplicate_rows = df.duplicated().sum()
        if duplicate_rows > 0:
            logger.warning(f"Removing {duplicate_rows} duplicate rows")
            df = df.drop_duplicates()

        # Remove constant columns (a single unique value carries no signal)
        constant_columns = [col for col in df.columns if df[col].nunique() == 1]
        if len(constant_columns) > 0:
            logger.warning(f"Removing constant columns: {constant_columns}")
            df = df.drop(columns=constant_columns)

        # Ensure at least 2 usable features after preprocessing
        usable_features = [col for col in df.columns if col != target_column]
        if len(usable_features) < 2:
            raise ValueError(f"Insufficient features: only {len(usable_features)} usable features after preprocessing, need at least 2")

        info = {}
        info["num_rows"] = df.shape[0]
        info["num_columns"] = df.shape[1]
        info["missing_ratio"] = df.isnull().mean().mean()
        info["row_count"] = df.shape[0]
        # Heuristic thresholds: >50 columns is "high dimensional", <1200 rows
        # is "small", >40% missing cells overall is "sparse".
        info["high_dimensional"] = bool(df.shape[1] > 50)
        info["small_data"] = bool(df.shape[0] < 1200)
        info["sparse_data"] = bool(df.isnull().mean().mean() > 0.4)
        all_numeric_cols = df.select_dtypes(include="number").columns.tolist()
        all_categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
        info["numeric_cols"] = [col for col in all_numeric_cols if col != target_column]
        info["categorical_cols"] = [col for col in all_categorical_cols if col != target_column]

        if len(info["numeric_cols"]) + len(info["categorical_cols"]) < 2:
            raise ValueError("Dataset must have at least 2 usable features after preprocessing")

        # Cardinality: distinct values per column (target included)
        info["cardinality"] = {col: df[col].nunique() for col in df.columns}

        # Target-specific checks: a numeric target with >10 distinct values is
        # treated as regression, everything else as classification.
        if target_column and target_column in df.columns:
            target = df[target_column]
            unique_vals = target.nunique()
            if target.dtype in ['int64', 'float64'] and unique_vals > 10:
                info["target_type"] = "regression"
                info["class_distribution"] = None
                info["imbalance"] = None
            else:
                info["target_type"] = "classification"
                value_counts = target.value_counts(normalize=True)
                info["class_distribution"] = value_counts.to_dict()
                # Flag imbalance when one class holds >80% of the rows.
                info["imbalance"] = bool(value_counts.max() > 0.8)
        else:
            info["target_type"] = None
            info["class_distribution"] = None
            info["imbalance"] = None

        # NLP detection heuristic: a categorical column whose average string
        # length exceeds 30 characters is treated as free text.
        # (Removed a dead local `avg_text_len` that was assigned but never used.)
        text_columns = []
        for col in info["categorical_cols"]:
            if df[col].astype(str).str.len().mean() > 30:
                text_columns.append(col)
        info["text_columns"] = text_columns
        info["possible_nlp"] = len(text_columns) > 0

        return convert_numpy_types(info)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
backend/core/deployment_generator.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class DeploymentGenerator:
    """Renders deployment artifacts (a FastAPI serving app and a Dockerfile)
    for a trained model as plain-text templates."""

    def generate_fastapi_app(self, model_path):
        """Return the source of a minimal FastAPI app serving *model_path*.

        The generated app loads the model once at import time and exposes a
        single POST /predict endpoint taking one JSON record per request.
        """
        template = f'''
from fastapi import FastAPI
import joblib
import pandas as pd

app = FastAPI()
model = joblib.load("{model_path}")

@app.post("/predict")
async def predict(data: dict):
    df = pd.DataFrame([data])
    preds = model.predict(df)
    return {{"prediction": preds.tolist()}}
'''
        return template

    def generate_dockerfile(self):
        """Return a Dockerfile for the generated serving app.

        CONSISTENCY FIX: the base image is python:3.10 to match the Python
        version the models are trained under (the project's own Dockerfile
        uses python:3.10-slim); previously this emitted python:3.9, and a
        mismatched interpreter can break unpickling of saved models.
        """
        return '''
FROM python:3.10
WORKDIR /app
COPY . /app
RUN pip install fastapi uvicorn pandas scikit-learn joblib
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
'''
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
backend/core/explainability.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import shap
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
class ExplainabilityEngine:
    """Computes SHAP-based global feature importance for trained pipelines."""

    def explain_tabular(self, model_pipeline, X_sample):
        """Return mean |SHAP value| per transformed feature.

        Args:
            model_pipeline: a fitted sklearn-style Pipeline with named steps
                "preprocessor" and "model".
            X_sample: non-empty DataFrame of raw feature rows.

        Returns:
            list[float]: mean absolute SHAP value per feature, ordered by the
            preprocessor's OUTPUT columns (not the raw input columns).

        Raises:
            ValueError: on empty input/output or failed SHAP computation.
        """
        if X_sample.empty:
            raise ValueError("Sample data is empty, cannot compute explanations")

        # Extract trained model and preprocessor
        preprocessor = model_pipeline.named_steps["preprocessor"]
        model = model_pipeline.named_steps["model"]

        # SHAP runs on the transformed matrix, so importances refer to
        # post-encoding features.
        X_transformed = preprocessor.transform(X_sample)

        if X_transformed.shape[0] == 0:
            raise ValueError("Transformed sample data is empty after preprocessing")

        # The transformed sample doubles as the SHAP background dataset.
        explainer = shap.Explainer(model, X_transformed)
        # check_additivity=False is presumably meant to skip SHAP's additivity
        # assertion, which can fail for some model/background combinations.
        # NOTE(review): not all explainer types accept this kwarg in __call__
        # -- confirm against the shap version pinned in requirements.
        shap_values = explainer(X_transformed, check_additivity=False)

        if shap_values is None or shap_values.values is None:
            raise ValueError("SHAP computation failed")

        # Global importance: average magnitude across the sampled rows.
        global_importance = np.abs(shap_values.values).mean(axis=0).tolist()

        if len(global_importance) == 0:
            raise ValueError("No feature importance computed")

        return global_importance
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
backend/core/model_factory.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from sklearn.model_selection import train_test_split
|
| 3 |
+
from ..tabular.pipelines import build_preprocessing_pipeline
|
| 4 |
+
from ..tabular.trainers import train_model
|
| 5 |
+
from ..tabular.evaluators import evaluate_model
|
| 6 |
+
from ..nlp.trainers import TextClassifier
|
| 7 |
+
from ..nlp.evaluators import evaluate_nlp_model
|
| 8 |
+
from ..utils.model_io import ModelIO
|
| 9 |
+
|
| 10 |
+
class ModelFactory:
    """Builds, trains, evaluates and persists models for a given strategy."""

    def __init__(self):
        # Handles model (de)serialization to disk.
        self.model_io = ModelIO()

    def build_and_train(self, df, target_column, dataset_info, problem_type, strategy):
        """Train a model on *df* and return ``(model, metrics)``.

        Args:
            df: full dataframe including the target column.
            target_column: name of the column to predict.
            dataset_info: analyzer output; reads "small_data", "numeric_cols"
                and "categorical_cols".
            problem_type: "classification", "regression" or "nlp".
            strategy: strategy dict forwarded to the trainer.

        Raises:
            ValueError: for small datasets (flagged by the analyzer as <1200
                rows) or NLP problems, which this version does not support.
        """
        if dataset_info["small_data"]:
            raise ValueError("Dataset is too small for training. Minimum 1200 rows required.")

        if problem_type == "nlp":
            raise ValueError("NLP functionality is not supported in this version.")
        else:
            # Tabular
            X = df.drop(columns=[target_column])
            y = df[target_column]

            # Fixed 80/20 split with a pinned seed for reproducibility.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            pipeline = build_preprocessing_pipeline(dataset_info["numeric_cols"], dataset_info["categorical_cols"])
            pipeline.fit(X_train, y_train)

            # NOTE(review): train_model receives the already-fitted
            # preprocessing pipeline; presumably it combines it with the
            # estimator -- confirm against backend.tabular.trainers.
            model = train_model(pipeline, X_train, y_train, problem_type, strategy)
            metrics = evaluate_model(model, X_test, y_test, problem_type)

            # Save model (the orchestrator writes to this same path again
            # after training, so this is the canonical model location).
            self.model_io.save(model, "exports/models/trained_model.pkl")

            return model, metrics
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
backend/core/monitoring.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
class MonitoringEngine:
|
| 4 |
+
def detect_drift(self, train_stats, new_data_stats, threshold=0.2):
|
| 5 |
+
drift_flags = {}
|
| 6 |
+
for feature in train_stats:
|
| 7 |
+
if abs(train_stats[feature] - new_data_stats.get(feature, train_stats[feature])) > threshold:
|
| 8 |
+
drift_flags[feature] = True
|
| 9 |
+
return drift_flags
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
backend/core/orchestrator.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .dataset_analyzer import DatasetAnalyzer
|
| 2 |
+
from .problem_inference import ProblemInference
|
| 3 |
+
from .strategy_reasoner import StrategyReasoner
|
| 4 |
+
from .model_factory import ModelFactory
|
| 5 |
+
from .explainability import ExplainabilityEngine
|
| 6 |
+
from .deployment_generator import DeploymentGenerator
|
| 7 |
+
from .monitoring import MonitoringEngine
|
| 8 |
+
from ..utils.logger import logger
|
| 9 |
+
from ..utils.validators import DataValidator
|
| 10 |
+
from ..utils.model_io import ModelIO
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
class Orchestrator:
    """Coordinates the end-to-end workflow: validation, analysis, problem
    inference, strategy selection and (optionally) training, explanation and
    deployment-artifact generation."""

    def __init__(self):
        self.validator = DataValidator()
        self.analyzer = DatasetAnalyzer()
        self.inferencer = ProblemInference()
        self.reasoner = StrategyReasoner()
        self.model_factory = ModelFactory()
        self.explainer = ExplainabilityEngine()
        self.deployer = DeploymentGenerator()
        self.monitor = MonitoringEngine()
        self.model_io = ModelIO()

    def run(self, df, target_column, train=False):
        """Run the pipeline on *df* and return a result dict.

        Always performs validation, analysis, problem inference and strategy
        selection, and writes a per-target strategy log under
        experiments/logs/.  With ``train=True`` it additionally trains a
        model, records metrics and SHAP feature importance, saves the model
        to exports/models/trained_model.pkl and writes deployment artifacts
        (main.py + Dockerfile) to exports/deployment/.

        Args:
            df: input dataframe including the target column.
            target_column: column to predict; also used as the strategy log
                filename stem.
            train: whether to run the training/explanation/deployment steps.
        """
        self.validator.validate_dataframe(df, target_column)
        logger.info("Validation passed")
        dataset_info = self.analyzer.analyze(df, target_column)
        problem_type = self.inferencer.infer(dataset_info, target_column)
        strategy = self.reasoner.decide(dataset_info, problem_type)

        tradeoff_explanation = self.reasoner.explain_tradeoffs(strategy)

        # Log strategy behavior for offline inspection of the reasoner.
        log_data = {
            "dataset_characteristics": dataset_info,
            "chosen_model_family": strategy.get("model_family"),
            "detected_risks": strategy.get("risks", []),
            "confidence_score": strategy.get("confidence", 0)
        }
        os.makedirs("experiments/logs", exist_ok=True)
        # NOTE(review): target_column is interpolated into a filesystem path;
        # unusual column names could produce odd or invalid paths -- confirm
        # upstream validation constrains the name.
        with open(f"experiments/logs/{target_column}_strategy.json", "w") as f:
            json.dump(log_data, f, indent=4, default=str)

        response = {
            "dataset_info": dataset_info,
            "problem_type": problem_type,
            "strategy": strategy,
            "strategy_tradeoffs": tradeoff_explanation
        }

        if problem_type == "nlp":
            response["nlp_mode"] = "activated"

        if train:
            model, metrics = self.model_factory.build_and_train(
                df, target_column, dataset_info, problem_type, strategy
            )
            response["metrics"] = metrics

            explanation = self.reasoner.explain_strategy(strategy)
            response["strategy_explanation"] = explanation

            # Cap the explanation input at 100 rows to keep SHAP tractable.
            X_sample = df.drop(columns=[target_column]).head(100)  # Sample for SHAP
            feature_importance = self.explainer.explain_tabular(model, X_sample)
            response["feature_importance"] = feature_importance

            # Save the trained model
            os.makedirs("exports/models", exist_ok=True)
            os.makedirs("exports/deployment", exist_ok=True)
            model_path = "exports/models/trained_model.pkl"
            self.model_io.save(model, model_path)

            # Generate deployment artifacts
            fastapi_app = self.deployer.generate_fastapi_app(model_path)
            dockerfile = self.deployer.generate_dockerfile()

            with open("exports/deployment/main.py", "w") as f:
                f.write(fastapi_app)
            with open("exports/deployment/Dockerfile", "w") as f:
                f.write(dockerfile)

        return response
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
backend/core/problem_inference.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class ProblemInference:
|
| 2 |
+
def infer(self, dataset_info, target_column):
|
| 3 |
+
if dataset_info.get("possible_nlp"):
|
| 4 |
+
return "nlp"
|
| 5 |
+
|
| 6 |
+
if target_column:
|
| 7 |
+
return "classification" if dataset_info.get("class_distribution") else "regression"
|
| 8 |
+
|
| 9 |
+
return "unknown"
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
backend/core/strategy_reasoner.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class StrategyReasoner:
    """Selects a model family from dataset characteristics and explains why."""

    def decide(self, dataset_info, problem_type):
        """Return a strategy dict: model_family, reason, risks, confidence.

        Each detected risk lowers the confidence score, which starts at 1.0
        and is floored at 0.1 (score capped at 0.9).
        """
        strategy = {}
        risks = []
        score = 0.0

        # Accumulate risk flags; each one lowers confidence.
        if dataset_info.get("small_data"):
            risks.append("small_dataset")
            score += 0.1

        if dataset_info.get("high_dimensional"):
            risks.append("high_dimensionality")
            score += 0.1

        if dataset_info.get("imbalance"):
            risks.append("class_imbalance")
            score += 0.2

        if dataset_info.get("sparse_data"):
            risks.append("high_missingness")
            score += 0.2

        if problem_type == "classification":
            if "small_dataset" in risks:
                model_family = "tree_ensemble"
                reason = "Small datasets benefit from simpler models"
            elif "high_dimensionality" in risks:
                model_family = "tree_ensemble"
                reason = "Tree ensembles handle high-dimensional data better"
            else:
                model_family = "tree_ensemble"
                reason = "Tree ensembles handle complexity well"

        elif problem_type == "regression":
            if "high_dimensionality" in risks:
                model_family = "tree_ensemble"
                reason = "Tree ensembles handle high-dimensional data better"
            else:
                model_family = "linear_or_tree"
                reason = "Balances interpretability and accuracy"

        elif problem_type == "nlp":
            model_family = "transformer"
            reason = "Transformers best capture language semantics"

        else:
            # BUG FIX: an unrecognized problem_type (e.g. "unknown") used to
            # leave model_family/reason unbound and raise UnboundLocalError
            # below; fall back to a safe default instead.
            model_family = "tree_ensemble"
            reason = "Default choice for unrecognized problem types"

        strategy["model_family"] = model_family
        strategy["reason"] = reason
        strategy["risks"] = risks
        strategy["confidence"] = round(1 - min(score, 0.9), 2)

        return strategy

    def explain_strategy(self, strategy):
        """One-sentence explanation of the chosen family plus any risks."""
        explanation = f"Selected {strategy['model_family']} models because: {strategy['reason']}."
        if strategy.get("risks"):
            explanation += f" Identified risks: {', '.join(strategy['risks'])}."
        return explanation

    def explain_tradeoffs(self, strategy):
        """Like explain_strategy, but also reports the confidence score."""
        explanation = f"Chose {strategy['model_family']} due to: {strategy['reason']}."
        if strategy.get("risks"):
            explanation += f" Risks detected: {', '.join(strategy['risks'])}."
        explanation += f" Confidence score: {strategy.get('confidence')}."
        return explanation
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
|
backend/experiments/__init__.py
ADDED
|
File without changes
|
backend/experiments/__pycache__/benchmark_runner.cpython-313.pyc
ADDED
|
Binary file (1.47 kB). View file
|
|
|
backend/experiments/benchmark_runner.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
class BenchmarkRunner:
|
| 4 |
+
def run(self, orchestrator, datasets):
|
| 5 |
+
results = []
|
| 6 |
+
for name, (df, target) in datasets.items():
|
| 7 |
+
start = time.time()
|
| 8 |
+
try:
|
| 9 |
+
output = orchestrator.run(df, target, train=True)
|
| 10 |
+
end = time.time()
|
| 11 |
+
|
| 12 |
+
results.append({
|
| 13 |
+
"dataset": name,
|
| 14 |
+
"strategy": output.get("strategy"),
|
| 15 |
+
"metrics": output.get("metrics"),
|
| 16 |
+
"time": round(end - start, 2),
|
| 17 |
+
"error": None
|
| 18 |
+
})
|
| 19 |
+
except Exception as e:
|
| 20 |
+
end = time.time()
|
| 21 |
+
results.append({
|
| 22 |
+
"dataset": name,
|
| 23 |
+
"strategy": None,
|
| 24 |
+
"metrics": None,
|
| 25 |
+
"time": round(end - start, 2),
|
| 26 |
+
"error": str(e)
|
| 27 |
+
})
|
| 28 |
+
return results
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
backend/experiments/run_benchmarks.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from benchmark_runner import BenchmarkRunner
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
|
| 6 |
+
from backend.core.orchestrator import Orchestrator
|
| 7 |
+
|
| 8 |
+
# Load datasets
|
| 9 |
+
datasets = {
|
| 10 |
+
"titanic": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'titanic.csv')), "Survived"),
|
| 11 |
+
"credit_default": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'credit_default.csv')), "default.payment.next.month"),
|
| 12 |
+
"house_prices": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'house_prices.csv')), "Price"),
|
| 13 |
+
"telecom_churn": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'telecom_churn.csv')), "Churn"),
|
| 14 |
+
"news_classification": (pd.read_csv(os.path.join(os.path.dirname(__file__), '..', '..', 'datasets', 'real_world', 'news_classification.csv')), "label"),
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
orchestrator = Orchestrator()
|
| 18 |
+
runner = BenchmarkRunner()
|
| 19 |
+
results = runner.run(orchestrator, datasets)
|
| 20 |
+
|
| 21 |
+
print("Benchmark Results:")
|
| 22 |
+
for result in results:
|
| 23 |
+
print(result)
|
| 24 |
+
|
| 25 |
+
# Save results to file
|
| 26 |
+
with open("experiments/benchmark_results.json", "w") as f:
|
| 27 |
+
import json
|
| 28 |
+
json.dump(results, f, indent=4)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
backend/nlp/__pycache__/evaluators.cpython-310.pyc
ADDED
|
Binary file (747 Bytes). View file
|
|
|
backend/nlp/__pycache__/evaluators.cpython-313.pyc
ADDED
|
Binary file (1.1 kB). View file
|
|
|
backend/nlp/__pycache__/preprocess.cpython-310.pyc
ADDED
|
Binary file (584 Bytes). View file
|
|
|
backend/nlp/__pycache__/preprocess.cpython-313.pyc
ADDED
|
Binary file (816 Bytes). View file
|
|
|
backend/nlp/__pycache__/trainers.cpython-310.pyc
ADDED
|
Binary file (2.56 kB). View file
|
|
|
backend/nlp/__pycache__/trainers.cpython-313.pyc
ADDED
|
Binary file (3.12 kB). View file
|
|
|
backend/nlp/embeddings.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
|
| 3 |
+
class EmbeddingEngine:
    """Thin wrapper around a SentenceTransformer model for text embeddings."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # The transformer model is instantiated once here and reused for
        # every encode() call.
        self.model = SentenceTransformer(model_name)

    def encode(self, texts):
        """Return embedding vectors for *texts* (delegates to the underlying model)."""
        return self.model.encode(texts)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|