| import pandas as pd |
| import numpy as np |
| from services.training.preprocessing import auto_clean_data, fuzzy_merge_labels |
| from services.drift_service import get_drift_dashboard |
| from services.training.forecasting import estimate_training_forecast |
| from core.file_loader import load_dataframe |
|
|
|
|
def test_fuzzy_merge_labels():
    """Check that fuzzy merging folds six raw labels into three groups.

    The input holds three spellings of "apple" (two "Apple", one "apple"),
    two of "banana", and a single "orange"; only group sizes are checked,
    not which spelling is kept.
    """
    raw_labels = pd.Series(["Apple", "apple", "banana", "Banana", "Apple", "orange"])

    counts = fuzzy_merge_labels(raw_labels, threshold=0.8).value_counts()

    # Exactly three distinct labels should survive the merge.
    assert len(counts) == 3

    # value_counts() sorts by descending frequency, so the two biggest
    # groups sit at positions 0 and 1.
    largest, runner_up = counts.iloc[0], counts.iloc[1]
    assert largest == 3
    assert runner_up == 2
|
|
|
|
def test_auto_clean_data_drops_constants_and_nulls():
    """Check which columns auto_clean_data removes and which it keeps.

    The fixture has a single-valued column, a column that is NaN in four of
    five rows, and an all-unique integer "id_col"; all three must be absent
    from the result, while the target and the numeric score column remain.
    """
    frame = pd.DataFrame(
        {
            "target": [1, 2, 3, 4, 5],
            "id_col": [1, 2, 3, 4, 5],
            "constant_col": ["a", "a", "a", "a", "a"],
            "null_col": [np.nan, np.nan, np.nan, np.nan, "1"],
            "score_col": [10.5, 12.3, 8.1, 7.5, 15.0],
        }
    )

    # The second return value (cleaning logs) is not inspected here.
    cleaned_df, _logs = auto_clean_data(frame, target="target")

    for removed in ("constant_col", "null_col", "id_col"):
        assert removed not in cleaned_df.columns

    for retained in ("target", "score_col"):
        assert retained in cleaned_df.columns
|
|
|
|
def test_auto_clean_null_standardization():
    """Check that the strings "n/a" and "null" come back as missing values.

    A real value in the same column ("val1") must be left untouched.
    """
    frame = pd.DataFrame(
        {
            "target": [1, 2, 3, 4],
            "mixed": ["val1", "n/a", "null", "val2"],
        }
    )

    cleaned, _ = auto_clean_data(frame, "target")

    # Rows 1 and 2 held the placeholder strings "n/a" and "null".
    for placeholder_row in (1, 2):
        assert pd.isna(cleaned.loc[placeholder_row, "mixed"])

    assert cleaned.loc[0, "mixed"] == "val1"
|
|
|
|
def test_normalize_result_contract():
    """Check the normalized shape of a partial results payload.

    Expectations pinned by this test: a None model name becomes "", a
    numeric string score becomes a float, absent "metric_name" and
    "model_path" keys come back as "", a list-valued shap summary becomes
    an empty dict, and a bare reasoning string is wrapped in a list.
    """
    from infra.result_contract import normalize_results

    normalized = normalize_results(
        {
            "best_model": None,
            "score": "95.2",
            "leaderboard": [{"model": "A", "score": 95}],
            "shap_summary": [("f1", 0.2)],
            "reasoning": "Completed training",
        }
    )

    # Dict order mirrors the order in which fields are verified.
    expected_fields = {
        "best_model": "",
        "score": 95.2,
        "metric_name": "",
        "leaderboard": [{"model": "A", "score": 95}],
        "model_path": "",
        "shap_summary": {},
        "reasoning": ["Completed training"],
    }
    for field, expected in expected_fields.items():
        assert normalized[field] == expected
|
|
|
|
def test_normalize_result_contract_sanitizes_non_finite_values():
    """Check that NaN and infinity are scrubbed from a results payload.

    Expectations pinned by this test: a NaN top-level score becomes 0.0,
    while NaN/inf values nested inside the leaderboard, the shap summary
    and the reasoning list become None.
    """
    from infra.result_contract import normalize_results

    normalized = normalize_results(
        {
            "best_model": "LGBM",
            "score": np.nan,
            "leaderboard": [{"model": "A", "score": np.nan, "mse": np.inf}],
            "shap_summary": {"f1": np.nan, "f2": np.inf},
            "reasoning": ["ok", np.nan],
        }
    )

    # Dict order mirrors the order in which fields are verified.
    expected_fields = {
        "score": 0.0,
        "leaderboard": [{"model": "A", "score": None, "mse": None}],
        "shap_summary": {"f1": None, "f2": None},
        "reasoning": ["ok", None],
    }
    for field, expected in expected_fields.items():
        assert normalized[field] == expected
|
|
|
|
def test_load_dataframe_supports_markdown_documents():
    """Check that a markdown byte payload loads into a text-segment frame.

    The result must be non-empty, expose the five expected columns in
    order, and its first row must be a "block" segment from "notes.md"
    whose text contains the document heading.
    """
    payload = b"# Churn Notes\n\nCustomer called support twice.\nLikely renewal risk.\n"
    expected_columns = [
        "source_file",
        "segment_type",
        "segment_index",
        "text",
        "text_length",
    ]

    df = load_dataframe(contents=payload, filename="notes.md")

    assert not df.empty
    assert list(df.columns) == expected_columns

    first_segment = df.iloc[0]
    assert first_segment["source_file"] == "notes.md"
    assert first_segment["segment_type"] == "block"
    assert "Churn Notes" in first_segment["text"]
|
|
|
|
def test_drift_dashboard_respects_custom_thresholds():
    """Check that caller-supplied PSI thresholds are echoed in the report.

    The report must also carry an alert summary whose level is one of
    "warning", "critical" or "stable".
    """
    warning_psi, critical_psi = 0.01, 0.02
    baseline_stats = {"age": {"mean": 10.0, "std": 1.0, "count": 200}}
    current_df = pd.DataFrame({"age": np.linspace(10.2, 10.8, 200)})

    report = get_drift_dashboard(
        current_df=current_df,
        baseline_stats=baseline_stats,
        feature_names=["age"],
        warning_threshold=warning_psi,
        critical_threshold=critical_psi,
    )

    thresholds = report["thresholds"]
    assert thresholds["warning_psi"] == warning_psi
    assert thresholds["critical_psi"] == critical_psi

    assert "alert_summary" in report
    allowed_levels = {"warning", "critical", "stable"}
    assert report["alert_summary"]["level"] in allowed_levels
|
|
|
|
def test_training_forecast_returns_runtime_and_budget():
    """Check the forecast for a small classification profile.

    The forecast must echo the requested goal and mode, give a duration
    range whose max is not below its min, report 32 Optuna trials, and
    count the three selected features.
    """
    column_names = [f"f{i}" for i in range(12)]
    profile = {
        "rows": 1200,
        "cols": 12,
        "columns": column_names,
        "task_type": "classification",
        "missing_pct": 4.5,
    }

    forecast = estimate_training_forecast(
        profile=profile,
        target_column="target",
        goal="Balanced",
        mode="Full",
        selected_features=["f1", "f2", "f3"],
        cv_folds=5,
        handle_imbalance=True,
        auto_clean=True,
        eval_metric="Accuracy",
    )

    assert forecast["goal"] == "Balanced"
    assert forecast["mode"] == "Full"

    duration = forecast["estimated_duration_seconds"]
    assert duration["max"] >= duration["min"]

    assert forecast["optuna_trials"] == 32
    assert forecast["estimated_feature_count"] == 3
|
|