|
|
import pandas as pd |
|
|
from deepchecks.tabular import Dataset |
|
|
from deepchecks.tabular.suites import data_integrity, train_test_validation |
|
|
|
|
|
def validate_data(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    output_dir: str = "reports",
    label: str = "target_price",
):
    """Run the DeepChecks integrity and train-test validation suites.

    The data-integrity suite is run on the training data only; the
    train-test validation suite is run on the training and testing data
    together. Each result is written as an HTML report inside
    ``output_dir``, which is created if it does not already exist.

    Args:
        train_df: Training data; must contain the ``label`` column.
        test_df: Testing data; must contain the ``label`` column.
        output_dir: Directory that receives the HTML reports.
        label: Name of the label column passed to both ``Dataset`` objects.

    Returns:
        A tuple ``(integrity_result, validation_result)`` holding the
        objects returned by the two suite runs.
    """
    import os

    # Both datasets are built with an explicitly empty categorical-feature
    # list, so DeepChecks is told there are no categorical features.
    train_ds = Dataset(train_df, label=label, cat_features=[])
    test_ds = Dataset(test_df, label=label, cat_features=[])

    os.makedirs(output_dir, exist_ok=True)

    print("Running Data Integrity Check...")
    integrity_result = data_integrity().run(train_ds)
    integrity_path = _save_report(
        integrity_result, output_dir, "data_integrity.html"
    )
    print(f"Data Integrity report saved to {integrity_path}")

    print("Running Train-Test Validation (Drift Check)...")
    validation_result = train_test_validation().run(train_ds, test_ds)
    validation_path = _save_report(
        validation_result, output_dir, "train_test_validation.html"
    )
    print(f"Train-Test Validation report saved to {validation_path}")

    return integrity_result, validation_result


def _save_report(result, output_dir: str, filename: str) -> str:
    """Save ``result`` as HTML under ``output_dir`` and return the path used."""
    import os

    requested_path = os.path.join(output_dir, filename)
    # NOTE(review): DeepChecks documents save_as_html as returning the name
    # of the file it created, which may differ from the requested name when
    # that file already exists - confirm against the installed version.
    # Falling back to the requested path keeps the message meaningful if
    # nothing is returned.
    written_path = result.save_as_html(requested_path)
    return written_path or requested_path
|
|
|
|
|
if __name__ == "__main__":
    # Placeholder entry point: running this file as a script does nothing.
    pass
|
|
|