File size: 1,552 Bytes
3bce488 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import os

import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity, train_test_validation
def _save_report(result, output_dir: str, filename: str) -> str:
    """Save a suite result as HTML under ``output_dir`` and return the path written."""
    requested_path = os.path.join(output_dir, filename)
    # NOTE(review): deepchecks' ``save_as_html`` is documented as returning the
    # name of the file it created, which may differ from the requested name if
    # that file already exists -- confirm against the installed version. Fall
    # back to the requested path when nothing is returned.
    written_path = result.save_as_html(requested_path)
    return written_path or requested_path


def validate_data(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    output_dir: str = "reports",
    label: str = "target_price",
):
    """Run the DeepChecks integrity and train/test validation suites.

    The data-integrity suite is run on the training data only; the
    train-test validation (drift) suite compares the training data with the
    test data. Each result is saved as an HTML report inside ``output_dir``,
    which is created if it does not exist.

    Args:
        train_df: Training data; must contain the ``label`` column.
        test_df: Test data; must contain the ``label`` column.
        output_dir: Directory that receives ``data_integrity.html`` and
            ``train_test_validation.html``.
        label: Name of the label column. Defaults to ``"target_price"``,
            the regression target this module was written for.

    Returns:
        A ``(integrity_result, validation_result)`` tuple holding the results
        of the two suite runs.
    """
    # An explicit empty ``cat_features`` list is passed for both datasets, as in
    # the original code (presumably to stop DeepChecks from inferring
    # categorical columns -- verify against the Dataset documentation).
    train_ds = Dataset(train_df, label=label, cat_features=[])
    test_ds = Dataset(test_df, label=label, cat_features=[])

    os.makedirs(output_dir, exist_ok=True)

    # 1. Data integrity check (training data only).
    print("Running Data Integrity Check...")
    integrity_result = data_integrity().run(train_ds)
    integrity_path = _save_report(integrity_result, output_dir, "data_integrity.html")
    print(f"Data Integrity report saved to {integrity_path}")

    # 2. Train-test validation (drift between the two splits).
    print("Running Train-Test Validation (Drift Check)...")
    validation_result = train_test_validation().run(train_ds, test_ds)
    validation_path = _save_report(
        validation_result, output_dir, "train_test_validation.html"
    )
    print(f"Train-Test Validation report saved to {validation_path}")

    return integrity_result, validation_result
if __name__ == "__main__":
    # Example usage, left commented out so that running this module directly
    # does nothing. The sketch loads a processed CSV, keeps the last 30 rows as
    # the test split and everything before them as the training split, then
    # validates the pair.
    # df = pd.read_csv("data/processed/AAPL_processed.csv")
    # train_df = df.iloc[:-30]
    # test_df = df.iloc[-30:]
    # validate_data(train_df, test_df)
    pass
|