import pandas as pd from deepchecks.tabular import Dataset from deepchecks.tabular.suites import data_integrity, train_test_validation def validate_data(train_df: pd.DataFrame, test_df: pd.DataFrame, output_dir: str = "reports"): """ Runs DeepChecks on training and testing data. """ # Create DeepChecks Datasets # Assuming 'target_price' is the label for regression train_ds = Dataset(train_df, label='target_price', cat_features=[]) test_ds = Dataset(test_df, label='target_price', cat_features=[]) import os os.makedirs(output_dir, exist_ok=True) # 1. Data Integrity Check print("Running Data Integrity Check...") integrity_suite = data_integrity() integrity_result = integrity_suite.run(train_ds) integrity_result.save_as_html(f"{output_dir}/data_integrity.html") print(f"Data Integrity report saved to {output_dir}/data_integrity.html") # 2. Train-Test Validation (Drift) print("Running Train-Test Validation (Drift Check)...") validation_suite = train_test_validation() validation_result = validation_suite.run(train_ds, test_ds) validation_result.save_as_html(f"{output_dir}/train_test_validation.html") print(f"Train-Test Validation report saved to {output_dir}/train_test_validation.html") return integrity_result, validation_result if __name__ == "__main__": # Example usage # df = pd.read_csv("data/processed/AAPL_processed.csv") # train_df = df.iloc[:-30] # test_df = df.iloc[-30:] # validate_data(train_df, test_df) pass