Spaces:
Sleeping
Sleeping
Upload report.py
Browse files
report.py
CHANGED
|
@@ -23,7 +23,9 @@ def plot_heatmap(df, title):
|
|
| 23 |
|
| 24 |
|
| 25 |
def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
|
| 28 |
cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
|
| 29 |
|
|
@@ -52,6 +54,13 @@ def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
|
|
| 52 |
plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
|
| 53 |
plt.close()
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def plot_column_schemas(df):
|
| 57 |
schemas = df.dtypes.astype(str).value_counts()
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
def plot_valid_data_percentage(original_df, cleaned_df, primary_key_column):
|
| 26 |
+
# Get columns present in both DataFrames, excluding the primary key
|
| 27 |
+
columns_to_plot = [col for col in original_df.columns if col in cleaned_df.columns and col != primary_key_column]
|
| 28 |
+
|
| 29 |
original_valid = (original_df[columns_to_plot].notna().sum() / len(original_df)) * 100
|
| 30 |
cleaned_valid = (cleaned_df[columns_to_plot].notna().sum() / len(cleaned_df)) * 100
|
| 31 |
|
|
|
|
| 54 |
plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
|
| 55 |
plt.close()
|
| 56 |
|
| 57 |
+
# Print information about removed columns
|
| 58 |
+
removed_columns = set(original_df.columns) - set(cleaned_df.columns)
|
| 59 |
+
if removed_columns:
|
| 60 |
+
print(f"The following columns were removed during the cleaning process: {', '.join(removed_columns)}")
|
| 61 |
+
else:
|
| 62 |
+
print("No columns were removed during the cleaning process.")
|
| 63 |
+
|
| 64 |
|
| 65 |
def plot_column_schemas(df):
|
| 66 |
schemas = df.dtypes.astype(str).value_counts()
|