# %%
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import QuantileTransformer
from scipy.stats import boxcox
# %%
# Load the dataset
file_path = "../data/nberces5818v1_n2012.csv"
df = pd.read_csv(file_path)
# Display basic info about the dataset
df.info()
# %%
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)
# %%
# Summary statistics
summary_stats = df.describe()
print(summary_stats)
# %%
# Visualizing missing values
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull(), cmap="viridis", cbar=False, yticklabels=False)
plt.title("Missing Values in Dataset")
plt.show()
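# %%
# Missingness as a share of rows is easier to act on than raw counts when
# choosing between interpolation and model-based imputation later on
# (an optional sketch; the >0% filter is just for readability).
missing_pct = df.isnull().mean().mul(100).round(2)
print(missing_pct[missing_pct > 0].sort_values(ascending=False))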
# %%
# # Define Manufacturing Industry Groups (Example: Modify as Needed)
# heavy_mfg = [3311, 3312, 3321, 3322] # Metal Production, Heavy Equipment
# light_mfg = [3231, 3251, 3261] # Printing, Chemicals, Plastics
# electronics_mfg = [3341, 3351] # Semiconductor, Electrical Equipment
# auto_mfg = [3361, 3362] # Automotive Industry
# # Create a column for manufacturing sector groups
# def categorize_manufacturing(naics):
#     if naics in heavy_mfg:
#         return "Heavy Manufacturing"
#     elif naics in light_mfg:
#         return "Light Manufacturing"
#     elif naics in electronics_mfg:
#         return "Electronics Manufacturing"
#     elif naics in auto_mfg:
#         return "Automotive Manufacturing"
#     else:
#         return "Other Manufacturing"
# df['mfg_sector'] = df['naics'].apply(categorize_manufacturing)
# %%
# # Data Cleaning
# # Continuous Trends - Use Linear Interpolation
# df[['emp', 'pay', 'prode', 'prodh', 'prodw', 'vship', 'matcost', 'vadd', 'energy', 'invent']] = df[['emp', 'pay', 'prode', 'prodh', 'prodw', 'vship', 'matcost', 'vadd', 'energy','invent']].interpolate(method='linear')
# # Moderate Missingness (1-4%) - Use Rolling Median or Global Median
# moderate_missing_cols = ['invest', 'cap', 'equip', 'plant', 'piinv', 'tfp5', 'tfp4']
# df[moderate_missing_cols] = df[moderate_missing_cols].rolling(5, min_periods=1).median()
# df[moderate_missing_cols] = df[moderate_missing_cols].fillna(df[moderate_missing_cols].median())
# # Macroeconomic Indicators - Use Spline Interpolation
# macro_cols = ['piship', 'pimat', 'piinv', 'pien']
# df[macro_cols] = df[macro_cols].interpolate(method='spline', order=3)
# # Define features and targets for predictive imputation
# features = ['emp', 'pay', 'prode', 'invest', 'cap', 'matcost', 'vadd']
# targets = ['dtfp5', 'dtfp4']
# # Process each NAICS group separately
# unique_naics = df['naics'].unique()
# for naics_code in unique_naics:
#     subset_df = df[df['naics'] == naics_code]  # Filter data for current NAICS code
#     train_data = subset_df.dropna(subset=targets + features)  # Ensure enough data exists
#     if not train_data.empty:
#         for target in targets:
#             rf = RandomForestRegressor(n_estimators=100, random_state=42)
#             rf.fit(train_data[features], train_data[target])
#             # Identify rows with missing values but complete features
#             missing_rows = subset_df[subset_df[target].isna() & subset_df[features].notnull().all(axis=1)]
#             if not missing_rows.empty:
#                 df.loc[missing_rows.index, target] = rf.predict(missing_rows[features])
#     # Apply KNN Imputation only to remaining missing values within the NAICS group
#     knn_imputer = KNNImputer(n_neighbors=5)
#     df.loc[df['naics'] == naics_code, targets] = knn_imputer.fit_transform(subset_df[targets])
# # Verify missing values are fixed
# print(df[targets].isnull().sum())
# %%
# df["naics"] = df["naics"].astype(str)
# df = df[df["naics"].str.startswith(("31", "32", "33"))].copy()
# #df_naics_mfg = df[df["naics"].str.startswith(("31", "32", "33"))].copy()
# #.astype(str)
# # Step 2: Define Manufacturing Subcategories
# manufacturing_subcategories = {
# "311": "Food Manufacturing",
# "312": "Beverage & Tobacco Product Manufacturing",
# "313": "Textile Mills",
# "314": "Textile Product Mills",
# "315": "Apparel Manufacturing",
# "316": "Leather & Allied Product Manufacturing",
# "321": "Wood Product Manufacturing",
# "322": "Paper Manufacturing",
# "323": "Printing & Related Support Activities",
# "324": "Petroleum & Coal Products Manufacturing",
# "325": "Chemical Manufacturing",
# "326": "Plastics & Rubber Products Manufacturing",
# "327": "Nonmetallic Mineral Product Manufacturing",
# "331": "Primary Metal Manufacturing",
# "332": "Fabricated Metal Product Manufacturing",
# "333": "Machinery Manufacturing",
# "334": "Computer & Electronic Product Manufacturing",
# "335": "Electrical Equipment, Appliance & Component Manufacturing",
# "336": "Transportation Equipment Manufacturing",
# "337": "Furniture & Related Product Manufacturing",
# "339": "Miscellaneous Manufacturing"
# }
# # Step 3: Assign Subcategories Based on NAICS Prefix
# df["Subcategory"] = df["naics"].str[:3].map(manufacturing_subcategories)
# %%
# Step 1: Convert NAICS Codes to String and Filter for Manufacturing (31-33)
df["naics"] = df["naics"].astype(str)
df = df[df["naics"].str.startswith(("31", "32", "33"))].copy()
# Step 2: Define Core Manufacturing Subcategories (Excluding 323 & 324)
core_manufacturing_subcategories = {
"311": "Food Manufacturing",
"312": "Beverage & Tobacco Product Manufacturing",
"313": "Textile Mills",
"314": "Textile Product Mills",
"315": "Apparel Manufacturing",
"316": "Leather & Allied Product Manufacturing",
"321": "Wood Product Manufacturing",
"322": "Paper Manufacturing",
"325": "Chemical Manufacturing",
"326": "Plastics & Rubber Products Manufacturing",
"327": "Nonmetallic Mineral Product Manufacturing",
"331": "Primary Metal Manufacturing",
"332": "Fabricated Metal Product Manufacturing",
"333": "Machinery Manufacturing",
"334": "Computer & Electronic Product Manufacturing",
"335": "Electrical Equipment, Appliance & Component Manufacturing",
"336": "Transportation Equipment Manufacturing",
"337": "Furniture & Related Product Manufacturing",
"339": "Miscellaneous Manufacturing"
}
# Step 3: Assign Subcategories Based on NAICS Prefix
df["Subcategory"] = df["naics"].str[:3].map(core_manufacturing_subcategories)
# Step 4: Remove Non-Core Manufacturing Categories (Rows where Subcategory is NaN)
df = df.dropna(subset=["Subcategory"])
# Reset index for clean output
df.reset_index(drop=True, inplace=True)
# %%
df
# %%
# Define feature and target variables
features = ['emp', 'pay', 'prode', 'invest', 'cap', 'matcost', 'vadd']
targets = ['dtfp5', 'dtfp4']
# Unique NAICS Groups
unique_naics = df['naics'].unique()
# Iterate Over Each NAICS Group
for naics_code in unique_naics:
    subset_df = df[df['naics'] == naics_code].copy()
    # 1️⃣ Handle Moderate Missingness with Rolling Median
    moderate_missing_cols = ['invest', 'cap', 'equip', 'plant', 'piinv', 'tfp5', 'tfp4']
    # Fill only the gaps with the rolling median, keeping observed values intact
    subset_df[moderate_missing_cols] = subset_df[moderate_missing_cols].fillna(
        subset_df[moderate_missing_cols].rolling(5, min_periods=1).median()
    )
    subset_df[moderate_missing_cols] = subset_df[moderate_missing_cols].fillna(subset_df[moderate_missing_cols].median())
    # 2️⃣ Apply Industry-Specific Interpolation for Continuous Trends
    continuous_cols = ['emp', 'pay', 'prode', 'prodh', 'prodw', 'vship', 'matcost', 'vadd', 'energy', 'invent']
    subset_df[continuous_cols] = subset_df[continuous_cols].interpolate(method='spline', order=3)
    # 3️⃣ Apply Macroeconomic Adjustments with Spline Interpolation
    macro_cols = ['piship', 'pimat', 'piinv', 'pien']
    subset_df[macro_cols] = subset_df[macro_cols].interpolate(method='spline', order=3)
    # 4️⃣ Predict Missing Values using RandomForest
    train_data = subset_df.dropna(subset=targets + features)
    if not train_data.empty:
        for target in targets:
            if subset_df[target].isna().sum() > 0:
                rf = RandomForestRegressor(n_estimators=100, random_state=42)
                rf.fit(train_data[features], train_data[target])
                # Predict only for rows with a missing target but complete features;
                # write into subset_df so the KNN step below does not overwrite the predictions
                missing_rows = subset_df[subset_df[target].isna() & subset_df[features].notnull().all(axis=1)]
                if not missing_rows.empty:
                    subset_df.loc[missing_rows.index, target] = rf.predict(missing_rows[features])
    # 5️⃣ Apply KNN Imputation for Remaining Missing Values, then write the
    # fully imputed targets back to the main dataframe
    knn_imputer = KNNImputer(n_neighbors=5)
    df.loc[df['naics'] == naics_code, targets] = knn_imputer.fit_transform(subset_df[targets])
# Verify missing values
print("Final Missing Values:\n", df.isnull().sum())
# %%
df.drop(columns=['naics'], inplace=True)
# %%
# Visualizing missing values
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull(), cmap="viridis", cbar=False, yticklabels=False)
plt.title("Missing Values in Dataset")
plt.show()
# %%
# Correlation Matrix
plt.figure(figsize=(15,10))
sns.heatmap(df.corr(numeric_only=True), annot=False, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Matrix of the Dataset")
plt.show()
# %%
def plot_feature_distributions(df, max_cols=3):
    """
    Plots histograms for all numeric columns in the dataframe.

    Parameters:
    df (pd.DataFrame): The dataframe containing features to plot.
    max_cols (int): Maximum number of columns per row in the subplot grid.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns  # Skip non-numeric columns such as Subcategory
    num_features = len(numeric_cols)
    num_rows = math.ceil(num_features / max_cols)  # Calculate required rows
    plt.figure(figsize=(max_cols * 5, num_rows * 4))  # Adjust figure size dynamically
    for i, feature in enumerate(numeric_cols, 1):
        plt.subplot(num_rows, max_cols, i)
        sns.histplot(df[feature].dropna(), bins=50, kde=True)
        plt.title(f"Distribution of {feature}")
    plt.tight_layout()
    plt.show()

plot_feature_distributions(df, max_cols=3)
# %%
df.skew(numeric_only=True)
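# %%
# Bucket columns by the same skewness cut-offs used in the transformation step
# below, so it is clear up front which transform each feature will receive
# (a sketch; the thresholds mirror transform_skewed_features).
abs_skew = df.skew(numeric_only=True).abs()
print("extreme (log1p, |skew| > 3):", list(abs_skew[abs_skew > 3].index))
print("moderate (cbrt, 1.5-3):     ", list(abs_skew[(abs_skew > 1.5) & (abs_skew <= 3)].index))
print("mild (sqrt, 0.5-1.5):       ", list(abs_skew[(abs_skew > 0.5) & (abs_skew <= 1.5)].index))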
# %%
def preprocess_nans(df, strategy='median'):
    """
    Handles NaN values before transformations.

    Parameters:
    - df (pd.DataFrame): Input dataframe.
    - strategy (str): Method to handle NaNs ('mean', 'median', 'zero').

    Returns:
    - df_filled (pd.DataFrame): Dataframe with NaNs handled.
    - nan_masks (dict): Boolean masks of the original NaN locations, used for restoration.
    """
    df_filled = df.copy()
    nan_masks = {}  # Store original NaN locations so they can be reintroduced later
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isna().sum() > 0:
            if strategy == 'median':
                fill_value = df[col].median()
            elif strategy == 'mean':
                fill_value = df[col].mean()
            else:  # 'zero'
                fill_value = 0
            nan_masks[col] = df[col].isna()
            df_filled[col] = df_filled[col].fillna(fill_value)
    return df_filled, nan_masks
def transform_skewed_features(df, exclude_columns=['naics', 'year', 'Subcategory']):
    """
    Transforms skewed features while excluding specified columns.
    Handles NaNs before transformation.

    Returns:
    - Transformed dataframe.
    - Dictionary of applied transformations.
    - Dictionary of boolean masks marking the original NaN locations.
    """
    df_cleaned, nan_masks = preprocess_nans(df)  # Handle NaNs first
    df_transformed = df_cleaned.copy()
    transformations = {}  # Store transformations for later inversion
    skewness = df_cleaned.skew(numeric_only=True)
    columns_to_transform = [col for col in skewness.index if col not in exclude_columns]
    for col in columns_to_transform:
        min_val = df_cleaned[col].min()
        if abs(skewness[col]) > 3:  # EXTREME skewness
            shift = abs(min_val) + 1 if min_val <= 0 else 0  # Ensure positivity before the log
            df_transformed[col] = np.log1p(df_transformed[col] + shift)
            transformations[col] = ('log1p_shift', shift)
        elif 1.5 < abs(skewness[col]) <= 3:  # MODERATE skewness
            df_transformed[col] = np.cbrt(df_transformed[col])
            transformations[col] = ('cbrt', None)
        elif 0.5 < abs(skewness[col]) <= 1.5:  # MILD skewness
            shift = abs(min_val) if min_val < 0 else 0
            df_transformed[col] = np.sqrt(df_transformed[col] + shift)
            transformations[col] = ('sqrt_shift', shift)
    return df_transformed, transformations, nan_masks
def restore_original_values(df_transformed, transformations, nan_masks):
    """
    Restores transformed values to their original scale and reintroduces NaNs.

    Parameters:
    - df_transformed (pd.DataFrame): Transformed dataframe.
    - transformations (dict): Mapping of column names to applied transformations.
    - nan_masks (dict): Boolean masks of the original NaN locations.

    Returns:
    - Restored dataframe.
    """
    df_original = df_transformed.copy()
    for col, transform in transformations.items():
        if transform[0] == 'log1p_shift':
            df_original[col] = np.expm1(df_original[col]) - transform[1]  # Undo shift
        elif transform[0] == 'cbrt':
            df_original[col] = df_original[col] ** 3  # Inverse of cube root
        elif transform[0] == 'sqrt_shift':
            df_original[col] = (df_original[col] ** 2) - transform[1]  # Undo shift
    for col, mask in nan_masks.items():
        df_original.loc[mask, col] = np.nan  # Restore NaNs exactly where they originally were
    return df_original
# %%
# Step 1: Transform Data (including handling NaNs)
df_transformed, transformations, nan_masks = transform_skewed_features(df)
print({col: int(mask.sum()) for col, mask in nan_masks.items()})  # NaN counts per column that will be restored
# Step 2: Restore to Original Values
df_original = restore_original_values(df_transformed, transformations, nan_masks)
# %%
# Ensure only numeric columns are compared
numeric_df = df.select_dtypes(include=[np.number]).fillna(0)
numeric_df_original = df_original.select_dtypes(include=[np.number]).fillna(0)
# Compare only numeric values
print(np.allclose(numeric_df, numeric_df_original, atol=1e-6)) # Should return True
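# %%
# If the round-trip check ever returns False, a per-column view of the largest
# absolute differences narrows down which transform failed to invert cleanly
# (an optional diagnostic sketch).
diffs = (numeric_df - numeric_df_original).abs().max()
print(diffs.sort_values(ascending=False).head(10))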
# %%
plot_feature_distributions(df_transformed, max_cols=3)
# %%
df_transformed.skew(numeric_only=True)
# %%
# Define expanded features for scatter plots
relevant_features = ["emp", "pay", "prode", "invest", "cap", "equip", "plant", "matcost", "vadd", "energy", "invent", "piship", "pimat", "piinv", "pien"]
# Set up scatter plot grid
plt.figure(figsize=(15, 15))
for i, feature in enumerate(relevant_features, 1):
    plt.subplot(5, 3, i)  # 5 rows, 3 columns of subplots
    sns.scatterplot(x=df_transformed[feature], y=df_transformed["vship"])
    plt.title(f"Turnover (vship) vs {feature}")
    plt.xlabel(feature)
    plt.ylabel("Turnover (vship)")
plt.tight_layout()
plt.show()
# %%
df.to_csv("../data/processed_data.csv", index=False)
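# %%
# Note: the line above persists the imputed but untransformed dataframe. If the
# skew-transformed features are needed downstream, they could be written
# separately (optional; the path below is illustrative, not from the original).
# df_transformed.to_csv("../data/processed_data_transformed.csv", index=False)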
# %%
df.Subcategory.value_counts()
# %%