Pulastya B committed on
Commit
f1ab2a8
·
1 Parent(s): 4d07a53

fix: Auto-optimize ydata profiling for memory constraints

Browse files

- Auto-sample large datasets (>10MB or >50k rows) to 50k rows
- Force minimal mode for files >5MB
- Disable memory-intensive features in minimal mode (samples, correlations, missing diagrams, duplicates, interactions)
- Prevents 512MB RAM crashes on Render free tier
- Earthquake dataset: 350k rows are sampled down to 50k for profiling
- Trade-off: faster reports and lower memory use, but the report describes a representative sample rather than the full dataset

Files changed (1) hide show
  1. src/tools/eda_reports.py +24 -2
src/tools/eda_reports.py CHANGED
@@ -48,17 +48,39 @@ def generate_ydata_profiling_report(
48
  else:
49
  raise ValueError(f"Unsupported file format: {file_path}")
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # Create output directory if needed
52
  os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)
53
 
54
  # Configure profile based on minimal flag
55
  if minimal:
56
- # Minimal mode: faster for large datasets
57
  profile = ProfileReport(
58
  df,
59
  title=title,
60
  minimal=True,
61
- explorative=False
 
 
 
 
 
62
  )
63
  else:
64
  # Full mode: comprehensive analysis
 
48
  else:
49
  raise ValueError(f"Unsupported file format: {file_path}")
50
 
51
+ # Auto-optimize for large datasets to prevent memory crashes
52
+ rows, cols = df.shape
53
+ file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
54
+
55
+ # Automatic sampling for large datasets (>10MB or >50k rows)
56
+ should_sample = file_size_mb > 10 or rows > 50000
57
+ if should_sample and not minimal:
58
+ print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
59
+ print(f"⚡ Sampling to 50,000 rows for memory efficiency...")
60
+ df = df.sample(n=min(50000, rows), random_state=42)
61
+ minimal = True # Force minimal mode for large files
62
+
63
+ # Force minimal mode for files >5MB even after sampling
64
+ if file_size_mb > 5:
65
+ minimal = True
66
+ print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
67
+
68
  # Create output directory if needed
69
  os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)
70
 
71
  # Configure profile based on minimal flag
72
  if minimal:
73
+ # Minimal mode: faster for large datasets, less memory
74
  profile = ProfileReport(
75
  df,
76
  title=title,
77
  minimal=True,
78
+ explorative=False,
79
+ samples=None, # Disable sample display to save memory
80
+ correlations=None, # Skip correlations in minimal mode
81
+ missing_diagrams=None, # Skip missing diagrams
82
+ duplicates=None, # Skip duplicate analysis
83
+ interactions=None # Skip interactions
84
  )
85
  else:
86
  # Full mode: comprehensive analysis