Spaces:
Running
Running
Pulastya B
committed on
Commit
·
f1ab2a8
1
Parent(s):
4d07a53
fix: Auto-optimize ydata profiling for memory constraints
Browse files
- Auto-sample large datasets (>10MB or >50k rows) to 50k rows
- Force minimal mode for files >5MB
- Disable memory-intensive features (correlations, interactions)
- Prevents 512MB RAM crashes on Render free tier
- Earthquake dataset: 350k rows sampled down to 50k for profiling
- Trade-off: Faster reports, less memory, representative sample
- src/tools/eda_reports.py +24 -2
src/tools/eda_reports.py
CHANGED
|
@@ -48,17 +48,39 @@ def generate_ydata_profiling_report(
|
|
| 48 |
else:
|
| 49 |
raise ValueError(f"Unsupported file format: {file_path}")
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# Create output directory if needed
|
| 52 |
os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)
|
| 53 |
|
| 54 |
# Configure profile based on minimal flag
|
| 55 |
if minimal:
|
| 56 |
-
# Minimal mode: faster for large datasets
|
| 57 |
profile = ProfileReport(
|
| 58 |
df,
|
| 59 |
title=title,
|
| 60 |
minimal=True,
|
| 61 |
-
explorative=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
)
|
| 63 |
else:
|
| 64 |
# Full mode: comprehensive analysis
|
|
|
|
| 48 |
else:
|
| 49 |
raise ValueError(f"Unsupported file format: {file_path}")
|
| 50 |
|
| 51 |
+
# Auto-optimize for large datasets to prevent memory crashes
|
| 52 |
+
rows, cols = df.shape
|
| 53 |
+
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
|
| 54 |
+
|
| 55 |
+
# Automatic sampling for large datasets (>10MB or >50k rows)
|
| 56 |
+
should_sample = file_size_mb > 10 or rows > 50000
|
| 57 |
+
if should_sample and not minimal:
|
| 58 |
+
print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
|
| 59 |
+
print(f"⚡ Sampling to 50,000 rows for memory efficiency...")
|
| 60 |
+
df = df.sample(n=min(50000, rows), random_state=42)
|
| 61 |
+
minimal = True # Force minimal mode for large files
|
| 62 |
+
|
| 63 |
+
# Force minimal mode for files >5MB even after sampling
|
| 64 |
+
if file_size_mb > 5:
|
| 65 |
+
minimal = True
|
| 66 |
+
print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
|
| 67 |
+
|
| 68 |
# Create output directory if needed
|
| 69 |
os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)
|
| 70 |
|
| 71 |
# Configure profile based on minimal flag
|
| 72 |
if minimal:
|
| 73 |
+
# Minimal mode: faster for large datasets, less memory
|
| 74 |
profile = ProfileReport(
|
| 75 |
df,
|
| 76 |
title=title,
|
| 77 |
minimal=True,
|
| 78 |
+
explorative=False,
|
| 79 |
+
samples=None, # Disable sample display to save memory
|
| 80 |
+
correlations=None, # Skip correlations in minimal mode
|
| 81 |
+
missing_diagrams=None, # Skip missing diagrams
|
| 82 |
+
duplicates=None, # Skip duplicate analysis
|
| 83 |
+
interactions=None # Skip interactions
|
| 84 |
)
|
| 85 |
else:
|
| 86 |
# Full mode: comprehensive analysis
|