Pulastya B committed on
Commit
bdf714a
·
1 Parent(s): e873289

fix: Configure high-quality ydata profiling for HuggingFace 16GB RAM

Browse files

- Made sampling thresholds configurable via env vars
- HuggingFace: 200k rows, 100MB files (minimal sampling)
- Render: 50k rows, 10MB files (aggressive sampling)
- Earthquake dataset will now use 150k sample instead of 50k
- Better quality reports on high-memory environments

Files changed (3) hide show
  1. Dockerfile +6 -0
  2. Dockerfile.render +6 -0
  3. src/tools/eda_reports.py +12 -6
Dockerfile CHANGED
@@ -86,6 +86,12 @@ ENV OUTPUT_DIR=/home/user/app/outputs
86
  ENV CACHE_DB_PATH=/home/user/app/cache_db/cache.db
87
  ENV ARTIFACT_BACKEND=local
88
 
 
 
 
 
 
 
89
  # HuggingFace Spaces uses port 7860 by default
90
  EXPOSE 7860
91
 
 
86
  ENV CACHE_DB_PATH=/home/user/app/cache_db/cache.db
87
  ENV ARTIFACT_BACKEND=local
88
 
89
+ # YData Profiling optimization for 16GB RAM (HuggingFace Spaces)
90
+ # Higher thresholds = better quality reports without sampling
91
+ ENV YDATA_MAX_ROWS=200000
92
+ ENV YDATA_MAX_SIZE_MB=100
93
+ ENV YDATA_SAMPLE_SIZE=150000
94
+
95
  # HuggingFace Spaces uses port 7860 by default
96
  EXPOSE 7860
97
 
Dockerfile.render CHANGED
@@ -78,6 +78,12 @@ ENV OUTPUT_DIR=/tmp/outputs
78
  ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
79
  ENV ARTIFACT_BACKEND=local
80
 
 
 
 
 
 
 
81
  EXPOSE 8080
82
 
83
  # Start FastAPI
 
78
  ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
79
  ENV ARTIFACT_BACKEND=local
80
 
81
+ # YData Profiling optimization for 512MB RAM (Render Free Tier)
82
+ # Lower thresholds = aggressive sampling to prevent crashes
83
+ ENV YDATA_MAX_ROWS=50000
84
+ ENV YDATA_MAX_SIZE_MB=10
85
+ ENV YDATA_SAMPLE_SIZE=50000
86
+
87
  EXPOSE 8080
88
 
89
  # Start FastAPI
src/tools/eda_reports.py CHANGED
@@ -52,16 +52,22 @@ def generate_ydata_profiling_report(
52
  rows, cols = df.shape
53
  file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
54
 
55
- # Automatic sampling for large datasets (>10MB or >50k rows)
56
- should_sample = file_size_mb > 10 or rows > 50000
 
 
 
 
 
57
  if should_sample and not minimal:
 
58
  print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
59
- print(f"⚡ Sampling to 50,000 rows for memory efficiency...")
60
- df = df.sample(n=min(50000, rows), random_state=42)
61
  minimal = True # Force minimal mode for large files
62
 
63
- # Force minimal mode for files >5MB even after sampling
64
- if file_size_mb > 5:
65
  minimal = True
66
  print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
67
 
 
52
  rows, cols = df.shape
53
  file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
54
 
55
+ # Check environment: HuggingFace has 16GB, Render has 512MB
56
+ # Allow larger datasets on high-memory environments
57
+ max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000")) # Fallback: 100k; Dockerfiles override (HF: 200k, Render: 50k)
58
+ max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50")) # Default: 50MB
59
+
60
+ # Automatic sampling only when dataset exceeds thresholds
61
+ should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
62
  if should_sample and not minimal:
63
+ sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
64
  print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
65
+ print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
66
+ df = df.sample(n=min(sample_size, rows), random_state=42)
67
  minimal = True # Force minimal mode for large files
68
 
69
+ # Force minimal mode for very large files even after sampling
70
+ if file_size_mb > max_size_threshold * 2:
71
  minimal = True
72
  print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
73