Spaces:
Running
Running
Pulastya B committed on
Commit ·
bdf714a
1
Parent(s): e873289
fix: Configure high-quality ydata profiling for HuggingFace 16GB RAM
Browse files
- Made sampling thresholds configurable via env vars
- HuggingFace: 200k rows, 100MB files (minimal sampling)
- Render: 50k rows, 10MB files (aggressive sampling)
- Earthquake dataset will now use 150k sample instead of 50k
- Better quality reports on high-memory environments
- Dockerfile +6 -0
- Dockerfile.render +6 -0
- src/tools/eda_reports.py +12 -6
Dockerfile
CHANGED
|
@@ -86,6 +86,12 @@ ENV OUTPUT_DIR=/home/user/app/outputs
|
|
| 86 |
ENV CACHE_DB_PATH=/home/user/app/cache_db/cache.db
|
| 87 |
ENV ARTIFACT_BACKEND=local
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
# HuggingFace Spaces uses port 7860 by default
|
| 90 |
EXPOSE 7860
|
| 91 |
|
|
|
|
| 86 |
ENV CACHE_DB_PATH=/home/user/app/cache_db/cache.db
|
| 87 |
ENV ARTIFACT_BACKEND=local
|
| 88 |
|
| 89 |
+
# YData Profiling optimization for 16GB RAM (HuggingFace Spaces)
|
| 90 |
+
# Higher thresholds = better quality reports without sampling
|
| 91 |
+
ENV YDATA_MAX_ROWS=200000
|
| 92 |
+
ENV YDATA_MAX_SIZE_MB=100
|
| 93 |
+
ENV YDATA_SAMPLE_SIZE=150000
|
| 94 |
+
|
| 95 |
# HuggingFace Spaces uses port 7860 by default
|
| 96 |
EXPOSE 7860
|
| 97 |
|
Dockerfile.render
CHANGED
|
@@ -78,6 +78,12 @@ ENV OUTPUT_DIR=/tmp/outputs
|
|
| 78 |
ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
|
| 79 |
ENV ARTIFACT_BACKEND=local
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
EXPOSE 8080
|
| 82 |
|
| 83 |
# Start FastAPI
|
|
|
|
| 78 |
ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
|
| 79 |
ENV ARTIFACT_BACKEND=local
|
| 80 |
|
| 81 |
+
# YData Profiling optimization for 512MB RAM (Render Free Tier)
|
| 82 |
+
# Lower thresholds = aggressive sampling to prevent crashes
|
| 83 |
+
ENV YDATA_MAX_ROWS=50000
|
| 84 |
+
ENV YDATA_MAX_SIZE_MB=10
|
| 85 |
+
ENV YDATA_SAMPLE_SIZE=50000
|
| 86 |
+
|
| 87 |
EXPOSE 8080
|
| 88 |
|
| 89 |
# Start FastAPI
|
src/tools/eda_reports.py
CHANGED
|
@@ -52,16 +52,22 @@ def generate_ydata_profiling_report(
|
|
| 52 |
rows, cols = df.shape
|
| 53 |
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
|
| 54 |
|
| 55 |
-
#
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
if should_sample and not minimal:
|
|
|
|
| 58 |
print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
|
| 59 |
-
print(f"⚡ Sampling to
|
| 60 |
-
df = df.sample(n=min(
|
| 61 |
minimal = True # Force minimal mode for large files
|
| 62 |
|
| 63 |
-
# Force minimal mode for
|
| 64 |
-
if file_size_mb >
|
| 65 |
minimal = True
|
| 66 |
print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
|
| 67 |
|
|
|
|
| 52 |
rows, cols = df.shape
|
| 53 |
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
|
| 54 |
|
| 55 |
+
# Check environment: HuggingFace has 16GB, Render has 512MB
|
| 56 |
+
# Allow larger datasets on high-memory environments
|
| 57 |
+
max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000")) # Default: 100k (HF), or set to 50000 for low-mem
|
| 58 |
+
max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50")) # Default: 50MB
|
| 59 |
+
|
| 60 |
+
# Automatic sampling only when dataset exceeds thresholds
|
| 61 |
+
should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
|
| 62 |
if should_sample and not minimal:
|
| 63 |
+
sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
|
| 64 |
print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
|
| 65 |
+
print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
|
| 66 |
+
df = df.sample(n=min(sample_size, rows), random_state=42)
|
| 67 |
minimal = True # Force minimal mode for large files
|
| 68 |
|
| 69 |
+
# Force minimal mode for very large files even after sampling
|
| 70 |
+
if file_size_mb > max_size_threshold * 2:
|
| 71 |
minimal = True
|
| 72 |
print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
|
| 73 |
|