Pulastya B committed on
Commit
bdf714a
·
1 Parent(s): e873289

fix: Configure high-quality ydata profiling for HuggingFace 16GB RAM

Browse files

- Made sampling thresholds configurable via env vars
- HuggingFace: 200k rows, 100MB files (minimal sampling)
- Render: 50k rows, 10MB files (aggressive sampling)
- Earthquake dataset will now use 150k sample instead of 50k
- Better quality reports on high-memory environments

Files changed (3) hide show
  1. Dockerfile +6 -0
  2. Dockerfile.render +6 -0
  3. src/tools/eda_reports.py +12 -6
Dockerfile CHANGED
@@ -86,6 +86,12 @@ ENV OUTPUT_DIR=/home/user/app/outputs
86
  ENV CACHE_DB_PATH=/home/user/app/cache_db/cache.db
87
  ENV ARTIFACT_BACKEND=local
88
 
 
 
 
 
 
 
89
  # HuggingFace Spaces uses port 7860 by default
90
  EXPOSE 7860
91
 
 
86
  ENV CACHE_DB_PATH=/home/user/app/cache_db/cache.db
87
  ENV ARTIFACT_BACKEND=local
88
 
89
+ # YData Profiling optimization for 16GB RAM (HuggingFace Spaces)
90
+ # Higher thresholds = better quality reports without sampling
91
+ ENV YDATA_MAX_ROWS=200000
92
+ ENV YDATA_MAX_SIZE_MB=100
93
+ ENV YDATA_SAMPLE_SIZE=150000
94
+
95
  # HuggingFace Spaces uses port 7860 by default
96
  EXPOSE 7860
97
 
Dockerfile.render CHANGED
@@ -78,6 +78,12 @@ ENV OUTPUT_DIR=/tmp/outputs
78
  ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
79
  ENV ARTIFACT_BACKEND=local
80
 
 
 
 
 
 
 
81
  EXPOSE 8080
82
 
83
  # Start FastAPI
 
78
  ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
79
  ENV ARTIFACT_BACKEND=local
80
 
81
+ # YData Profiling optimization for 512MB RAM (Render Free Tier)
82
+ # Lower thresholds = aggressive sampling to prevent crashes
83
+ ENV YDATA_MAX_ROWS=50000
84
+ ENV YDATA_MAX_SIZE_MB=10
85
+ ENV YDATA_SAMPLE_SIZE=50000
86
+
87
  EXPOSE 8080
88
 
89
  # Start FastAPI
src/tools/eda_reports.py CHANGED
@@ -52,16 +52,22 @@ def generate_ydata_profiling_report(
52
  rows, cols = df.shape
53
  file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
54
 
55
- # Automatic sampling for large datasets (>10MB or >50k rows)
56
- should_sample = file_size_mb > 10 or rows > 50000
 
 
 
 
 
57
  if should_sample and not minimal:
 
58
  print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
59
- print(f"⚡ Sampling to 50,000 rows for memory efficiency...")
60
- df = df.sample(n=min(50000, rows), random_state=42)
61
  minimal = True # Force minimal mode for large files
62
 
63
- # Force minimal mode for files >5MB even after sampling
64
- if file_size_mb > 5:
65
  minimal = True
66
  print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
67
 
 
52
  rows, cols = df.shape
53
  file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
54
 
55
+ # Check environment: HuggingFace has 16GB, Render has 512MB
56
+ # Allow larger datasets on high-memory environments
57
+ max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000")) # Fallback: 100k; Dockerfiles override (HF: 200k, Render: 50k)
58
+ max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50")) # Default: 50MB
59
+
60
+ # Automatic sampling only when dataset exceeds thresholds
61
+ should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
62
  if should_sample and not minimal:
63
+ sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
64
  print(f"📊 Large dataset detected: {rows:,} rows, {file_size_mb:.1f}MB")
65
+ print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
66
+ df = df.sample(n=min(sample_size, rows), random_state=42)
67
  minimal = True # Force minimal mode for large files
68
 
69
+ # Force minimal mode for very large files even after sampling
70
+ if file_size_mb > max_size_threshold * 2:
71
  minimal = True
72
  print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")
73