Juan Salas committed
Commit 3632723 · 1 Parent(s): 6d03070

fix: resolve deployment crash with missing VDR directory


- Make config validation deployment-aware to handle missing data gracefully
- Add environment detection for Streamlit/HF deployment contexts
- Improve dataset download script with retry logic and better error handling
- Add force retry option for partial downloads
- Provide clear feedback about HF_TOKEN authentication requirements

Fixes 'Critical directory vdrs_dir does not exist' error in deployment environments
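
Because several of these fixes hinge on HF_TOKEN having read access to the private dataset repos, it can help to verify the token before deploying. A minimal smoke-test sketch (not part of this commit; the script name is hypothetical, the repo IDs come from the diffs below):

# check_hf_token.py - hypothetical pre-deployment smoke test, not in this commit
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
for repo_id in ("jmzlx/dd-framework", "jmzlx/dd-indexes", "jmzlx/dd-vdrs"):
    try:
        info = api.dataset_info(repo_id)  # raises if the token cannot read the repo
        print(f"✅ {repo_id}: accessible (revision {info.sha[:8]})")
    except Exception as e:
        print(f"❌ {repo_id}: {type(e).__name__}: {e}")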

Files changed (3)
  1. app/core/config.py +25 -2
  2. app/main.py +27 -4
  3. scripts/setup_datasets.py +50 -6
app/core/config.py CHANGED
@@ -121,16 +121,39 @@ class AppConfig:
 
     def _validate_paths(self) -> None:
         """Validate that critical directories exist."""
+        # Check if we're in a deployment environment where data might be downloaded at runtime
+        is_deployment = os.getenv('STREAMLIT_SERVER_HEADLESS') == 'true' or os.getenv('HF_HOME') == '/tmp/huggingface'
+
         critical_dirs = [
             ('data_dir', self.paths['data_dir']),
-            ('vdrs_dir', self.paths['vdrs_dir'])
         ]
+
+        # Only validate VDR directory in non-deployment environments
+        # In deployment, VDR data may be downloaded at runtime after config initialization
+        if not is_deployment:
+            critical_dirs.append(('vdrs_dir', self.paths['vdrs_dir']))
 
         for dir_name, dir_path in critical_dirs:
             if not dir_path.exists():
-                raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
+                if is_deployment and dir_name == 'data_dir':
+                    # In deployment, create the data directory if it doesn't exist
+                    dir_path.mkdir(parents=True, exist_ok=True)
+                    print(f"⚠️ Created missing data directory for deployment: {dir_path}")
+                else:
+                    raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
             if not dir_path.is_dir():
                 raise ValueError(f"Path '{dir_name}' exists but is not a directory: {dir_path}")
+
+        # For deployment environments, validate VDR directory more gently
+        if is_deployment:
+            vdr_dir = self.paths['vdrs_dir']
+            if not vdr_dir.exists():
+                print(f"⚠️ VDR directory does not exist in deployment, will be created at runtime: {vdr_dir}")
+                vdr_dir.mkdir(parents=True, exist_ok=True)
+            elif vdr_dir.exists() and not any(vdr_dir.iterdir()):
+                print(f"⚠️ VDR directory exists but is empty, data will be downloaded at runtime: {vdr_dir}")
+            else:
+                print(f"✅ VDR directory exists with data: {vdr_dir}")
 
     def _validate_models(self) -> None:
         """Validate that required models are available or can be downloaded."""
app/main.py CHANGED
@@ -30,16 +30,39 @@ os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
 # Ensure datasets are downloaded before starting the app
 try:
     from pathlib import Path
-    if not Path("data").exists() or len(list(Path("data").rglob("*"))) < 10:
+    data_path = Path("data")
+    file_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
+
+    print(f"🔍 Checking data directory: {data_path.absolute()}")
+    print(f"📊 Current file count: {file_count}")
+
+    if not data_path.exists() or file_count < 10:
         print("📦 Downloading datasets from HuggingFace repos...")
+        print(f"   Working directory: {Path.cwd()}")
+
         import sys
         sys.path.append(str(Path(__file__).parent.parent))
         from scripts.setup_datasets import download_datasets
-        download_datasets()
-        print("✅ Datasets downloaded successfully")
+        # Force retry if we have very few files, indicating previous download may have failed
+        force_retry = file_count < 10 and data_path.exists()
+        download_datasets(data_path, force_retry=force_retry)
+
+        # Check what we actually got
+        final_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
+        print(f"📈 Final file count: {final_count}")
+
+        if final_count > 10:
+            print("✅ Datasets downloaded successfully")
+        else:
+            print("⚠️ Dataset download may have failed - very few files found")
+    else:
+        print(f"✅ Data directory exists with {file_count} files")
+
 except Exception as e:
-    print(f"⚠️ Dataset download failed: {e}")
+    print(f"❌ Dataset download failed with error: {type(e).__name__}: {e}")
     print("   App will continue but some features may not work properly")
+    import traceback
+    traceback.print_exc()
 
 # Initialize for Streamlit Cloud deployment (must be done before other imports)
 try:
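
One caveat worth noting (an observation about the code, not a change in this commit): Path.rglob("*") yields directories as well as regular files, so main.py's file_count is really an entry count, while setup_datasets.py counts only files. A sketch of the stricter variant:

from pathlib import Path

data_path = Path("data")
# rglob("*") yields directories too; filtering with is_file() matches
# how setup_datasets.py measures completeness
entry_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
file_count = sum(1 for f in data_path.rglob("*") if f.is_file()) if data_path.exists() else 0
print(f"entries: {entry_count}, regular files: {file_count}")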
scripts/setup_datasets.py CHANGED
@@ -8,14 +8,27 @@ from pathlib import Path
 from huggingface_hub import snapshot_download
 import shutil
 
-def download_datasets(data_dir: Path = Path("data")):
-    """Download all datasets from Hugging Face to local data directory"""
+def download_datasets(data_dir: Path = Path("data"), force_retry: bool = False):
+    """Download all datasets from Hugging Face to local data directory
+
+    Args:
+        data_dir: Directory to download data to
+        force_retry: Force retry download even if data exists
+    """
 
     print("🚀 Setting up AI Due Diligence datasets from Hugging Face...")
 
     # Ensure data directory exists
     data_dir.mkdir(exist_ok=True)
 
+    # Check if we already have data and skip unless forced
+    if not force_retry:
+        file_count = sum(1 for f in data_dir.rglob("*") if f.is_file())
+        if file_count > 100:  # Reasonable threshold for "has data"
+            print(f"✅ Data directory already contains {file_count} files, skipping download")
+            print(f"   Use --force to re-download anyway")
+            return
+
     datasets = [
         {
             "repo_id": "jmzlx/dd-framework",
@@ -42,18 +55,37 @@ def download_datasets(data_dir: Path = Path("data")):
         print(f"   Repository: {dataset['repo_id']}")
 
         try:
+            # Use HF_TOKEN if available for private repos
+            token = os.getenv("HF_TOKEN")
+            if token:
+                print(f"   🔑 Using HuggingFace token for authentication")
+            else:
+                print(f"   ⚠️ No HF_TOKEN found - may fail for private repositories")
+
             # Download dataset
             snapshot_download(
                 repo_id=dataset["repo_id"],
                 repo_type="dataset",
                 local_dir=dataset["local_path"],
-                allow_patterns="data/**"  # Only download data directory
+                allow_patterns="data/**",  # Only download data directory
+                token=token  # Pass token if available
             )
             print(f"   ✅ Downloaded successfully")
 
         except Exception as e:
-            print(f"   ❌ Error downloading {dataset['repo_id']}: {e}")
-            print(f"   💡 You can manually download from: https://huggingface.co/datasets/{dataset['repo_id']}")
+            print(f"   ❌ Error downloading {dataset['repo_id']}: {type(e).__name__}: {e}")
+            if "401" in str(e) or "403" in str(e) or "private" in str(e).lower():
+                print(f"   🔒 This appears to be a private repository requiring authentication")
+                if not token:
+                    print(f"   💡 Set HF_TOKEN environment variable with read access to this repository")
+                else:
+                    print(f"   💡 Check that your HF_TOKEN has access to this repository")
+            elif "network" in str(e).lower() or "connection" in str(e).lower():
+                print(f"   🌐 Network connectivity issue - check internet connection")
+            print(f"   💡 Manual download: https://huggingface.co/datasets/{dataset['repo_id']}")
+
+            # Continue with other datasets even if one fails
+            continue
 
     print(f"\n🎉 Dataset setup complete! Data available in: {data_dir.absolute()}")
 
@@ -62,6 +94,17 @@ def download_datasets(data_dir: Path = Path("data")):
     file_count = sum(1 for f in data_dir.rglob("*") if f.is_file())
 
     print(f"📊 Downloaded {file_count:,} files, {total_size/(1024*1024):.1f}MB total")
+
+    # Check if we're in a deployment environment and provide guidance
+    is_deployment = os.getenv('STREAMLIT_SERVER_HEADLESS') == 'true' or os.getenv('HF_HOME') == '/tmp/huggingface'
+    if is_deployment:
+        print("\n🚀 Deployment Environment Detected")
+        if file_count < 50:
+            print("⚠️ Few files downloaded - this may indicate missing authentication")
+            print("💡 Ensure HF_TOKEN is set in your deployment environment secrets")
+            print("💡 Token should have read access to private repositories: jmzlx/dd-framework, jmzlx/dd-indexes, jmzlx/dd-vdrs")
+        else:
+            print("✅ Data download appears successful for deployment environment")
 
 def clean_old_data(data_dir: Path = Path("data")):
     """Remove old data directory (use with caution!)"""
@@ -76,6 +119,7 @@ def main():
 
     parser = argparse.ArgumentParser(description="Setup datasets from Hugging Face")
     parser.add_argument("--clean", action="store_true", help="Remove existing data directory first")
+    parser.add_argument("--force", action="store_true", help="Force re-download even if data exists")
     parser.add_argument("--data-dir", default="data", help="Data directory path")
 
     args = parser.parse_args()
@@ -85,7 +129,7 @@ def main():
     if args.clean:
        clean_old_data(data_dir)
 
-    download_datasets(data_dir)
+    download_datasets(data_dir, force_retry=args.force)
 
 if __name__ == "__main__":
     main()
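
For reference, the updated entry point can be driven from the CLI (for example, python scripts/setup_datasets.py --clean --force --data-dir data) or programmatically. A sketch of the latter, assuming the script is importable from the project root, as main.py does via sys.path.append above:

from pathlib import Path
from scripts.setup_datasets import download_datasets

# Skips the download if the directory already holds >100 files
download_datasets(Path("data"))
# Equivalent to passing --force on the command line
download_datasets(Path("data"), force_retry=True)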