Juan Salas committed on
Commit · 3632723
1 Parent(s): 6d03070

fix: resolve deployment crash with missing VDR directory

- Make config validation deployment-aware to handle missing data gracefully
- Add environment detection for Streamlit/HF deployment contexts
- Improve dataset download script with retry logic and better error handling
- Add force retry option for partial downloads
- Provide clear feedback about HF_TOKEN authentication requirements

Fixes 'Critical directory vdrs_dir does not exist' error in deployment environments

- app/core/config.py +25 -2
- app/main.py +27 -4
- scripts/setup_datasets.py +50 -6
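Both the config validation and the download script changed below key off the same environment markers, so a quick pre-flight check in the deployment container confirms that detection and authentication are in place before the app starts. A minimal sketch (the variable names are the ones used in this commit; only the presence of HF_TOKEN is reported, never its value):

    # Pre-flight check: prints the markers this commit relies on.
    import os

    print("STREAMLIT_SERVER_HEADLESS:", os.getenv("STREAMLIT_SERVER_HEADLESS"))
    print("HF_HOME:", os.getenv("HF_HOME"))
    print("HF_TOKEN present:", bool(os.getenv("HF_TOKEN")))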
app/core/config.py
CHANGED

@@ -121,16 +121,39 @@ class AppConfig:
 
     def _validate_paths(self) -> None:
         """Validate that critical directories exist."""
+        # Check if we're in a deployment environment where data might be downloaded at runtime
+        is_deployment = os.getenv('STREAMLIT_SERVER_HEADLESS') == 'true' or os.getenv('HF_HOME') == '/tmp/huggingface'
+
         critical_dirs = [
             ('data_dir', self.paths['data_dir']),
-            ('vdrs_dir', self.paths['vdrs_dir'])
         ]
+
+        # Only validate VDR directory in non-deployment environments
+        # In deployment, VDR data may be downloaded at runtime after config initialization
+        if not is_deployment:
+            critical_dirs.append(('vdrs_dir', self.paths['vdrs_dir']))
 
         for dir_name, dir_path in critical_dirs:
             if not dir_path.exists():
-                raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
+                if is_deployment and dir_name == 'data_dir':
+                    # In deployment, create the data directory if it doesn't exist
+                    dir_path.mkdir(parents=True, exist_ok=True)
+                    print(f"⚠️ Created missing data directory for deployment: {dir_path}")
+                else:
+                    raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
             if not dir_path.is_dir():
                 raise ValueError(f"Path '{dir_name}' exists but is not a directory: {dir_path}")
+
+        # For deployment environments, validate VDR directory more gently
+        if is_deployment:
+            vdr_dir = self.paths['vdrs_dir']
+            if not vdr_dir.exists():
+                print(f"⚠️ VDR directory does not exist in deployment, will be created at runtime: {vdr_dir}")
+                vdr_dir.mkdir(parents=True, exist_ok=True)
+            elif vdr_dir.exists() and not any(vdr_dir.iterdir()):
+                print(f"⚠️ VDR directory exists but is empty, data will be downloaded at runtime: {vdr_dir}")
+            else:
+                print(f"✅ VDR directory exists with data: {vdr_dir}")
 
     def _validate_models(self) -> None:
         """Validate that required models are available or can be downloaded."""
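The new branch can be checked in isolation without constructing the full AppConfig; a minimal sketch, assuming the env-var detection above is used verbatim (data/vdrs is a stand-in here for self.paths['vdrs_dir'], which comes from the app's path configuration):

    # Illustrative only, not part of the repository: mirrors the deployment-aware
    # branch added to _validate_paths above.
    import os
    from pathlib import Path

    def is_deployment_env() -> bool:
        # Same markers the commit checks for Streamlit/HF deployments
        return (
            os.getenv("STREAMLIT_SERVER_HEADLESS") == "true"
            or os.getenv("HF_HOME") == "/tmp/huggingface"
        )

    vdrs_dir = Path("data/vdrs")  # stand-in for self.paths['vdrs_dir']
    if is_deployment_env():
        # Deployment: create the directory now and let the runtime download fill it
        vdrs_dir.mkdir(parents=True, exist_ok=True)
    elif not vdrs_dir.is_dir():
        # Local/dev: keep the strict fail-fast behaviour
        raise ValueError(f"Critical directory 'vdrs_dir' does not exist: {vdrs_dir}")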
app/main.py
CHANGED

@@ -30,16 +30,39 @@ os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
 # Ensure datasets are downloaded before starting the app
 try:
     from pathlib import Path
-
+    data_path = Path("data")
+    file_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
+
+    print(f"📁 Checking data directory: {data_path.absolute()}")
+    print(f"📊 Current file count: {file_count}")
+
+    if not data_path.exists() or file_count < 10:
         print("📦 Downloading datasets from HuggingFace repos...")
+        print(f"   Working directory: {Path.cwd()}")
+
         import sys
         sys.path.append(str(Path(__file__).parent.parent))
        from scripts.setup_datasets import download_datasets
-
-
+        # Force retry if we have very few files, indicating previous download may have failed
+        force_retry = file_count < 10 and data_path.exists()
+        download_datasets(data_path, force_retry=force_retry)
+
+        # Check what we actually got
+        final_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
+        print(f"📊 Final file count: {final_count}")
+
+        if final_count > 10:
+            print("✅ Datasets downloaded successfully")
+        else:
+            print("⚠️ Dataset download may have failed - very few files found")
+    else:
+        print(f"✅ Data directory exists with {file_count} files")
+
 except Exception as e:
-    print(f"
+    print(f"❌ Dataset download failed with error: {type(e).__name__}: {e}")
     print(" App will continue but some features may not work properly")
+    import traceback
+    traceback.print_exc()
 
 # Initialize for Streamlit Cloud deployment (must be done before other imports)
 try:
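The startup gate above is only a file-count heuristic; the thresholds in the snippet below are copied from the diff, and running it from the repository root shows which path the app would take on the next start (inspection sketch only):

    # Sketch of the startup heuristic added in app/main.py (thresholds from the diff).
    from pathlib import Path

    data_path = Path("data")
    file_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0

    needs_download = not data_path.exists() or file_count < 10
    # An existing directory with almost no files points to a failed earlier download
    force_retry = data_path.exists() and file_count < 10

    print(f"files={file_count} needs_download={needs_download} force_retry={force_retry}")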
scripts/setup_datasets.py
CHANGED

@@ -8,14 +8,27 @@ from pathlib import Path
 from huggingface_hub import snapshot_download
 import shutil
 
-def download_datasets(data_dir: Path = Path("data")):
-    """Download all datasets from Hugging Face to local data directory"""
+def download_datasets(data_dir: Path = Path("data"), force_retry: bool = False):
+    """Download all datasets from Hugging Face to local data directory
+
+    Args:
+        data_dir: Directory to download data to
+        force_retry: Force retry download even if data exists
+    """
 
     print("🚀 Setting up AI Due Diligence datasets from Hugging Face...")
 
     # Ensure data directory exists
     data_dir.mkdir(exist_ok=True)
 
+    # Check if we already have data and skip unless forced
+    if not force_retry:
+        file_count = sum(1 for f in data_dir.rglob("*") if f.is_file())
+        if file_count > 100:  # Reasonable threshold for "has data"
+            print(f"✅ Data directory already contains {file_count} files, skipping download")
+            print(f"   Use --force to re-download anyway")
+            return
+
     datasets = [
         {
             "repo_id": "jmzlx/dd-framework",
@@ -42,18 +55,37 @@ def download_datasets(data_dir: Path = Path("data")):
         print(f"   Repository: {dataset['repo_id']}")
 
         try:
+            # Use HF_TOKEN if available for private repos
+            token = os.getenv("HF_TOKEN")
+            if token:
+                print(f"   🔑 Using HuggingFace token for authentication")
+            else:
+                print(f"   ⚠️ No HF_TOKEN found - may fail for private repositories")
+
             # Download dataset
             snapshot_download(
                 repo_id=dataset["repo_id"],
                 repo_type="dataset",
                 local_dir=dataset["local_path"],
-                allow_patterns="data/**"  # Only download data directory
+                allow_patterns="data/**",  # Only download data directory
+                token=token  # Pass token if available
             )
             print(f"   ✅ Downloaded successfully")
 
         except Exception as e:
-            print(f"   ❌ Error downloading {dataset['repo_id']}: {e}")
-
+            print(f"   ❌ Error downloading {dataset['repo_id']}: {type(e).__name__}: {e}")
+            if "401" in str(e) or "403" in str(e) or "private" in str(e).lower():
+                print(f"   🔒 This appears to be a private repository requiring authentication")
+                if not token:
+                    print(f"   💡 Set HF_TOKEN environment variable with read access to this repository")
+                else:
+                    print(f"   💡 Check that your HF_TOKEN has access to this repository")
+            elif "network" in str(e).lower() or "connection" in str(e).lower():
+                print(f"   🌐 Network connectivity issue - check internet connection")
+            print(f"   💡 Manual download: https://huggingface.co/datasets/{dataset['repo_id']}")
+
+            # Continue with other datasets even if one fails
+            continue
 
     print(f"\n🎉 Dataset setup complete! Data available in: {data_dir.absolute()}")
 
@@ -62,6 +94,17 @@ def download_datasets(data_dir: Path = Path("data")):
     file_count = sum(1 for f in data_dir.rglob("*") if f.is_file())
 
     print(f"📊 Downloaded {file_count:,} files, {total_size/(1024*1024):.1f}MB total")
+
+    # Check if we're in a deployment environment and provide guidance
+    is_deployment = os.getenv('STREAMLIT_SERVER_HEADLESS') == 'true' or os.getenv('HF_HOME') == '/tmp/huggingface'
+    if is_deployment:
+        print("\n🚀 Deployment Environment Detected")
+        if file_count < 50:
+            print("⚠️ Few files downloaded - this may indicate missing authentication")
+            print("💡 Ensure HF_TOKEN is set in your deployment environment secrets")
+            print("💡 Token should have read access to private repositories: jmzlx/dd-framework, jmzlx/dd-indexes, jmzlx/dd-vdrs")
+        else:
+            print("✅ Data download appears successful for deployment environment")
 
 def clean_old_data(data_dir: Path = Path("data")):
     """Remove old data directory (use with caution!)"""
@@ -76,6 +119,7 @@ def main():
 
     parser = argparse.ArgumentParser(description="Setup datasets from Hugging Face")
     parser.add_argument("--clean", action="store_true", help="Remove existing data directory first")
+    parser.add_argument("--force", action="store_true", help="Force re-download even if data exists")
     parser.add_argument("--data-dir", default="data", help="Data directory path")
 
     args = parser.parse_args()
@@ -85,7 +129,7 @@ def main():
     if args.clean:
         clean_old_data(data_dir)
 
-    download_datasets(data_dir)
+    download_datasets(data_dir, force_retry=args.force)
 
 if __name__ == "__main__":
     main()
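With the flag wired through main(), a partial download can be retried from the command line (python scripts/setup_datasets.py --force, optionally with --clean and --data-dir) or programmatically. A small usage sketch, assuming it is run from the repository root with HF_TOKEN already exported:

    # Usage sketch: the same call app/main.py now makes when it detects a partial download.
    from pathlib import Path
    from scripts.setup_datasets import download_datasets

    # Re-download even if a previous, possibly partial, attempt left files behind
    download_datasets(Path("data"), force_retry=True)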