Juan Salas committed
Commit 3632723 · 1 Parent(s): 6d03070

fix: resolve deployment crash with missing VDR directory


- Make config validation deployment-aware to handle missing data gracefully
- Add environment detection for Streamlit/HF deployment contexts
- Improve dataset download script with retry logic and better error handling
- Add force retry option for partial downloads
- Provide clear feedback about HF_TOKEN authentication requirements

Fixes 'Critical directory vdrs_dir does not exist' error in deployment environments
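
Because several of these fixes hinge on HF_TOKEN having read access to the private dataset repos, it can help to verify the token before deploying. A minimal smoke-test sketch (not part of this commit; the script name is hypothetical, the repo IDs come from the diffs below):

# check_hf_token.py - hypothetical pre-deployment smoke test, not in this commit
import os
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HF_TOKEN"))
for repo_id in ("jmzlx/dd-framework", "jmzlx/dd-indexes", "jmzlx/dd-vdrs"):
    try:
        info = api.dataset_info(repo_id)  # raises if the token cannot read the repo
        print(f"✅ {repo_id}: accessible (revision {info.sha[:8]})")
    except Exception as e:
        print(f"❌ {repo_id}: {type(e).__name__}: {e}")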

Files changed (3)
  1. app/core/config.py +25 -2
  2. app/main.py +27 -4
  3. scripts/setup_datasets.py +50 -6
app/core/config.py CHANGED
@@ -121,16 +121,39 @@ class AppConfig:
 
     def _validate_paths(self) -> None:
         """Validate that critical directories exist."""
+        # Check if we're in a deployment environment where data might be downloaded at runtime
+        is_deployment = os.getenv('STREAMLIT_SERVER_HEADLESS') == 'true' or os.getenv('HF_HOME') == '/tmp/huggingface'
+
         critical_dirs = [
             ('data_dir', self.paths['data_dir']),
-            ('vdrs_dir', self.paths['vdrs_dir'])
         ]
+
+        # Only validate VDR directory in non-deployment environments
+        # In deployment, VDR data may be downloaded at runtime after config initialization
+        if not is_deployment:
+            critical_dirs.append(('vdrs_dir', self.paths['vdrs_dir']))
 
         for dir_name, dir_path in critical_dirs:
             if not dir_path.exists():
-                raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
+                if is_deployment and dir_name == 'data_dir':
+                    # In deployment, create the data directory if it doesn't exist
+                    dir_path.mkdir(parents=True, exist_ok=True)
+                    print(f"⚠️ Created missing data directory for deployment: {dir_path}")
+                else:
+                    raise ValueError(f"Critical directory '{dir_name}' does not exist: {dir_path}")
             if not dir_path.is_dir():
                 raise ValueError(f"Path '{dir_name}' exists but is not a directory: {dir_path}")
+
+        # For deployment environments, validate VDR directory more gently
+        if is_deployment:
+            vdr_dir = self.paths['vdrs_dir']
+            if not vdr_dir.exists():
+                print(f"⚠️ VDR directory does not exist in deployment, will be created at runtime: {vdr_dir}")
+                vdr_dir.mkdir(parents=True, exist_ok=True)
+            elif vdr_dir.exists() and not any(vdr_dir.iterdir()):
+                print(f"⚠️ VDR directory exists but is empty, data will be downloaded at runtime: {vdr_dir}")
+            else:
+                print(f"✅ VDR directory exists with data: {vdr_dir}")
 
     def _validate_models(self) -> None:
         """Validate that required models are available or can be downloaded."""
app/main.py CHANGED
@@ -30,16 +30,39 @@ os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
 # Ensure datasets are downloaded before starting the app
 try:
     from pathlib import Path
-    if not Path("data").exists() or len(list(Path("data").rglob("*"))) < 10:
+    data_path = Path("data")
+    file_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
+
+    print(f"🔍 Checking data directory: {data_path.absolute()}")
+    print(f"📊 Current file count: {file_count}")
+
+    if not data_path.exists() or file_count < 10:
         print("📦 Downloading datasets from HuggingFace repos...")
+        print(f"   Working directory: {Path.cwd()}")
+
         import sys
         sys.path.append(str(Path(__file__).parent.parent))
         from scripts.setup_datasets import download_datasets
-        download_datasets()
-        print("✅ Datasets downloaded successfully")
+        # Force retry if we have very few files, indicating previous download may have failed
+        force_retry = file_count < 10 and data_path.exists()
+        download_datasets(data_path, force_retry=force_retry)
+
+        # Check what we actually got
+        final_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
+        print(f"📈 Final file count: {final_count}")
+
+        if final_count > 10:
+            print("✅ Datasets downloaded successfully")
+        else:
+            print("⚠️ Dataset download may have failed - very few files found")
+    else:
+        print(f"✅ Data directory exists with {file_count} files")
+
 except Exception as e:
-    print(f"⚠️ Dataset download failed: {e}")
+    print(f"❌ Dataset download failed with error: {type(e).__name__}: {e}")
     print("   App will continue but some features may not work properly")
+    import traceback
+    traceback.print_exc()
 
 # Initialize for Streamlit Cloud deployment (must be done before other imports)
 try:
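
One caveat worth noting (an observation about the code, not a change in this commit): Path.rglob("*") yields directories as well as regular files, so main.py's file_count is really an entry count, while setup_datasets.py counts only files. A sketch of the stricter variant:

from pathlib import Path

data_path = Path("data")
# rglob("*") yields directories too; filtering with is_file() matches
# how setup_datasets.py measures completeness
entry_count = len(list(data_path.rglob("*"))) if data_path.exists() else 0
file_count = sum(1 for f in data_path.rglob("*") if f.is_file()) if data_path.exists() else 0
print(f"entries: {entry_count}, regular files: {file_count}")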
scripts/setup_datasets.py CHANGED
@@ -8,14 +8,27 @@ from pathlib import Path
 from huggingface_hub import snapshot_download
 import shutil
 
-def download_datasets(data_dir: Path = Path("data")):
-    """Download all datasets from Hugging Face to local data directory"""
+def download_datasets(data_dir: Path = Path("data"), force_retry: bool = False):
+    """Download all datasets from Hugging Face to local data directory
+
+    Args:
+        data_dir: Directory to download data to
+        force_retry: Force retry download even if data exists
+    """
 
     print("🚀 Setting up AI Due Diligence datasets from Hugging Face...")
 
     # Ensure data directory exists
     data_dir.mkdir(exist_ok=True)
 
+    # Check if we already have data and skip unless forced
+    if not force_retry:
+        file_count = sum(1 for f in data_dir.rglob("*") if f.is_file())
+        if file_count > 100:  # Reasonable threshold for "has data"
+            print(f"✅ Data directory already contains {file_count} files, skipping download")
+            print(f"   Use --force to re-download anyway")
+            return
+
     datasets = [
         {
             "repo_id": "jmzlx/dd-framework",
@@ -42,18 +55,37 @@ def download_datasets(data_dir: Path = Path("data")):
         print(f"   Repository: {dataset['repo_id']}")
 
         try:
+            # Use HF_TOKEN if available for private repos
+            token = os.getenv("HF_TOKEN")
+            if token:
+                print(f"   🔑 Using HuggingFace token for authentication")
+            else:
+                print(f"   ⚠️ No HF_TOKEN found - may fail for private repositories")
+
             # Download dataset
             snapshot_download(
                 repo_id=dataset["repo_id"],
                 repo_type="dataset",
                 local_dir=dataset["local_path"],
-                allow_patterns="data/**"  # Only download data directory
+                allow_patterns="data/**",  # Only download data directory
+                token=token  # Pass token if available
             )
             print(f"   ✅ Downloaded successfully")
 
         except Exception as e:
-            print(f"   ❌ Error downloading {dataset['repo_id']}: {e}")
-            print(f"   💡 You can manually download from: https://huggingface.co/datasets/{dataset['repo_id']}")
+            print(f"   ❌ Error downloading {dataset['repo_id']}: {type(e).__name__}: {e}")
+            if "401" in str(e) or "403" in str(e) or "private" in str(e).lower():
+                print(f"   🔒 This appears to be a private repository requiring authentication")
+                if not token:
+                    print(f"   💡 Set HF_TOKEN environment variable with read access to this repository")
+                else:
+                    print(f"   💡 Check that your HF_TOKEN has access to this repository")
+            elif "network" in str(e).lower() or "connection" in str(e).lower():
+                print(f"   🌐 Network connectivity issue - check internet connection")
+            print(f"   💡 Manual download: https://huggingface.co/datasets/{dataset['repo_id']}")
+
+            # Continue with other datasets even if one fails
+            continue
 
     print(f"\n🎉 Dataset setup complete! Data available in: {data_dir.absolute()}")
 
@@ -62,6 +94,17 @@ def download_datasets(data_dir: Path = Path("data")):
     file_count = sum(1 for f in data_dir.rglob("*") if f.is_file())
 
     print(f"📊 Downloaded {file_count:,} files, {total_size/(1024*1024):.1f}MB total")
+
+    # Check if we're in a deployment environment and provide guidance
+    is_deployment = os.getenv('STREAMLIT_SERVER_HEADLESS') == 'true' or os.getenv('HF_HOME') == '/tmp/huggingface'
+    if is_deployment:
+        print("\n🚀 Deployment Environment Detected")
+        if file_count < 50:
+            print("⚠️ Few files downloaded - this may indicate missing authentication")
+            print("💡 Ensure HF_TOKEN is set in your deployment environment secrets")
+            print("💡 Token should have read access to private repositories: jmzlx/dd-framework, jmzlx/dd-indexes, jmzlx/dd-vdrs")
+        else:
+            print("✅ Data download appears successful for deployment environment")
 
 def clean_old_data(data_dir: Path = Path("data")):
     """Remove old data directory (use with caution!)"""
@@ -76,6 +119,7 @@ def main():
 
     parser = argparse.ArgumentParser(description="Setup datasets from Hugging Face")
     parser.add_argument("--clean", action="store_true", help="Remove existing data directory first")
+    parser.add_argument("--force", action="store_true", help="Force re-download even if data exists")
     parser.add_argument("--data-dir", default="data", help="Data directory path")
 
     args = parser.parse_args()
@@ -85,7 +129,7 @@ def main():
     if args.clean:
        clean_old_data(data_dir)
 
-    download_datasets(data_dir)
+    download_datasets(data_dir, force_retry=args.force)
 
 if __name__ == "__main__":
     main()
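
For reference, the updated entry point can be driven from the CLI (for example, python scripts/setup_datasets.py --clean --force --data-dir data) or programmatically. A sketch of the latter, assuming the script is importable from the project root, as main.py does via sys.path.append above:

from pathlib import Path
from scripts.setup_datasets import download_datasets

# Skips the download if the directory already holds >100 files
download_datasets(Path("data"))
# Equivalent to passing --force on the command line
download_datasets(Path("data"), force_retry=True)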