Juan Salas committed on
Commit
495dc7c
·
1 Parent(s): b49827c

Fix project selector directory structure issues

Browse files

- Fix nested data/data/ directory structure in HuggingFace Spaces
- Update ui_components.py to use config-based data paths instead of hardcoded 'data'
- Add symlink filtering to prevent circular directory references
- Add search_indexes to excluded folders list
- Modify setup_datasets.py to extract HF repo data/ contents directly to target directory
- Prevent the nested structure: files now land in data/checklist instead of data/data/checklist

Resolves the project selector showing an incorrect 'Data (5 rooms, 974 docs)' option.
Now correctly displays only VDR projects:
- Automated Services Transformation (1 rooms, 477 docs)
- Industrial Security Leadership (1 rooms, 491 docs)

app/ui/ui_components.py CHANGED
@@ -68,7 +68,9 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
68
 
69
  # Scan for available projects
70
  projects = []
71
- data_base_path = Path("data").resolve() if Path("data").exists() else None
 
 
72
 
73
  if data_base_path and data_base_path.exists():
74
  # First check if there's a vdrs folder with projects
@@ -76,9 +78,9 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
76
  if vdrs_path.exists():
77
  # Look for project directories in vdrs
78
  for project_dir in vdrs_path.iterdir():
79
- if project_dir.is_dir() and not project_dir.name.startswith('.'):
80
- # Check if it has subdirectories that could be data rooms
81
- subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
82
  if subdirs:
83
  # Count total documents in all data rooms
84
  total_docs = count_documents_in_directory(project_dir)
@@ -92,9 +94,10 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
92
 
93
  # Also look for project directories directly in data folder (excluding special folders)
94
  for project_dir in data_base_path.iterdir():
95
- if project_dir.is_dir() and not project_dir.name.startswith('.') and project_dir.name not in ['checklist', 'questions', 'vdrs', 'strategy']:
96
- # Check if it has subdirectories that could be data rooms
97
- subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
 
98
  if subdirs:
99
  # Count total documents in all data rooms
100
  total_docs = count_documents_in_directory(project_dir)
@@ -158,7 +161,7 @@ def render_data_room_selector(project_path: str) -> Optional[str]:
158
  project_path_obj = Path(project_path)
159
 
160
  for data_room_dir in project_path_obj.iterdir():
161
- if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
162
  # Count documents for display
163
  doc_count = count_documents_in_directory(data_room_dir)
164
  if doc_count > 0: # Only show directories with documents
 
68
 
69
  # Scan for available projects
70
  projects = []
71
+ # Use config's data directory path instead of hardcoded "data"
72
+ config = get_config()
73
+ data_base_path = config.paths['data_dir'] if config.paths['data_dir'].exists() else None
74
 
75
  if data_base_path and data_base_path.exists():
76
  # First check if there's a vdrs folder with projects
 
78
  if vdrs_path.exists():
79
  # Look for project directories in vdrs
80
  for project_dir in vdrs_path.iterdir():
81
+ if project_dir.is_dir() and not project_dir.is_symlink() and not project_dir.name.startswith('.'):
82
+ # Check if it has subdirectories that could be data rooms (exclude symlinks)
83
+ subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.is_symlink() and not d.name.startswith('.')]
84
  if subdirs:
85
  # Count total documents in all data rooms
86
  total_docs = count_documents_in_directory(project_dir)
 
94
 
95
  # Also look for project directories directly in data folder (excluding special folders)
96
  for project_dir in data_base_path.iterdir():
97
+ if (project_dir.is_dir() and not project_dir.is_symlink() and not project_dir.name.startswith('.')
98
+ and project_dir.name not in ['checklist', 'questions', 'vdrs', 'strategy', 'search_indexes']):
99
+ # Check if it has subdirectories that could be data rooms (exclude symlinks)
100
+ subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.is_symlink() and not d.name.startswith('.')]
101
  if subdirs:
102
  # Count total documents in all data rooms
103
  total_docs = count_documents_in_directory(project_dir)
 
161
  project_path_obj = Path(project_path)
162
 
163
  for data_room_dir in project_path_obj.iterdir():
164
+ if data_room_dir.is_dir() and not data_room_dir.is_symlink() and not data_room_dir.name.startswith('.'):
165
  # Count documents for display
166
  doc_count = count_documents_in_directory(data_room_dir)
167
  if doc_count > 0: # Only show directories with documents
scripts/setup_datasets.py CHANGED
@@ -4,6 +4,7 @@ Dataset Setup Script - Download data from Hugging Face instead of storing in git
4
  """
5
 
6
  import os
 
7
  from pathlib import Path
8
  from huggingface_hub import snapshot_download
9
  import shutil
@@ -62,14 +63,25 @@ def download_datasets(data_dir: Path = Path("data"), force_retry: bool = False):
62
  else:
63
  print(f" ⚠️ No HF_TOKEN found - may fail for private repositories")
64
 
65
- # Download dataset
66
- snapshot_download(
67
- repo_id=dataset["repo_id"],
68
- repo_type="dataset",
69
- local_dir=dataset["local_path"],
70
- allow_patterns="data/**", # Only download data directory
71
- token=token # Pass token if available
72
- )
 
 
 
 
 
 
 
 
 
 
 
73
  print(f" ✅ Downloaded successfully")
74
 
75
  except Exception as e:
 
4
  """
5
 
6
  import os
7
+ import tempfile
8
  from pathlib import Path
9
  from huggingface_hub import snapshot_download
10
  import shutil
 
63
  else:
64
  print(f" ⚠️ No HF_TOKEN found - may fail for private repositories")
65
 
66
+ # Download to temporary directory first to handle nested structure
67
+ with tempfile.TemporaryDirectory() as temp_dir:
68
+ temp_path = Path(temp_dir)
69
+ snapshot_download(
70
+ repo_id=dataset["repo_id"],
71
+ repo_type="dataset",
72
+ local_dir=temp_path,
73
+ allow_patterns="data/**", # Download data directory
74
+ token=token # Pass token if available
75
+ )
76
+
77
+ # Move contents from temp_dir/data to target data_dir
78
+ temp_data_dir = temp_path / "data"
79
+ if temp_data_dir.exists():
80
+ for item in temp_data_dir.iterdir():
81
+ target_path = dataset["local_path"] / item.name
82
+ if target_path.exists():
83
+ shutil.rmtree(target_path) if target_path.is_dir() else target_path.unlink()
84
+ shutil.move(str(item), str(target_path))
85
  print(f" ✅ Downloaded successfully")
86
 
87
  except Exception as e: