Juan Salas committed on
Commit ·
495dc7c
1
Parent(s): b49827c
Fix project selector directory structure issues
Browse files- Fix nested data/data/ directory structure in HuggingFace Spaces
- Update ui_components.py to use config-based data paths instead of hardcoded 'data'
- Add symlink filtering to prevent circular directory references
- Add search_indexes to excluded folders list
- Modify setup_datasets.py to extract HF repo data/ contents directly to target directory
- Prevents nested structure: data/data/checklist -> data/checklist
Resolves project selector showing incorrect 'Data (5 rooms, 974 docs)' option.
Now correctly displays only VDR projects:
- Automated Services Transformation (1 rooms, 477 docs)
- Industrial Security Leadership (1 rooms, 491 docs)
- app/ui/ui_components.py +11 -8
- scripts/setup_datasets.py +20 -8
app/ui/ui_components.py
CHANGED
|
@@ -68,7 +68,9 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 68 |
|
| 69 |
# Scan for available projects
|
| 70 |
projects = []
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
|
| 73 |
if data_base_path and data_base_path.exists():
|
| 74 |
# First check if there's a vdrs folder with projects
|
|
@@ -76,9 +78,9 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 76 |
if vdrs_path.exists():
|
| 77 |
# Look for project directories in vdrs
|
| 78 |
for project_dir in vdrs_path.iterdir():
|
| 79 |
-
if project_dir.is_dir() and not project_dir.name.startswith('.'):
|
| 80 |
-
# Check if it has subdirectories that could be data rooms
|
| 81 |
-
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
|
| 82 |
if subdirs:
|
| 83 |
# Count total documents in all data rooms
|
| 84 |
total_docs = count_documents_in_directory(project_dir)
|
|
@@ -92,9 +94,10 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 92 |
|
| 93 |
# Also look for project directories directly in data folder (excluding special folders)
|
| 94 |
for project_dir in data_base_path.iterdir():
|
| 95 |
-
if project_dir.is_dir() and not project_dir.
|
| 96 |
-
|
| 97 |
-
|
|
|
|
| 98 |
if subdirs:
|
| 99 |
# Count total documents in all data rooms
|
| 100 |
total_docs = count_documents_in_directory(project_dir)
|
|
@@ -158,7 +161,7 @@ def render_data_room_selector(project_path: str) -> Optional[str]:
|
|
| 158 |
project_path_obj = Path(project_path)
|
| 159 |
|
| 160 |
for data_room_dir in project_path_obj.iterdir():
|
| 161 |
-
if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
|
| 162 |
# Count documents for display
|
| 163 |
doc_count = count_documents_in_directory(data_room_dir)
|
| 164 |
if doc_count > 0: # Only show directories with documents
|
|
|
|
| 68 |
|
| 69 |
# Scan for available projects
|
| 70 |
projects = []
|
| 71 |
+
# Use config's data directory path instead of hardcoded "data"
|
| 72 |
+
config = get_config()
|
| 73 |
+
data_base_path = config.paths['data_dir'] if config.paths['data_dir'].exists() else None
|
| 74 |
|
| 75 |
if data_base_path and data_base_path.exists():
|
| 76 |
# First check if there's a vdrs folder with projects
|
|
|
|
| 78 |
if vdrs_path.exists():
|
| 79 |
# Look for project directories in vdrs
|
| 80 |
for project_dir in vdrs_path.iterdir():
|
| 81 |
+
if project_dir.is_dir() and not project_dir.is_symlink() and not project_dir.name.startswith('.'):
|
| 82 |
+
# Check if it has subdirectories that could be data rooms (exclude symlinks)
|
| 83 |
+
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.is_symlink() and not d.name.startswith('.')]
|
| 84 |
if subdirs:
|
| 85 |
# Count total documents in all data rooms
|
| 86 |
total_docs = count_documents_in_directory(project_dir)
|
|
|
|
| 94 |
|
| 95 |
# Also look for project directories directly in data folder (excluding special folders)
|
| 96 |
for project_dir in data_base_path.iterdir():
|
| 97 |
+
if (project_dir.is_dir() and not project_dir.is_symlink() and not project_dir.name.startswith('.')
|
| 98 |
+
and project_dir.name not in ['checklist', 'questions', 'vdrs', 'strategy', 'search_indexes']):
|
| 99 |
+
# Check if it has subdirectories that could be data rooms (exclude symlinks)
|
| 100 |
+
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.is_symlink() and not d.name.startswith('.')]
|
| 101 |
if subdirs:
|
| 102 |
# Count total documents in all data rooms
|
| 103 |
total_docs = count_documents_in_directory(project_dir)
|
|
|
|
| 161 |
project_path_obj = Path(project_path)
|
| 162 |
|
| 163 |
for data_room_dir in project_path_obj.iterdir():
|
| 164 |
+
if data_room_dir.is_dir() and not data_room_dir.is_symlink() and not data_room_dir.name.startswith('.'):
|
| 165 |
# Count documents for display
|
| 166 |
doc_count = count_documents_in_directory(data_room_dir)
|
| 167 |
if doc_count > 0: # Only show directories with documents
|
scripts/setup_datasets.py
CHANGED
|
@@ -4,6 +4,7 @@ Dataset Setup Script - Download data from Hugging Face instead of storing in git
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
from huggingface_hub import snapshot_download
|
| 9 |
import shutil
|
|
@@ -62,14 +63,25 @@ def download_datasets(data_dir: Path = Path("data"), force_retry: bool = False):
|
|
| 62 |
else:
|
| 63 |
print(f" ⚠️ No HF_TOKEN found - may fail for private repositories")
|
| 64 |
|
| 65 |
-
# Download
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
print(f" ✅ Downloaded successfully")
|
| 74 |
|
| 75 |
except Exception as e:
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
| 7 |
+
import tempfile
|
| 8 |
from pathlib import Path
|
| 9 |
from huggingface_hub import snapshot_download
|
| 10 |
import shutil
|
|
|
|
| 63 |
else:
|
| 64 |
print(f" ⚠️ No HF_TOKEN found - may fail for private repositories")
|
| 65 |
|
| 66 |
+
# Download to temporary directory first to handle nested structure
|
| 67 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 68 |
+
temp_path = Path(temp_dir)
|
| 69 |
+
snapshot_download(
|
| 70 |
+
repo_id=dataset["repo_id"],
|
| 71 |
+
repo_type="dataset",
|
| 72 |
+
local_dir=temp_path,
|
| 73 |
+
allow_patterns="data/**", # Download data directory
|
| 74 |
+
token=token # Pass token if available
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
# Move contents from temp_dir/data to target data_dir
|
| 78 |
+
temp_data_dir = temp_path / "data"
|
| 79 |
+
if temp_data_dir.exists():
|
| 80 |
+
for item in temp_data_dir.iterdir():
|
| 81 |
+
target_path = dataset["local_path"] / item.name
|
| 82 |
+
if target_path.exists():
|
| 83 |
+
shutil.rmtree(target_path) if target_path.is_dir() else target_path.unlink()
|
| 84 |
+
shutil.move(str(item), str(target_path))
|
| 85 |
print(f" ✅ Downloaded successfully")
|
| 86 |
|
| 87 |
except Exception as e:
|