Juan Salas committed on
Commit ·
495dc7c
1
Parent(s): b49827c
Fix project selector directory structure issues
Browse files- Fix nested data/data/ directory structure in HuggingFace Spaces
- Update ui_components.py to use config-based data paths instead of hardcoded 'data'
- Add symlink filtering to prevent circular directory references
- Add search_indexes to excluded folders list
- Modify setup_datasets.py to extract HF repo data/ contents directly to target directory
- Prevents nested structure: data/data/checklist -> data/checklist
Resolves project selector showing incorrect 'Data (5 rooms, 974 docs)' option.
Now correctly displays only VDR projects:
- Automated Services Transformation (1 rooms, 477 docs)
- Industrial Security Leadership (1 rooms, 491 docs)
- app/ui/ui_components.py +11 -8
- scripts/setup_datasets.py +20 -8
app/ui/ui_components.py
CHANGED
|
@@ -68,7 +68,9 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 68 |
|
| 69 |
# Scan for available projects
|
| 70 |
projects = []
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
|
| 73 |
if data_base_path and data_base_path.exists():
|
| 74 |
# First check if there's a vdrs folder with projects
|
|
@@ -76,9 +78,9 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 76 |
if vdrs_path.exists():
|
| 77 |
# Look for project directories in vdrs
|
| 78 |
for project_dir in vdrs_path.iterdir():
|
| 79 |
-
if project_dir.is_dir() and not project_dir.name.startswith('.'):
|
| 80 |
-
# Check if it has subdirectories that could be data rooms
|
| 81 |
-
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
|
| 82 |
if subdirs:
|
| 83 |
# Count total documents in all data rooms
|
| 84 |
total_docs = count_documents_in_directory(project_dir)
|
|
@@ -92,9 +94,10 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 92 |
|
| 93 |
# Also look for project directories directly in data folder (excluding special folders)
|
| 94 |
for project_dir in data_base_path.iterdir():
|
| 95 |
-
if project_dir.is_dir() and not project_dir.
|
| 96 |
-
|
| 97 |
-
|
|
|
|
| 98 |
if subdirs:
|
| 99 |
# Count total documents in all data rooms
|
| 100 |
total_docs = count_documents_in_directory(project_dir)
|
|
@@ -158,7 +161,7 @@ def render_data_room_selector(project_path: str) -> Optional[str]:
|
|
| 158 |
project_path_obj = Path(project_path)
|
| 159 |
|
| 160 |
for data_room_dir in project_path_obj.iterdir():
|
| 161 |
-
if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
|
| 162 |
# Count documents for display
|
| 163 |
doc_count = count_documents_in_directory(data_room_dir)
|
| 164 |
if doc_count > 0: # Only show directories with documents
|
|
|
|
| 68 |
|
| 69 |
# Scan for available projects
|
| 70 |
projects = []
|
| 71 |
+
# Use config's data directory path instead of hardcoded "data"
|
| 72 |
+
config = get_config()
|
| 73 |
+
data_base_path = config.paths['data_dir'] if config.paths['data_dir'].exists() else None
|
| 74 |
|
| 75 |
if data_base_path and data_base_path.exists():
|
| 76 |
# First check if there's a vdrs folder with projects
|
|
|
|
| 78 |
if vdrs_path.exists():
|
| 79 |
# Look for project directories in vdrs
|
| 80 |
for project_dir in vdrs_path.iterdir():
|
| 81 |
+
if project_dir.is_dir() and not project_dir.is_symlink() and not project_dir.name.startswith('.'):
|
| 82 |
+
# Check if it has subdirectories that could be data rooms (exclude symlinks)
|
| 83 |
+
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.is_symlink() and not d.name.startswith('.')]
|
| 84 |
if subdirs:
|
| 85 |
# Count total documents in all data rooms
|
| 86 |
total_docs = count_documents_in_directory(project_dir)
|
|
|
|
| 94 |
|
| 95 |
# Also look for project directories directly in data folder (excluding special folders)
|
| 96 |
for project_dir in data_base_path.iterdir():
|
| 97 |
+
if (project_dir.is_dir() and not project_dir.is_symlink() and not project_dir.name.startswith('.')
|
| 98 |
+
and project_dir.name not in ['checklist', 'questions', 'vdrs', 'strategy', 'search_indexes']):
|
| 99 |
+
# Check if it has subdirectories that could be data rooms (exclude symlinks)
|
| 100 |
+
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.is_symlink() and not d.name.startswith('.')]
|
| 101 |
if subdirs:
|
| 102 |
# Count total documents in all data rooms
|
| 103 |
total_docs = count_documents_in_directory(project_dir)
|
|
|
|
| 161 |
project_path_obj = Path(project_path)
|
| 162 |
|
| 163 |
for data_room_dir in project_path_obj.iterdir():
|
| 164 |
+
if data_room_dir.is_dir() and not data_room_dir.is_symlink() and not data_room_dir.name.startswith('.'):
|
| 165 |
# Count documents for display
|
| 166 |
doc_count = count_documents_in_directory(data_room_dir)
|
| 167 |
if doc_count > 0: # Only show directories with documents
|
scripts/setup_datasets.py
CHANGED
|
@@ -4,6 +4,7 @@ Dataset Setup Script - Download data from Hugging Face instead of storing in git
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
from huggingface_hub import snapshot_download
|
| 9 |
import shutil
|
|
@@ -62,14 +63,25 @@ def download_datasets(data_dir: Path = Path("data"), force_retry: bool = False):
|
|
| 62 |
else:
|
| 63 |
print(f" ⚠️ No HF_TOKEN found - may fail for private repositories")
|
| 64 |
|
| 65 |
-
# Download
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
print(f" ✅ Downloaded successfully")
|
| 74 |
|
| 75 |
except Exception as e:
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
| 7 |
+
import tempfile
|
| 8 |
from pathlib import Path
|
| 9 |
from huggingface_hub import snapshot_download
|
| 10 |
import shutil
|
|
|
|
| 63 |
else:
|
| 64 |
print(f" ⚠️ No HF_TOKEN found - may fail for private repositories")
|
| 65 |
|
| 66 |
+
# Download to temporary directory first to handle nested structure
|
| 67 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 68 |
+
temp_path = Path(temp_dir)
|
| 69 |
+
snapshot_download(
|
| 70 |
+
repo_id=dataset["repo_id"],
|
| 71 |
+
repo_type="dataset",
|
| 72 |
+
local_dir=temp_path,
|
| 73 |
+
allow_patterns="data/**", # Download data directory
|
| 74 |
+
token=token # Pass token if available
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
# Move contents from temp_dir/data to target data_dir
|
| 78 |
+
temp_data_dir = temp_path / "data"
|
| 79 |
+
if temp_data_dir.exists():
|
| 80 |
+
for item in temp_data_dir.iterdir():
|
| 81 |
+
target_path = dataset["local_path"] / item.name
|
| 82 |
+
if target_path.exists():
|
| 83 |
+
shutil.rmtree(target_path) if target_path.is_dir() else target_path.unlink()
|
| 84 |
+
shutil.move(str(item), str(target_path))
|
| 85 |
print(f" ✅ Downloaded successfully")
|
| 86 |
|
| 87 |
except Exception as e:
|