Spaces:
Sleeping
Sleeping
Sync from GitHub
Browse files- app.py +38 -26
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -34,43 +34,55 @@ HF_RESULTS = "amaksay/inkslop-results"
|
|
| 34 |
def restructure_dataset(task_dir: Path, task_id: str) -> None:
|
| 35 |
"""Restructure downloaded dataset so samples/ is at the root.
|
| 36 |
|
| 37 |
-
HF datasets have
|
| 38 |
-
task_dir/original/samples/...
|
|
|
|
| 39 |
|
| 40 |
-
We need:
|
| 41 |
task_dir/samples/...
|
| 42 |
|
| 43 |
This ensures the dataset_name matches the task_id in results.
|
| 44 |
"""
|
| 45 |
-
# Find the directory containing 'samples/'
|
| 46 |
-
samples_dir = None
|
| 47 |
-
for subdir in task_dir.iterdir():
|
| 48 |
-
if subdir.is_dir() and (subdir / "samples").exists():
|
| 49 |
-
samples_dir = subdir / "samples"
|
| 50 |
-
break
|
| 51 |
-
|
| 52 |
-
if not samples_dir:
|
| 53 |
-
logger.warning(f"No samples/ found in {task_dir}, skipping restructure")
|
| 54 |
-
return
|
| 55 |
-
|
| 56 |
# Check if already restructured
|
| 57 |
-
|
|
|
|
| 58 |
logger.info(f"{task_id} already restructured")
|
| 59 |
return
|
| 60 |
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
|
| 76 |
def download_hf_datasets() -> Path:
|
|
|
|
| 34 |
def restructure_dataset(task_dir: Path, task_id: str) -> None:
    """Restructure downloaded dataset so samples/ is at the root.

    HF datasets may have multiple sample directories:
        task_dir/original/samples/...
        task_dir/source_data/samples/...

    We need to merge all into:
        task_dir/samples/...

    This ensures the dataset_name matches the task_id in results.

    Args:
        task_dir: Root directory of the downloaded task dataset.
        task_id: Task identifier; used only in log messages.

    Behaviour worth knowing:
        * If ``task_dir/samples`` already exists the function logs and
          returns without touching anything.
        * Source directories are visited in sorted order, so when two
          sources contain a sample with the same name the winner is
          deterministic (the first source by path order); later copies
          are skipped with a warning.
        * Only sub-directories of each ``samples/`` are moved. After the
          merge every source parent directory is removed with
          ``shutil.rmtree``, which also deletes anything left inside it
          (skipped duplicates, loose files).
        * Cleanup is best-effort: a failing removal is logged, not raised.
    """
    # Check if already restructured
    target = task_dir / "samples"
    if target.exists():
        logger.info(f"{task_id} already restructured")
        return

    # Find ALL directories containing 'samples/'. Sorting makes duplicate
    # resolution independent of the filesystem's listing order, and
    # is_dir() avoids treating a stray *file* named "samples" as a source.
    samples_dirs = [
        subdir / "samples"
        for subdir in sorted(task_dir.iterdir())
        if subdir.is_dir() and (subdir / "samples").is_dir()
    ]

    if not samples_dirs:
        logger.warning(f"No samples/ found in {task_dir}, skipping restructure")
        return

    logger.info(f"Restructuring {task_id}: found {len(samples_dirs)} sample directories")

    # Create target samples directory
    target.mkdir(exist_ok=True)

    # Move samples from each source directory
    for samples_dir in samples_dirs:
        logger.info(f" Moving samples from {samples_dir.parent.name}/samples/")
        for sample in samples_dir.iterdir():
            if not sample.is_dir():
                continue
            dest = target / sample.name
            if dest.exists():
                logger.warning(f" Skipping duplicate sample: {sample.name}")
                continue
            shutil.move(str(sample), str(dest))

    # Remove the source parent directories (including any leftovers in them)
    for samples_dir in samples_dirs:
        parent = samples_dir.parent
        if parent.exists() and parent != task_dir:
            try:
                shutil.rmtree(parent)
            except OSError as exc:
                # Best-effort cleanup: report the failure instead of hiding it.
                logger.warning(f" Could not remove {parent}: {exc}")
|
| 86 |
|
| 87 |
|
| 88 |
def download_hf_datasets() -> Path:
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
gradio
|
| 2 |
huggingface_hub>=0.20.0
|
| 3 |
Pillow>=9.0.0
|
| 4 |
numpy>=1.20.0
|
|
|
|
| 1 |
+
gradio==6.3.0
|
| 2 |
huggingface_hub>=0.20.0
|
| 3 |
Pillow>=9.0.0
|
| 4 |
numpy>=1.20.0
|