amaksay committed on
Commit
c15dd04
·
verified ·
1 Parent(s): b1d61f7

Sync from GitHub

Browse files
Files changed (2) hide show
  1. app.py +38 -26
  2. requirements.txt +1 -1
app.py CHANGED
@@ -34,43 +34,55 @@ HF_RESULTS = "amaksay/inkslop-results"
34
  def restructure_dataset(task_dir: Path, task_id: str) -> None:
35
  """Restructure downloaded dataset so samples/ is at the root.
36
 
37
- HF datasets have structure like:
38
- task_dir/original/samples/... or task_dir/source_data/samples/...
 
39
 
40
- We need:
41
  task_dir/samples/...
42
 
43
  This ensures the dataset_name matches the task_id in results.
44
  """
45
- # Find the directory containing 'samples/'
46
- samples_dir = None
47
- for subdir in task_dir.iterdir():
48
- if subdir.is_dir() and (subdir / "samples").exists():
49
- samples_dir = subdir / "samples"
50
- break
51
-
52
- if not samples_dir:
53
- logger.warning(f"No samples/ found in {task_dir}, skipping restructure")
54
- return
55
-
56
  # Check if already restructured
57
- if (task_dir / "samples").exists():
 
58
  logger.info(f"{task_id} already restructured")
59
  return
60
 
61
- logger.info(f"Restructuring {task_id}: moving {samples_dir.parent.name}/samples to root")
 
 
 
 
62
 
63
- # Move samples to task root
64
- target = task_dir / "samples"
65
- shutil.move(str(samples_dir), str(target))
66
 
67
- # Clean up the now-empty parent directory
68
- parent = samples_dir.parent
69
- if parent.exists() and parent != task_dir:
70
- try:
71
- shutil.rmtree(parent)
72
- except Exception:
73
- pass # Ignore cleanup errors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
 
76
  def download_hf_datasets() -> Path:
 
34
  def restructure_dataset(task_dir: Path, task_id: str) -> None:
35
  """Restructure downloaded dataset so samples/ is at the root.
36
 
37
+ HF datasets may have multiple sample directories:
38
+ task_dir/original/samples/...
39
+ task_dir/source_data/samples/...
40
 
41
+ We need to merge all into:
42
  task_dir/samples/...
43
 
44
  This ensures the dataset_name matches the task_id in results.
45
  """
 
 
 
 
 
 
 
 
 
 
 
46
  # Check if already restructured
47
+ target = task_dir / "samples"
48
+ if target.exists():
49
  logger.info(f"{task_id} already restructured")
50
  return
51
 
52
+ # Find ALL directories containing 'samples/'
53
+ samples_dirs = []
54
+ for subdir in task_dir.iterdir():
55
+ if subdir.is_dir() and (subdir / "samples").exists():
56
+ samples_dirs.append(subdir / "samples")
57
 
58
+ if not samples_dirs:
59
+ logger.warning(f"No samples/ found in {task_dir}, skipping restructure")
60
+ return
61
 
62
+ logger.info(f"Restructuring {task_id}: found {len(samples_dirs)} sample directories")
63
+
64
+ # Create target samples directory
65
+ target.mkdir(exist_ok=True)
66
+
67
+ # Move samples from each source directory
68
+ for samples_dir in samples_dirs:
69
+ logger.info(f" Moving samples from {samples_dir.parent.name}/samples/")
70
+ for sample in samples_dir.iterdir():
71
+ if sample.is_dir():
72
+ dest = target / sample.name
73
+ if not dest.exists():
74
+ shutil.move(str(sample), str(dest))
75
+ else:
76
+ logger.warning(f" Skipping duplicate sample: {sample.name}")
77
+
78
+ # Clean up the now-empty parent directories
79
+ for samples_dir in samples_dirs:
80
+ parent = samples_dir.parent
81
+ if parent.exists() and parent != task_dir:
82
+ try:
83
+ shutil.rmtree(parent)
84
+ except Exception:
85
+ pass # Ignore cleanup errors
86
 
87
 
88
  def download_hf_datasets() -> Path:
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio>=6.0.0
2
  huggingface_hub>=0.20.0
3
  Pillow>=9.0.0
4
  numpy>=1.20.0
 
1
+ gradio==6.3.0
2
  huggingface_hub>=0.20.0
3
  Pillow>=9.0.0
4
  numpy>=1.20.0