VictorLJZ committed on
Commit
6db0a72
·
1 Parent(s): 719e93e
.gitignore CHANGED
@@ -178,3 +178,5 @@ medrax-pdfs/
178
  model-weights/
179
 
180
  .DS_Store
 
 
 
178
  model-weights/
179
 
180
  .DS_Store
181
+
182
+ benchmarking/data/
benchmarking/data/rexvqa/download_rexgradient_images.py DELETED
@@ -1,172 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Utility script to download and extract ReXGradient-160K images.
4
-
5
- This script helps users download the actual PNG images from the ReXGradient-160K dataset,
6
- which are stored as part files on HuggingFace and need to be concatenated and extracted.
7
-
8
- Usage:
9
- python download_rexgradient_images.py --output_dir /path/to/images
10
- """
11
-
12
- import argparse
13
- import subprocess
14
- from pathlib import Path
15
- from huggingface_hub import hf_hub_download, list_repo_files
16
- import requests
17
- from tqdm import tqdm
18
-
19
-
20
- def download_file(url, output_path, chunk_size=8192):
21
- """Download a file with progress bar."""
22
- response = requests.get(url, stream=True)
23
- total_size = int(response.headers.get('content-length', 0))
24
-
25
- with open(output_path, 'wb') as f:
26
- with tqdm(total=total_size, unit='B', unit_scale=True, desc=output_path.name) as pbar:
27
- for chunk in response.iter_content(chunk_size=chunk_size):
28
- if chunk:
29
- f.write(chunk)
30
- pbar.update(len(chunk))
31
-
32
-
33
- def main():
34
- parser = argparse.ArgumentParser(description="Download ReXGradient-160K images")
35
- parser.add_argument(
36
- "--output_dir",
37
- type=str,
38
- required=True,
39
- help="Directory to save extracted images"
40
- )
41
- parser.add_argument(
42
- "--repo_id",
43
- type=str,
44
- default="rajpurkarlab/ReXGradient-160K",
45
- help="HuggingFace repository ID"
46
- )
47
- parser.add_argument(
48
- "--skip_download",
49
- action="store_true",
50
- help="Skip downloading and only extract if files exist"
51
- )
52
-
53
- args = parser.parse_args()
54
-
55
- output_dir = Path(args.output_dir)
56
- output_dir.mkdir(parents=True, exist_ok=True)
57
-
58
- print(f"Output directory: {output_dir}")
59
-
60
- # Check if we need to accept the license first
61
- print("Note: You may need to accept the dataset license on HuggingFace first:")
62
- print(f"Visit: https://huggingface.co/datasets/{args.repo_id}")
63
- print("Click 'Access repository' and accept the license agreement.")
64
- print()
65
-
66
- try:
67
- # List files in the repository
68
- print("Listing files in repository...")
69
- files = list_repo_files(args.repo_id, repo_type='dataset')
70
- part_files = [f for f in files if f.startswith("deid_png.part")]
71
-
72
- if not part_files:
73
- print("No part files found. The images might be in a different format.")
74
- print("Available files:")
75
- for f in files:
76
- print(f" - {f}")
77
- return
78
-
79
- print(f"Found {len(part_files)} part files:")
80
- for f in part_files:
81
- print(f" - {f}")
82
-
83
- # Download part files
84
- if not args.skip_download:
85
- print("\nDownloading part files...")
86
- for part_file in part_files:
87
- output_path = output_dir / part_file
88
- if output_path.exists():
89
- print(f"Skipping {part_file} (already exists)")
90
- continue
91
-
92
- print(f"Downloading {part_file}...")
93
- try:
94
- hf_hub_download(
95
- repo_id=args.repo_id,
96
- filename=part_file,
97
- local_dir=output_dir,
98
- local_dir_use_symlinks=False,
99
- repo_type='dataset'
100
- )
101
- except Exception as e:
102
- print(f"Error downloading {part_file}: {e}")
103
- print("You may need to accept the license agreement on HuggingFace.")
104
- return
105
-
106
- # Concatenate part files
107
- tar_path = output_dir / "deid_png.tar"
108
- if not tar_path.exists():
109
- print("\nConcatenating part files...")
110
- with open(tar_path, 'wb') as tar_file:
111
- for part_file in sorted(part_files):
112
- part_path = output_dir / part_file
113
- if part_path.exists():
114
- print(f"Adding {part_file}...")
115
- with open(part_path, 'rb') as f:
116
- tar_file.write(f.read())
117
- else:
118
- print(f"Warning: {part_file} not found, skipping...")
119
- else:
120
- print(f"Tar file already exists: {tar_path}")
121
-
122
- # Extract tar file
123
- if tar_path.exists():
124
- print("\nExtracting images...")
125
- images_dir = output_dir / "images"
126
- images_dir.mkdir(exist_ok=True)
127
-
128
- # Check if already extracted
129
- if any(images_dir.glob("*.png")):
130
- print("Images already extracted.")
131
- else:
132
- try:
133
- subprocess.run([
134
- "tar", "-xf", str(tar_path),
135
- "-C", str(images_dir)
136
- ], check=True)
137
- print("Extraction completed!")
138
- except subprocess.CalledProcessError as e:
139
- print(f"Error extracting tar file: {e}")
140
- return
141
- except FileNotFoundError:
142
- print("Error: 'tar' command not found. Please install tar or extract manually.")
143
- return
144
-
145
- # Count extracted images
146
- png_files = list(images_dir.glob("*.png"))
147
- print(f"Extracted {len(png_files)} PNG images to {images_dir}")
148
-
149
- # Show some example filenames
150
- if png_files:
151
- print("\nExample image filenames:")
152
- for f in png_files[:5]:
153
- print(f" - {f.name}")
154
- if len(png_files) > 5:
155
- print(f" ... and {len(png_files) - 5} more")
156
-
157
- print(f"\nSetup complete! Use this directory as images_dir in ReXVQABenchmark:")
158
- print(f"images_dir='{images_dir}'")
159
-
160
- except Exception as e:
161
- print(f"Error: {e}")
162
- print("\nManual setup instructions:")
163
- print("1. Visit https://huggingface.co/datasets/rajpurkarlab/ReXGradient-160K")
164
- print("2. Accept the license agreement")
165
- print("3. Download the deid_png.part* files")
166
- print("4. Concatenate: cat deid_png.part* > deid_png.tar")
167
- print("5. Extract: tar -xf deid_png.tar")
168
- print("6. Use the extracted directory as images_dir")
169
-
170
-
171
- if __name__ == "__main__":
172
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
medrax/agent/__init__.py CHANGED
@@ -1 +1 @@
1
- from .agent import State, Agent
 
1
+ from .agent import AgentState, Agent