Leacb4 committed on
Commit
2c8ce2e
Β·
verified Β·
1 Parent(s): 4611995

Upload data/dowload_images_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data/dowload_images_data.py +217 -0
data/dowload_images_data.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to download all images from the dataset locally.
4
+ This file downloads all images from URLs in the dataset CSV and saves them locally
5
+ to speed up training by avoiding repeated downloads. It uses parallel processing
6
+ to download multiple images simultaneously and updates the CSV with local paths
7
+ of downloaded images.
8
+ """
9
+
10
+ import pandas as pd
11
+ import requests
12
+ from PIL import Image
13
+ from io import BytesIO
14
+ from tqdm import tqdm
15
+ import hashlib
16
+ from pathlib import Path
17
+ import time
18
+ import concurrent.futures
19
+ from threading import Lock
20
+ import config
21
+
22
class ImageDownloader:
    """Download every image referenced by a DataFrame, in parallel, to a local directory.

    Files are named by the MD5 hash of their source URL, so re-runs skip
    anything already on disk. Counters are shared across worker threads and
    guarded by a lock.
    """

    def __init__(self, df, images_dir=config.images_dir, max_workers=8, timeout=10):
        """
        Initialize the image downloader.

        Args:
            df: DataFrame whose ``config.column_url_image`` column holds the image URLs
            images_dir: Directory to save the images
            max_workers: Number of threads for parallel download
            timeout: Timeout for HTTP requests (seconds)
        """
        self.df = df
        self.images_dir = Path(images_dir)
        self.max_workers = max_workers
        self.timeout = timeout

        # Create the images directory if it doesn't exist
        self.images_dir.mkdir(parents=True, exist_ok=True)

        # Shared counters; worker threads update them, so every increment
        # goes through self.stats_lock (see _count).
        self.stats = {
            'downloaded': 0,
            'skipped': 0,
            'failed': 0,
            'total': 0
        }
        self.stats_lock = Lock()

    def _count(self, key):
        """Thread-safely increment one statistics counter."""
        with self.stats_lock:
            self.stats[key] += 1

    def url_to_filename(self, url):
        """Convert a URL to a safe, deterministic filename.

        Uses the MD5 hash of the URL to avoid filesystem-unsafe characters;
        all images are stored as JPEG.
        """
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return f"{url_hash}.jpg"

    def download_single_image(self, row):
        """
        Download a single image.

        Args:
            row: Tuple (index, pandas.Series) as yielded by ``DataFrame.iterrows()``

        Returns:
            tuple: (success: bool, index, message: str)
        """
        idx, data = row
        url = data[config.column_url_image]

        # Filename derived from the URL; an existing file means a previous
        # run already fetched this image.
        filename = self.url_to_filename(url)
        filepath = self.images_dir / filename

        if filepath.exists():
            self._count('skipped')
            return True, idx, f"Skipped (already exists): {filepath.name}"

        try:
            # Context manager guarantees the connection is released even on
            # the early "not an image" return (stream=True defers the body).
            with requests.get(url, timeout=self.timeout, stream=True) as response:
                response.raise_for_status()

                # Reject non-image payloads (e.g. HTML error pages served with 200)
                content_type = response.headers.get('content-type', '')
                if not content_type.startswith('image/'):
                    self._count('failed')
                    return False, idx, f"Not an image: {content_type}"

                payload = response.content

            # Re-encode as RGB JPEG to normalize format and bound disk usage
            try:
                image = Image.open(BytesIO(payload)).convert("RGB")
                image.save(filepath, "JPEG", quality=85, optimize=True)
                self._count('downloaded')
                return True, idx, f"Downloaded: {filepath.name}"
            except Exception as img_error:
                self._count('failed')
                return False, idx, f"Image processing error: {img_error}"

        except requests.exceptions.RequestException as e:
            self._count('failed')
            return False, idx, f"Download error: {e}"
        except Exception as e:
            self._count('failed')
            return False, idx, f"Unexpected error: {e}"

    def download_all_images(self):
        """Download all images from the dataset.

        Returns:
            DataFrame copy with ``config.column_local_image_path`` and
            ``download_success`` columns filled in; also written to
            ``config.local_dataset_path`` as CSV.
        """
        self.stats['total'] = len(self.df)

        print(f"🔍 Found {len(self.df)} images to download")
        print(f"📁 Saving in: {self.images_dir}")
        print(f"🔧 Using {self.max_workers} threads")

        # Work on a copy so the caller's DataFrame is left untouched
        df_local = self.df.copy()
        df_local[config.column_local_image_path] = ""
        df_local['download_success'] = False

        start_time = time.time()

        # Parallel download: one future per row
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_row = {
                executor.submit(self.download_single_image, row): row
                for row in self.df.iterrows()
            }

            # Process the results with a progress bar
            with tqdm(total=len(self.df), desc="📥 Downloading", unit="img") as pbar:
                for future in concurrent.futures.as_completed(future_to_row):
                    idx, data = future_to_row[future]

                    try:
                        success, _, message = future.result()

                        if success:
                            # Record where the image lives on disk
                            filename = self.url_to_filename(data[config.column_url_image])
                            df_local.loc[idx, config.column_local_image_path] = str(self.images_dir / filename)
                            df_local.loc[idx, 'download_success'] = True

                        pbar.set_postfix({
                            'OK': self.stats['downloaded'],
                            'Skip': self.stats['skipped'],
                            'Fail': self.stats['failed']
                        })
                        pbar.update(1)

                    except Exception as e:
                        print(f"❌ Unexpected error for index {idx}: {e}")
                        self._count('failed')
                        pbar.update(1)

        elapsed_time = time.time() - start_time

        # Final statistics
        print("\n" + "=" * 60)
        print("📊 DOWNLOAD STATISTICS")
        print("=" * 60)
        print(f"✅ Downloaded: {self.stats['downloaded']}")
        print(f"⏭️ Skipped (already present): {self.stats['skipped']}")
        print(f"❌ Failed: {self.stats['failed']}")
        print(f"📊 Total: {self.stats['total']}")
        print(f"⏱️ Time elapsed: {elapsed_time:.1f}s")

        # Guard against an empty dataset (would otherwise raise ZeroDivisionError)
        if self.stats['total'] > 0:
            success_rate = (self.stats['downloaded'] + self.stats['skipped']) / self.stats['total'] * 100
            print(f"🎯 Success rate: {success_rate:.1f}%")

        if self.stats['downloaded'] > 0:
            avg_time = elapsed_time / self.stats['downloaded']
            print(f"⚡ Average time per image: {avg_time:.2f}s")

        # Persist the updated DataFrame (NOTE: overwrites the source CSV path)
        output_path = config.local_dataset_path
        df_local.to_csv(output_path, index=False)
        print(f"💾 Updated dataset saved: {output_path}")

        return df_local
192
+
193
def main():
    """Entry point: load the dataset CSV, filter unusable rows, download all images."""
    print("🚀 STARTING IMAGE DOWNLOADER")
    print("=" * 60)

    # Load the dataset and drop rows whose color label is 'unknown'
    df = pd.read_csv(config.local_dataset_path)
    df = df[df['color'] != 'unknown']

    # Create the downloader
    downloader = ImageDownloader(
        df=df,
        images_dir=config.images_dir,
        max_workers=8,
        timeout=10
    )

    # Download all images; the returned DataFrame (with local paths) is also
    # saved to disk by download_all_images, so no need to keep it here.
    downloader.download_all_images()

    print("\n🎉 DOWNLOAD COMPLETED!")
    print("💡 You can now use the local images for training.")
215
+
216
+ if __name__ == "__main__":
217
+ main()