zhimin-z committed on
Commit
dc7d314
·
1 Parent(s): fe87ed7
Files changed (1) hide show
  1. msr.py +11 -30
msr.py CHANGED
@@ -56,8 +56,7 @@ GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
56
  # Streaming batch configuration
57
  BATCH_SIZE_DAYS = 1 # Process 1 day at a time (~24 hourly files)
58
 
59
- # Download configuration
60
- DOWNLOAD_RETRY_DELAY = 2
61
  MAX_RETRIES = 5
62
 
63
  # Upload configuration
@@ -135,40 +134,22 @@ def get_hf_token():
135
  # =============================================================================
136
 
137
  def download_file(url):
138
- """Download a GHArchive file with retry logic."""
139
  filename = url.split("/")[-1]
140
  filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
141
 
142
  if os.path.exists(filepath):
143
  return True
144
 
145
- for attempt in range(MAX_RETRIES):
146
- try:
147
- response = requests.get(url, timeout=30)
148
- response.raise_for_status()
149
- with open(filepath, "wb") as f:
150
- f.write(response.content)
151
- return True
152
-
153
- except requests.exceptions.HTTPError as e:
154
- # 404 means the file doesn't exist in GHArchive - skip without retry
155
- if e.response.status_code == 404:
156
- if attempt == 0: # Only log once, not for each retry
157
- print(f" ⚠ {filename}: Not available (404) - skipping")
158
- return False
159
-
160
- # Other HTTP errors (5xx, etc.) should be retried
161
- wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
162
- print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
163
- time.sleep(wait_time)
164
-
165
- except Exception as e:
166
- # Network errors, timeouts, etc. should be retried
167
- wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
168
- print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
169
- time.sleep(wait_time)
170
-
171
- return False
172
 
173
 
174
  def download_all_gharchive_data():
 
56
  # Streaming batch configuration
57
  BATCH_SIZE_DAYS = 1 # Process 1 day at a time (~24 hourly files)
58
 
59
+ # Retry configuration
 
60
  MAX_RETRIES = 5
61
 
62
  # Upload configuration
 
134
  # =============================================================================
135
 
136
  def download_file(url):
137
+ """Download a GHArchive file with a single attempt."""
138
  filename = url.split("/")[-1]
139
  filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
140
 
141
  if os.path.exists(filepath):
142
  return True
143
 
144
+ try:
145
+ response = requests.get(url, timeout=30)
146
+ response.raise_for_status()
147
+ with open(filepath, "wb") as f:
148
+ f.write(response.content)
149
+ return True
150
+ except Exception as e:
151
+ print(f" ⚠ {filename}: {e}")
152
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
 
155
  def download_all_gharchive_data():