zhimin-z
committed on
Commit
·
dc7d314
1
Parent(s):
fe87ed7
add
Browse files
msr.py
CHANGED
|
@@ -56,8 +56,7 @@ GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
|
|
| 56 |
# Streaming batch configuration
|
| 57 |
BATCH_SIZE_DAYS = 1 # Process 1 day at a time (~24 hourly files)
|
| 58 |
|
| 59 |
-
#
|
| 60 |
-
DOWNLOAD_RETRY_DELAY = 2
|
| 61 |
MAX_RETRIES = 5
|
| 62 |
|
| 63 |
# Upload configuration
|
|
@@ -135,40 +134,22 @@ def get_hf_token():
|
|
| 135 |
# =============================================================================
|
| 136 |
|
| 137 |
def download_file(url):
|
| 138 |
-
"""Download a GHArchive file with
|
| 139 |
filename = url.split("/")[-1]
|
| 140 |
filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
|
| 141 |
|
| 142 |
if os.path.exists(filepath):
|
| 143 |
return True
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
# 404 means the file doesn't exist in GHArchive - skip without retry
|
| 155 |
-
if e.response.status_code == 404:
|
| 156 |
-
if attempt == 0: # Only log once, not for each retry
|
| 157 |
-
print(f" ⚠ {filename}: Not available (404) - skipping")
|
| 158 |
-
return False
|
| 159 |
-
|
| 160 |
-
# Other HTTP errors (5xx, etc.) should be retried
|
| 161 |
-
wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
|
| 162 |
-
print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
|
| 163 |
-
time.sleep(wait_time)
|
| 164 |
-
|
| 165 |
-
except Exception as e:
|
| 166 |
-
# Network errors, timeouts, etc. should be retried
|
| 167 |
-
wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
|
| 168 |
-
print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
|
| 169 |
-
time.sleep(wait_time)
|
| 170 |
-
|
| 171 |
-
return False
|
| 172 |
|
| 173 |
|
| 174 |
def download_all_gharchive_data():
|
|
|
|
| 56 |
# Streaming batch configuration
|
| 57 |
BATCH_SIZE_DAYS = 1 # Process 1 day at a time (~24 hourly files)
|
| 58 |
|
| 59 |
+
# Retry configuration
|
|
|
|
| 60 |
MAX_RETRIES = 5
|
| 61 |
|
| 62 |
# Upload configuration
|
|
|
|
| 134 |
# =============================================================================
|
| 135 |
|
| 136 |
def download_file(url):
|
| 137 |
+
"""Download a GHArchive file with a single attempt."""
|
| 138 |
filename = url.split("/")[-1]
|
| 139 |
filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
|
| 140 |
|
| 141 |
if os.path.exists(filepath):
|
| 142 |
return True
|
| 143 |
|
| 144 |
+
try:
|
| 145 |
+
response = requests.get(url, timeout=30)
|
| 146 |
+
response.raise_for_status()
|
| 147 |
+
with open(filepath, "wb") as f:
|
| 148 |
+
f.write(response.content)
|
| 149 |
+
return True
|
| 150 |
+
except Exception as e:
|
| 151 |
+
print(f" ⚠ {filename}: {e}")
|
| 152 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
def download_all_gharchive_data():
|