Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,7 +14,7 @@ ALL_REPO_ID = "samfred2/ALL"
|
|
| 14 |
ATO_REPO_ID = "samfred2/ATO"
|
| 15 |
OUTPUT_REPO_ID = "samfred2/ALL2"
|
| 16 |
OUTPUT_DIR = "processed_files"
|
| 17 |
-
HF_TOKEN = os.getenv("HF_TOKEN", "<redacted>")
|
| 18 |
MAX_UPLOADS_PER_HOUR = 128
|
| 19 |
RATE_LIMIT_DELAY = 3600 # 1 hour in seconds
|
| 20 |
|
|
@@ -92,10 +92,11 @@ def download_file(repo_id, filename, local_dir):
|
|
| 92 |
logger.error(f"Could not download {filename}. Error: {e}")
|
| 93 |
return None
|
| 94 |
|
| 95 |
-
def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state):
|
| 96 |
"""
|
| 97 |
Uploads a file to HF with rate limiting (128 files per hour).
|
| 98 |
Handles 429 errors by waiting an hour and resuming.
|
|
|
|
| 99 |
"""
|
| 100 |
while True:
|
| 101 |
try:
|
|
@@ -106,6 +107,18 @@ def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state):
|
|
| 106 |
wait_time = RATE_LIMIT_DELAY - elapsed
|
| 107 |
wait_until = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
|
| 108 |
logger.info(f"Rate limit reached ({MAX_UPLOADS_PER_HOUR} uploads/hour). Waiting until {wait_until}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
time.sleep(wait_time)
|
| 110 |
upload_state['hour_start'] = time.time()
|
| 111 |
upload_state['count'] = 0
|
|
@@ -125,6 +138,19 @@ def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state):
|
|
| 125 |
if "429" in str(e) or "rate" in str(e).lower():
|
| 126 |
wait_until = datetime.datetime.now() + datetime.timedelta(seconds=RATE_LIMIT_DELAY)
|
| 127 |
logger.warning(f"Rate limit hit (429). Waiting 1 hour until {wait_until}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
time.sleep(RATE_LIMIT_DELAY)
|
| 129 |
upload_state['hour_start'] = time.time()
|
| 130 |
upload_state['count'] = 0
|
|
@@ -161,8 +187,8 @@ def load_progress(progress_file):
|
|
| 161 |
with open(progress_file, 'r') as f:
|
| 162 |
return json.load(f)
|
| 163 |
except:
|
| 164 |
-
return {'processed': [], 'uploaded': []}
|
| 165 |
-
return {'processed': [], 'uploaded': []}
|
| 166 |
|
| 167 |
def save_progress(progress_file, progress):
|
| 168 |
"""Save progress tracking file."""
|
|
@@ -296,7 +322,7 @@ def process_datasets():
|
|
| 296 |
|
| 297 |
# e. Upload to samfred2/ALL2
|
| 298 |
if all_filename not in progress['uploaded']:
|
| 299 |
-
upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state)
|
| 300 |
progress['uploaded'].append(all_filename)
|
| 301 |
processing_state['uploaded'] += 1
|
| 302 |
|
|
@@ -347,7 +373,7 @@ def process_datasets():
|
|
| 347 |
logger.info(f"Saved locally to {final_output_path}")
|
| 348 |
|
| 349 |
# Upload to samfred2/ALL2
|
| 350 |
-
upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state)
|
| 351 |
progress['uploaded'].append(all_filename)
|
| 352 |
processing_state['uploaded'] += 1
|
| 353 |
|
|
|
|
| 14 |
ATO_REPO_ID = "samfred2/ATO"
|
| 15 |
OUTPUT_REPO_ID = "samfred2/ALL2"
|
| 16 |
OUTPUT_DIR = "processed_files"
|
| 17 |
+
HF_TOKEN = os.getenv("HF_TOKEN", "")
|
| 18 |
MAX_UPLOADS_PER_HOUR = 128
|
| 19 |
RATE_LIMIT_DELAY = 3600 # 1 hour in seconds
|
| 20 |
|
|
|
|
| 92 |
logger.error(f"Could not download {filename}. Error: {e}")
|
| 93 |
return None
|
| 94 |
|
| 95 |
+
def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state, progress=None):
|
| 96 |
"""
|
| 97 |
Uploads a file to HF with rate limiting (128 files per hour).
|
| 98 |
Handles 429 errors by waiting an hour and resuming.
|
| 99 |
+
Tracks rate limit events in progress file.
|
| 100 |
"""
|
| 101 |
while True:
|
| 102 |
try:
|
|
|
|
| 107 |
wait_time = RATE_LIMIT_DELAY - elapsed
|
| 108 |
wait_until = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
|
| 109 |
logger.info(f"Rate limit reached ({MAX_UPLOADS_PER_HOUR} uploads/hour). Waiting until {wait_until}")
|
| 110 |
+
|
| 111 |
+
# Track rate limit event
|
| 112 |
+
if progress is not None:
|
| 113 |
+
event = {
|
| 114 |
+
'timestamp': datetime.datetime.now().isoformat(),
|
| 115 |
+
'type': 'hourly_limit_reached',
|
| 116 |
+
'reason': f'Reached {MAX_UPLOADS_PER_HOUR} uploads/hour',
|
| 117 |
+
'wait_seconds': int(wait_time),
|
| 118 |
+
'resume_time': wait_until.isoformat()
|
| 119 |
+
}
|
| 120 |
+
progress.setdefault('rate_limit_events', []).append(event)
|
| 121 |
+
|
| 122 |
time.sleep(wait_time)
|
| 123 |
upload_state['hour_start'] = time.time()
|
| 124 |
upload_state['count'] = 0
|
|
|
|
| 138 |
if "429" in str(e) or "rate" in str(e).lower():
|
| 139 |
wait_until = datetime.datetime.now() + datetime.timedelta(seconds=RATE_LIMIT_DELAY)
|
| 140 |
logger.warning(f"Rate limit hit (429). Waiting 1 hour until {wait_until}")
|
| 141 |
+
|
| 142 |
+
# Track rate limit error event
|
| 143 |
+
if progress is not None:
|
| 144 |
+
event = {
|
| 145 |
+
'timestamp': datetime.datetime.now().isoformat(),
|
| 146 |
+
'type': 'http_429_error',
|
| 147 |
+
'reason': 'HTTP 429 Too Many Requests from HF',
|
| 148 |
+
'wait_seconds': RATE_LIMIT_DELAY,
|
| 149 |
+
'resume_time': wait_until.isoformat(),
|
| 150 |
+
'file': path_in_repo
|
| 151 |
+
}
|
| 152 |
+
progress.setdefault('rate_limit_events', []).append(event)
|
| 153 |
+
|
| 154 |
time.sleep(RATE_LIMIT_DELAY)
|
| 155 |
upload_state['hour_start'] = time.time()
|
| 156 |
upload_state['count'] = 0
|
|
|
|
| 187 |
with open(progress_file, 'r') as f:
|
| 188 |
return json.load(f)
|
| 189 |
except:
|
| 190 |
+
return {'processed': [], 'uploaded': [], 'rate_limit_events': []}
|
| 191 |
+
return {'processed': [], 'uploaded': [], 'rate_limit_events': []}
|
| 192 |
|
| 193 |
def save_progress(progress_file, progress):
|
| 194 |
"""Save progress tracking file."""
|
|
|
|
| 322 |
|
| 323 |
# e. Upload to samfred2/ALL2
|
| 324 |
if all_filename not in progress['uploaded']:
|
| 325 |
+
upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state, progress)
|
| 326 |
progress['uploaded'].append(all_filename)
|
| 327 |
processing_state['uploaded'] += 1
|
| 328 |
|
|
|
|
| 373 |
logger.info(f"Saved locally to {final_output_path}")
|
| 374 |
|
| 375 |
# Upload to samfred2/ALL2
|
| 376 |
+
upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state, progress)
|
| 377 |
progress['uploaded'].append(all_filename)
|
| 378 |
processing_state['uploaded'] += 1
|
| 379 |
|