Samfredoly committed on
Commit
ac4731a
·
verified ·
1 Parent(s): 26bebee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -6
app.py CHANGED
@@ -14,7 +14,7 @@ ALL_REPO_ID = "samfred2/ALL"
14
  ATO_REPO_ID = "samfred2/ATO"
15
  OUTPUT_REPO_ID = "samfred2/ALL2"
16
  OUTPUT_DIR = "processed_files"
17
- HF_TOKEN = os.getenv("HF_TOKEN", "x")
18
  MAX_UPLOADS_PER_HOUR = 128
19
  RATE_LIMIT_DELAY = 3600 # 1 hour in seconds
20
 
@@ -92,10 +92,11 @@ def download_file(repo_id, filename, local_dir):
92
  logger.error(f"Could not download {filename}. Error: {e}")
93
  return None
94
 
95
- def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state):
96
  """
97
  Uploads a file to HF with rate limiting (128 files per hour).
98
  Handles 429 errors by waiting an hour and resuming.
 
99
  """
100
  while True:
101
  try:
@@ -106,6 +107,18 @@ def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state):
106
  wait_time = RATE_LIMIT_DELAY - elapsed
107
  wait_until = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
108
  logger.info(f"Rate limit reached ({MAX_UPLOADS_PER_HOUR} uploads/hour). Waiting until {wait_until}")
 
 
 
 
 
 
 
 
 
 
 
 
109
  time.sleep(wait_time)
110
  upload_state['hour_start'] = time.time()
111
  upload_state['count'] = 0
@@ -125,6 +138,19 @@ def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state):
125
  if "429" in str(e) or "rate" in str(e).lower():
126
  wait_until = datetime.datetime.now() + datetime.timedelta(seconds=RATE_LIMIT_DELAY)
127
  logger.warning(f"Rate limit hit (429). Waiting 1 hour until {wait_until}")
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  time.sleep(RATE_LIMIT_DELAY)
129
  upload_state['hour_start'] = time.time()
130
  upload_state['count'] = 0
@@ -161,8 +187,8 @@ def load_progress(progress_file):
161
  with open(progress_file, 'r') as f:
162
  return json.load(f)
163
  except:
164
- return {'processed': [], 'uploaded': []}
165
- return {'processed': [], 'uploaded': []}
166
 
167
  def save_progress(progress_file, progress):
168
  """Save progress tracking file."""
@@ -296,7 +322,7 @@ def process_datasets():
296
 
297
  # e. Upload to samfred2/ALL2
298
  if all_filename not in progress['uploaded']:
299
- upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state)
300
  progress['uploaded'].append(all_filename)
301
  processing_state['uploaded'] += 1
302
 
@@ -347,7 +373,7 @@ def process_datasets():
347
  logger.info(f"Saved locally to {final_output_path}")
348
 
349
  # Upload to samfred2/ALL2
350
- upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state)
351
  progress['uploaded'].append(all_filename)
352
  processing_state['uploaded'] += 1
353
 
 
14
  ATO_REPO_ID = "samfred2/ATO"
15
  OUTPUT_REPO_ID = "samfred2/ALL2"
16
  OUTPUT_DIR = "processed_files"
17
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
18
  MAX_UPLOADS_PER_HOUR = 128
19
  RATE_LIMIT_DELAY = 3600 # 1 hour in seconds
20
 
 
92
  logger.error(f"Could not download {filename}. Error: {e}")
93
  return None
94
 
95
+ def upload_file_with_rate_limit(api, file_path, path_in_repo, upload_state, progress=None):
96
  """
97
  Uploads a file to HF with rate limiting (128 files per hour).
98
  Handles 429 errors by waiting an hour and resuming.
99
+ Tracks rate limit events in progress file.
100
  """
101
  while True:
102
  try:
 
107
  wait_time = RATE_LIMIT_DELAY - elapsed
108
  wait_until = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
109
  logger.info(f"Rate limit reached ({MAX_UPLOADS_PER_HOUR} uploads/hour). Waiting until {wait_until}")
110
+
111
+ # Track rate limit event
112
+ if progress is not None:
113
+ event = {
114
+ 'timestamp': datetime.datetime.now().isoformat(),
115
+ 'type': 'hourly_limit_reached',
116
+ 'reason': f'Reached {MAX_UPLOADS_PER_HOUR} uploads/hour',
117
+ 'wait_seconds': int(wait_time),
118
+ 'resume_time': wait_until.isoformat()
119
+ }
120
+ progress.setdefault('rate_limit_events', []).append(event)
121
+
122
  time.sleep(wait_time)
123
  upload_state['hour_start'] = time.time()
124
  upload_state['count'] = 0
 
138
  if "429" in str(e) or "rate" in str(e).lower():
139
  wait_until = datetime.datetime.now() + datetime.timedelta(seconds=RATE_LIMIT_DELAY)
140
  logger.warning(f"Rate limit hit (429). Waiting 1 hour until {wait_until}")
141
+
142
+ # Track rate limit error event
143
+ if progress is not None:
144
+ event = {
145
+ 'timestamp': datetime.datetime.now().isoformat(),
146
+ 'type': 'http_429_error',
147
+ 'reason': 'HTTP 429 Too Many Requests from HF',
148
+ 'wait_seconds': RATE_LIMIT_DELAY,
149
+ 'resume_time': wait_until.isoformat(),
150
+ 'file': path_in_repo
151
+ }
152
+ progress.setdefault('rate_limit_events', []).append(event)
153
+
154
  time.sleep(RATE_LIMIT_DELAY)
155
  upload_state['hour_start'] = time.time()
156
  upload_state['count'] = 0
 
187
  with open(progress_file, 'r') as f:
188
  return json.load(f)
189
  except:
190
+ return {'processed': [], 'uploaded': [], 'rate_limit_events': []}
191
+ return {'processed': [], 'uploaded': [], 'rate_limit_events': []}
192
 
193
  def save_progress(progress_file, progress):
194
  """Save progress tracking file."""
 
322
 
323
  # e. Upload to samfred2/ALL2
324
  if all_filename not in progress['uploaded']:
325
+ upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state, progress)
326
  progress['uploaded'].append(all_filename)
327
  processing_state['uploaded'] += 1
328
 
 
373
  logger.info(f"Saved locally to {final_output_path}")
374
 
375
  # Upload to samfred2/ALL2
376
+ upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state, progress)
377
  progress['uploaded'].append(all_filename)
378
  processing_state['uploaded'] += 1
379