Samfredoly committed on
Commit 26bebee · verified · 1 Parent(s): 406a380

Update app.py

Files changed (1)
  1. app.py +73 -15
app.py CHANGED
@@ -14,7 +14,7 @@ ALL_REPO_ID = "samfred2/ALL"
 ATO_REPO_ID = "samfred2/ATO"
 OUTPUT_REPO_ID = "samfred2/ALL2"
 OUTPUT_DIR = "processed_files"
-HF_TOKEN = os.getenv("HF_TOKEN", "")
+HF_TOKEN = os.getenv("HF_TOKEN", "x")
 MAX_UPLOADS_PER_HOUR = 128
 RATE_LIMIT_DELAY = 3600 # 1 hour in seconds

@@ -187,9 +187,11 @@ def run_processing_thread():

 def process_datasets():
     """
-    Downloads file lists, matches them, downloads the content, integrates
-    the transcription data into the full course files, and uploads to samfred2/ALL2.
-    Processes entire dataset with rate limiting (128 files/hour, 429 error handling).
+    Two-phase processing:
+    1. Process matched pairs (ALL files with corresponding ATO transcriptions)
+    2. Upload remaining ALL files without transcriptions
+
+    With rate limiting (128 files/hour, 429 error handling).
     """

     # Load progress
@@ -230,13 +232,6 @@ def process_datasets():
         match_map[ato_file] = matching_all_file

     logger.info(f"Found {len(match_map)} matching pairs.")
-    processing_state['total'] = len(match_map)
-
-    # 3. Process the matched files
-    logger.info("--- 3. Downloading, Integrating, and Uploading Files ---")
-    logger.info(f"Total pairs to process: {len(match_map)}")
-    logger.info(f"Already processed: {len(progress['processed'])}")
-    logger.info(f"Already uploaded: {len(progress['uploaded'])}")

     # Create temporary directories for downloads
     all_download_dir = os.path.join(OUTPUT_DIR, "all_raw")
@@ -244,8 +239,17 @@
     os.makedirs(all_download_dir, exist_ok=True)
     os.makedirs(ato_download_dir, exist_ok=True)

-    # Iterate over ALL matched pairs
+    # ============================================================
+    # PHASE 1: Process matched pairs (with transcriptions)
+    # ============================================================
+    logger.info("--- PHASE 1: Processing matched pairs (files with transcriptions) ---")
+    logger.info(f"Total pairs to process: {len(match_map)}")
+    logger.info(f"Already processed: {len(progress['processed'])}")
+    logger.info(f"Already uploaded: {len(progress['uploaded'])}")
+
     processed_count = 0
+    matched_all_files = set() # Track which ALL files were processed in phase 1
+
     for ato_filename, all_filename in match_map.items():
         if not processing_state['running']:
             logger.info("Processing stopped by user")
@@ -254,6 +258,7 @@
         # Skip if already processed
         if all_filename in progress['processed']:
             logger.info(f"Skipping already processed: {all_filename}")
+            matched_all_files.add(all_filename)
             continue

         processing_state['current_file'] = all_filename
@@ -296,15 +301,68 @@
         processing_state['uploaded'] += 1

         progress['processed'].append(all_filename)
+        matched_all_files.add(all_filename)
         save_progress(progress_file, progress)
         processed_count += 1
         processing_state['processed'] = processed_count

         logger.info(f"Progress: {processed_count}/{len(match_map)} | Uploaded: {len(progress['uploaded'])}")

-    logger.info("--- Process Complete ---")
-    logger.info(f"Total processed: {len(progress['processed'])}")
-    logger.info(f"Total uploaded: {len(progress['uploaded'])}")
+    logger.info("--- PHASE 1 Complete ---")
+    logger.info(f"Phase 1 processed: {processed_count} files with transcriptions")
+
+    # ============================================================
+    # PHASE 2: Upload remaining ALL files without transcriptions
+    # ============================================================
+    logger.info("--- PHASE 2: Uploading remaining ALL files without transcriptions ---")
+
+    # Find files in ALL that don't have matches (no transcription)
+    remaining_files = [f for f in all_json_files if f not in matched_all_files and f not in progress['uploaded']]
+    logger.info(f"Found {len(remaining_files)} files without transcriptions to upload")
+
+    remaining_count = 0
+    for all_filename in remaining_files:
+        if not processing_state['running']:
+            logger.info("Processing stopped by user during phase 2")
+            break
+
+        processing_state['current_file'] = all_filename
+        logger.info(f"Uploading remaining file {remaining_count + 1}/{len(remaining_files)}: {all_filename}")
+
+        # Download ALL file
+        all_local_path = download_file(ALL_REPO_ID, all_filename, all_download_dir)
+        if not all_local_path:
+            continue
+
+        # Load and prepare file
+        all_data = load_json_file(all_local_path)
+        if not all_data:
+            continue
+
+        # Save locally (no transcription added)
+        final_output_path = os.path.join(OUTPUT_DIR, all_filename)
+        with open(final_output_path, 'w') as f:
+            json.dump(all_data, f, indent=4)
+
+        logger.info(f"Saved locally to {final_output_path}")
+
+        # Upload to samfred2/ALL2
+        upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state)
+        progress['uploaded'].append(all_filename)
+        processing_state['uploaded'] += 1
+
+        save_progress(progress_file, progress)
+        remaining_count += 1
+
+        logger.info(f"Phase 2 Progress: {remaining_count}/{len(remaining_files)} | Total Uploaded: {len(progress['uploaded'])}")
+
+    logger.info("--- PHASE 2 Complete ---")
+    logger.info(f"Phase 2 uploaded: {remaining_count} files without transcriptions")
+
+    logger.info("=== ALL PROCESSING COMPLETE ===")
+    logger.info(f"Total processed (with transcriptions): {len(progress['processed'])}")
+    logger.info(f"Total uploaded (all files): {len(progress['uploaded'])}")
+    logger.info(f"Final stats: {len(progress['processed'])} with transcriptions + {remaining_count} without transcriptions = {len(progress['uploaded'])} total")

 if __name__ == "__main__":
     import sys
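
Both phases route uploads through upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state), whose body is not part of this diff. As a reading aid only, the sketch below shows one way such a helper could enforce the MAX_UPLOADS_PER_HOUR budget and the 429 handling mentioned in the docstring. It assumes huggingface_hub's HfApi.upload_file against a dataset repo and an upload_state dict with hypothetical "count" and "window_start" keys; it is not the implementation in app.py.

# Illustrative sketch only -- not the helper from app.py in this commit.
# Assumes the caller created upload_state = {"count": 0, "window_start": time.time()}.
import time
import logging

from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError

logger = logging.getLogger(__name__)

MAX_UPLOADS_PER_HOUR = 128
RATE_LIMIT_DELAY = 3600  # 1 hour in seconds
OUTPUT_REPO_ID = "samfred2/ALL2"

def upload_file_with_rate_limit(api: HfApi, local_path: str, path_in_repo: str, upload_state: dict) -> None:
    # Sleep out the remainder of the hour once the per-hour budget is spent, then reset the window.
    if upload_state["count"] >= MAX_UPLOADS_PER_HOUR:
        elapsed = time.time() - upload_state["window_start"]
        if elapsed < RATE_LIMIT_DELAY:
            wait = RATE_LIMIT_DELAY - elapsed
            logger.info(f"Hourly upload limit reached, sleeping {wait:.0f}s")
            time.sleep(wait)
        upload_state["count"] = 0
        upload_state["window_start"] = time.time()

    while True:
        try:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=OUTPUT_REPO_ID,
                repo_type="dataset",
            )
            upload_state["count"] += 1
            return
        except HfHubHTTPError as err:
            # On HTTP 429 the Hub is rate limiting us: back off for a full window, then retry.
            if err.response is not None and err.response.status_code == 429:
                logger.warning("Got 429 from the Hub, backing off before retrying")
                time.sleep(RATE_LIMIT_DELAY)
                upload_state["count"] = 0
                upload_state["window_start"] = time.time()
                continue
            raise

Passing the same upload_state dict to every call would let phase 1 and phase 2 share a single hourly window, which is what the progress logging above appears to assume.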