Samfredoly committed on
Commit 26bebee · verified · 1 Parent(s): 406a380

Update app.py

Files changed (1)
  1. app.py +73 -15
app.py CHANGED
@@ -14,7 +14,7 @@ ALL_REPO_ID = "samfred2/ALL"
 ATO_REPO_ID = "samfred2/ATO"
 OUTPUT_REPO_ID = "samfred2/ALL2"
 OUTPUT_DIR = "processed_files"
-HF_TOKEN = os.getenv("HF_TOKEN", "")
+HF_TOKEN = os.getenv("HF_TOKEN", "x")
 MAX_UPLOADS_PER_HOUR = 128
 RATE_LIMIT_DELAY = 3600 # 1 hour in seconds

@@ -187,9 +187,11 @@ def run_processing_thread():

 def process_datasets():
     """
-    Downloads file lists, matches them, downloads the content, integrates
-    the transcription data into the full course files, and uploads to samfred2/ALL2.
-    Processes entire dataset with rate limiting (128 files/hour, 429 error handling).
+    Two-phase processing:
+    1. Process matched pairs (ALL files with corresponding ATO transcriptions)
+    2. Upload remaining ALL files without transcriptions
+
+    With rate limiting (128 files/hour, 429 error handling).
     """

     # Load progress
@@ -230,13 +232,6 @@ def process_datasets():
         match_map[ato_file] = matching_all_file

     logger.info(f"Found {len(match_map)} matching pairs.")
-    processing_state['total'] = len(match_map)
-
-    # 3. Process the matched files
-    logger.info("--- 3. Downloading, Integrating, and Uploading Files ---")
-    logger.info(f"Total pairs to process: {len(match_map)}")
-    logger.info(f"Already processed: {len(progress['processed'])}")
-    logger.info(f"Already uploaded: {len(progress['uploaded'])}")

     # Create temporary directories for downloads
     all_download_dir = os.path.join(OUTPUT_DIR, "all_raw")
@@ -244,8 +239,17 @@
     os.makedirs(all_download_dir, exist_ok=True)
     os.makedirs(ato_download_dir, exist_ok=True)

-    # Iterate over ALL matched pairs
+    # ============================================================
+    # PHASE 1: Process matched pairs (with transcriptions)
+    # ============================================================
+    logger.info("--- PHASE 1: Processing matched pairs (files with transcriptions) ---")
+    logger.info(f"Total pairs to process: {len(match_map)}")
+    logger.info(f"Already processed: {len(progress['processed'])}")
+    logger.info(f"Already uploaded: {len(progress['uploaded'])}")
+
     processed_count = 0
+    matched_all_files = set() # Track which ALL files were processed in phase 1
+
     for ato_filename, all_filename in match_map.items():
         if not processing_state['running']:
             logger.info("Processing stopped by user")
@@ -254,6 +258,7 @@
         # Skip if already processed
         if all_filename in progress['processed']:
             logger.info(f"Skipping already processed: {all_filename}")
+            matched_all_files.add(all_filename)
             continue

         processing_state['current_file'] = all_filename
@@ -296,15 +301,68 @@
         processing_state['uploaded'] += 1

         progress['processed'].append(all_filename)
+        matched_all_files.add(all_filename)
         save_progress(progress_file, progress)
         processed_count += 1
         processing_state['processed'] = processed_count

         logger.info(f"Progress: {processed_count}/{len(match_map)} | Uploaded: {len(progress['uploaded'])}")

-    logger.info("--- Process Complete ---")
-    logger.info(f"Total processed: {len(progress['processed'])}")
-    logger.info(f"Total uploaded: {len(progress['uploaded'])}")
+    logger.info("--- PHASE 1 Complete ---")
+    logger.info(f"Phase 1 processed: {processed_count} files with transcriptions")
+
+    # ============================================================
+    # PHASE 2: Upload remaining ALL files without transcriptions
+    # ============================================================
+    logger.info("--- PHASE 2: Uploading remaining ALL files without transcriptions ---")
+
+    # Find files in ALL that don't have matches (no transcription)
+    remaining_files = [f for f in all_json_files if f not in matched_all_files and f not in progress['uploaded']]
+    logger.info(f"Found {len(remaining_files)} files without transcriptions to upload")
+
+    remaining_count = 0
+    for all_filename in remaining_files:
+        if not processing_state['running']:
+            logger.info("Processing stopped by user during phase 2")
+            break
+
+        processing_state['current_file'] = all_filename
+        logger.info(f"Uploading remaining file {remaining_count + 1}/{len(remaining_files)}: {all_filename}")
+
+        # Download ALL file
+        all_local_path = download_file(ALL_REPO_ID, all_filename, all_download_dir)
+        if not all_local_path:
+            continue
+
+        # Load and prepare file
+        all_data = load_json_file(all_local_path)
+        if not all_data:
+            continue
+
+        # Save locally (no transcription added)
+        final_output_path = os.path.join(OUTPUT_DIR, all_filename)
+        with open(final_output_path, 'w') as f:
+            json.dump(all_data, f, indent=4)
+
+        logger.info(f"Saved locally to {final_output_path}")
+
+        # Upload to samfred2/ALL2
+        upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state)
+        progress['uploaded'].append(all_filename)
+        processing_state['uploaded'] += 1
+
+        save_progress(progress_file, progress)
+        remaining_count += 1
+
+        logger.info(f"Phase 2 Progress: {remaining_count}/{len(remaining_files)} | Total Uploaded: {len(progress['uploaded'])}")
+
+    logger.info("--- PHASE 2 Complete ---")
+    logger.info(f"Phase 2 uploaded: {remaining_count} files without transcriptions")
+
+    logger.info("=== ALL PROCESSING COMPLETE ===")
+    logger.info(f"Total processed (with transcriptions): {len(progress['processed'])}")
+    logger.info(f"Total uploaded (all files): {len(progress['uploaded'])}")
+    logger.info(f"Final stats: {len(progress['processed'])} with transcriptions + {remaining_count} without transcriptions = {len(progress['uploaded'])} total")

 if __name__ == "__main__":
     import sys
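
Both phases route uploads through upload_file_with_rate_limit(api, final_output_path, all_filename, upload_state), whose body is not part of this diff. As a reading aid only, the sketch below shows one way such a helper could enforce the MAX_UPLOADS_PER_HOUR budget and the 429 handling mentioned in the docstring. It assumes huggingface_hub's HfApi.upload_file against a dataset repo and an upload_state dict with hypothetical "count" and "window_start" keys; it is not the implementation in app.py.

# Illustrative sketch only -- not the helper from app.py in this commit.
# Assumes the caller created upload_state = {"count": 0, "window_start": time.time()}.
import time
import logging

from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError

logger = logging.getLogger(__name__)

MAX_UPLOADS_PER_HOUR = 128
RATE_LIMIT_DELAY = 3600  # 1 hour in seconds
OUTPUT_REPO_ID = "samfred2/ALL2"

def upload_file_with_rate_limit(api: HfApi, local_path: str, path_in_repo: str, upload_state: dict) -> None:
    # Sleep out the remainder of the hour once the per-hour budget is spent, then reset the window.
    if upload_state["count"] >= MAX_UPLOADS_PER_HOUR:
        elapsed = time.time() - upload_state["window_start"]
        if elapsed < RATE_LIMIT_DELAY:
            wait = RATE_LIMIT_DELAY - elapsed
            logger.info(f"Hourly upload limit reached, sleeping {wait:.0f}s")
            time.sleep(wait)
        upload_state["count"] = 0
        upload_state["window_start"] = time.time()

    while True:
        try:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=OUTPUT_REPO_ID,
                repo_type="dataset",
            )
            upload_state["count"] += 1
            return
        except HfHubHTTPError as err:
            # On HTTP 429 the Hub is rate limiting us: back off for a full window, then retry.
            if err.response is not None and err.response.status_code == 429:
                logger.warning("Got 429 from the Hub, backing off before retrying")
                time.sleep(RATE_LIMIT_DELAY)
                upload_state["count"] = 0
                upload_state["window_start"] = time.time()
                continue
            raise

Passing the same upload_state dict to every call would let phase 1 and phase 2 share a single hourly window, which is what the progress logging above appears to assume.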