Spaces:

Fred808
/

FLOWCAP1

Paused

App Files Files Community

Fred808 committed on Oct 29, 2025

Commit

2a6cacc

verified ·

1 Parent(s): f42f03e

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -88

app.py CHANGED Viewed

@@ -188,7 +188,7 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress
     print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {image_path.name}.")
     return None
-async def download_and_extract_zip(course_name: str) -> Optional[tuple[Path, str]]:
     """Downloads the zip file for the course and extracts its contents."""
     print(f"[{FLOW_ID}] Looking for files starting with '{course_name}' in frames/ directory...")
@@ -211,12 +211,18 @@ async def download_and_extract_zip(course_name: str) -> Optional[tuple[Path, str
             print(f"[{FLOW_ID}] No zip files found starting with '{course_name}' in frames/ directory.")
             return None, None
-        # Use the first matching file (you could modify this to process all matches)
-        repo_file_full_path = matching_files[0] # e.g., frames/DAREEFSA_full_name.zip
         # Extract the full file name from the path (e.g., DAREEFSA_full_name.zip)
         zip_full_name = Path(repo_file_full_path).name
-        print(f"[{FLOW_ID}] Found matching file: {repo_file_full_path}. Full name: {zip_full_name}")
         # Use hf_hub_download to get the file path
         zip_path = hf_hub_download(
@@ -237,12 +243,12 @@ async def download_and_extract_zip(course_name: str) -> Optional[tuple[Path, str
         print(f"[{FLOW_ID}] Extraction complete to {extract_dir}.")
-        # Return the extraction directory and the full zip file name
-        return extract_dir, zip_full_name
     except Exception as e:
         print(f"[{FLOW_ID}] Error downloading or extracting zip for {course_name}: {e}")
-        return None, None
 async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> bool:
     """Uploads the final captions JSON file to the output dataset.
@@ -275,99 +281,130 @@ async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> boo
         return False
 async def process_course_task(course_name: str):
-    """Main task to process a single course."""
-    print(f"[{FLOW_ID}] Starting processing for course: {course_name}")
-    extract_dir = None
-    zip_full_name = None
-    success = False
-    error_message = None
-    all_captions = []
-    try:
-        # download_and_extract_zip now returns a tuple: (extract_dir, zip_full_name)
-        download_result = await download_and_extract_zip(course_name)
-        if download_result is None or download_result[0] is None:
-            raise Exception("Failed to download or extract zip file.")
-        extract_dir, zip_full_name = download_result
-        # FIX: Use recursive glob to find images in subdirectories
-        image_paths = [p for p in extract_dir.glob("**/*") if p.is_file() and p.suffix.lower() in ['.jpg', '.jpeg', '.png']]
-        print(f"[{FLOW_ID}] Found {len(image_paths)} images to process.")
-        if not image_paths:
-            print(f"[{FLOW_ID}] No images found in {course_name}. Marking as complete.")
-            success = True
-        else:
-            # Initialize progress tracker
-            progress_tracker = {
-                'total': len(image_paths),
-                'completed': 0
-            }
-            print(f"[{FLOW_ID}] Starting captioning for {progress_tracker['total']} images...")
-            # Create a semaphore to limit concurrent tasks to the number of available servers
-            # This ensures we only launch as many image processing tasks as we have servers.
-            semaphore = asyncio.Semaphore(len(servers))
-            async def limited_send_image_for_captioning(image_path, course_name, progress_tracker):
-                async with semaphore:
-                    return await send_image_for_captioning(image_path, course_name, progress_tracker)
-            # Create a list of tasks for parallel captioning
-            caption_tasks = []
-            for image_path in image_paths:
-                caption_tasks.append(limited_send_image_for_captioning(image_path, course_name, progress_tracker))
-            # Run all captioning tasks concurrently
-            results = await asyncio.gather(*caption_tasks)
-            # Filter out failed results
-            all_captions = [r for r in results if r is not None]
-            # Final progress report
-            if len(all_captions) == len(image_paths):
-                print(f"[{FLOW_ID}] FINAL PROGRESS: Successfully completed all {len(all_captions)} captions.")
-                success = True
             else:
-                print(f"[{FLOW_ID}] FINAL PROGRESS: Completed with partial result: {len(all_captions)}/{len(image_paths)} captions.")
-                success = False
-            # Upload results
-            if all_captions and zip_full_name:
-                # Use the full zip file name for the upload as requested
-                print(f"[{FLOW_ID}] Uploading {len(all_captions)} captions for {zip_full_name}...")
-                if await upload_captions_to_hf(zip_full_name, all_captions):
-                    print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}.")
-                    success = True # Mark as success if upload is successful
                 else:
-                    print(f"[{FLOW_ID}] Failed to upload captions for {zip_full_name}.")
-                    success = False
-            else:
-                print(f"[{FLOW_ID}] No captions generated or zip_full_name is missing. Skipping upload.")
-                success = False
-    except Exception as e:
-        error_message = str(e)
-        print(f"[{FLOW_ID}] Critical error in process_course_task for {course_name}: {error_message}")
-    finally:
-        # Cleanup temporary files
-        if extract_dir and extract_dir.exists():
-            print(f"[{FLOW_ID}] Cleaned up temporary directory {extract_dir}.")
-            import shutil
-            shutil.rmtree(extract_dir, ignore_errors=True)
-        # Report completion to manager
-        # The original code for this part is missing but is implied by the user's output
-        # For now, we'll just print a completion message
-        print(f"[{FLOW_ID}] Reporting completion for {course_name} (Success: {success})...")
-        # In a real scenario, you would call the manager endpoint here
-        # await report_completion_to_manager(course_name, success, error_message)
-        return success
 async def report_completion(course_name: str, success: bool, error_message: Optional[str] = None):
     """Reports the task result back to the Manager Server."""

     print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {image_path.name}.")
     return None
+async def download_and_extract_zip(course_name: str, processed_files: Set[str]) -> Optional[tuple[Path, str, str]]:
     """Downloads the zip file for the course and extracts its contents."""
     print(f"[{FLOW_ID}] Looking for files starting with '{course_name}' in frames/ directory...")
             print(f"[{FLOW_ID}] No zip files found starting with '{course_name}' in frames/ directory.")
             return None, None
+        # Filter out already processed files and select the first one
+        unprocessed_files = [f for f in matching_files if f not in processed_files]
+        if not unprocessed_files:
+            print(f"[{FLOW_ID}] No new zip files found for '{course_name}'.")
+            return None, None, None
+        repo_file_full_path = unprocessed_files[0] # e.g., frames/DAREEFSA_full_name.zip
         # Extract the full file name from the path (e.g., DAREEFSA_full_name.zip)
         zip_full_name = Path(repo_file_full_path).name
+        print(f"[{FLOW_ID}] Found new matching file: {repo_file_full_path}. Full name: {zip_full_name}")
         # Use hf_hub_download to get the file path
         zip_path = hf_hub_download(
         print(f"[{FLOW_ID}] Extraction complete to {extract_dir}.")
+        # Return the extraction directory, the full zip file name, and the repo path
+        return extract_dir, zip_full_name, repo_file_full_path
     except Exception as e:
         print(f"[{FLOW_ID}] Error downloading or extracting zip for {course_name}: {e}")
+        return None, None, None
 async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> bool:
     """Uploads the final captions JSON file to the output dataset.
         return False
 async def process_course_task(course_name: str):
+    """Main task to process a single course, looping until all files are processed."""
+    print(f"[{FLOW_ID}] Starting continuous processing for course: {course_name}")
+    processed_files = set()
+    all_processed_files_log = []
+    global_success = True
+    # Loop to continuously check for new files matching the course_name prefix
+    while True:
+        extract_dir = None
+        zip_full_name = None
+        repo_file_full_path = None
+        try:
+            # download_and_extract_zip now returns a tuple: (extract_dir, zip_full_name, repo_file_full_path)
+            download_result = await download_and_extract_zip(course_name, processed_files)
+            if download_result is None or download_result[0] is None:
+                # No new files found, or an error occurred during search/download
+                if download_result is not None and download_result[0] is None and download_result[1] is None:
+                    print(f"[{FLOW_ID}] No new files found for {course_name}. Exiting loop.")
+                    break
+                else:
+                    # An error occurred during search/download
+                    raise Exception("Failed to download or extract zip file.")
+            extract_dir, zip_full_name, repo_file_full_path = download_result
+            # Add the file to the processed set immediately to avoid re-processing in the next loop
+            processed_files.add(repo_file_full_path)
+            all_processed_files_log.append(repo_file_full_path)
+            # --- Start Processing the single file ---
+            # FIX: Use recursive glob to find images in subdirectories
+            image_paths = [p for p in extract_dir.glob("**/*") if p.is_file() and p.suffix.lower() in ['.jpg', '.jpeg', '.png']]
+            print(f"[{FLOW_ID}] Found {len(image_paths)} images to process in {zip_full_name}.")
+            current_file_success = False
+            if not image_paths:
+                print(f"[{FLOW_ID}] No images found in {zip_full_name}. Marking as complete.")
+                current_file_success = True
             else:
+                # Initialize progress tracker
+                progress_tracker = {
+                    'total': len(image_paths),
+                    'completed': 0
+                }
+                print(f"[{FLOW_ID}] Starting captioning for {progress_tracker['total']} images in {zip_full_name}...")
+                # Create a semaphore to limit concurrent tasks to the number of available servers
+                semaphore = asyncio.Semaphore(len(servers))
+                async def limited_send_image_for_captioning(image_path, course_name, progress_tracker):
+                    async with semaphore:
+                        return await send_image_for_captioning(image_path, course_name, progress_tracker)
+                # Create a list of tasks for parallel captioning
+                caption_tasks = []
+                for image_path in image_paths:
+                    caption_tasks.append(limited_send_image_for_captioning(image_path, course_name, progress_tracker))
+                # Run all captioning tasks concurrently
+                results = await asyncio.gather(*caption_tasks)
+                # Filter out failed results
+                all_captions = [r for r in results if r is not None]
+                # Final progress report for the current file
+                if len(all_captions) == len(image_paths):
+                    print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Successfully completed all {len(all_captions)} captions.")
+                    current_file_success = True
                 else:
+                    print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Completed with partial result: {len(all_captions)}/{len(image_paths)} captions.")
+                    current_file_success = False
+                # Upload results
+                if all_captions and zip_full_name:
+                    # Use the full zip file name for the upload as requested
+                    print(f"[{FLOW_ID}] Uploading {len(all_captions)} captions for {zip_full_name}...")
+                    if await upload_captions_to_hf(zip_full_name, all_captions):
+                        print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}.")
+                        # If partial success, we still upload, but the overall task is marked as failure if any file failed
+                        if not current_file_success:
+                            global_success = False
+                    else:
+                        print(f"[{FLOW_ID}] Failed to upload captions for {zip_full_name}.")
+                        current_file_success = False
+                        global_success = False
+                else:
+                    print(f"[{FLOW_ID}] No captions generated or zip_full_name is missing. Skipping upload for {zip_full_name}.")
+                    current_file_success = False
+                    global_success = False
+            # --- End Processing the single file ---
+        except Exception as e:
+            error_message = str(e)
+            print(f"[{FLOW_ID}] Critical error in process_course_task for {course_name}: {error_message}")
+            global_success = False
+        finally:
+            # Cleanup temporary files for the current file
+            if extract_dir and extract_dir.exists():
+                print(f"[{FLOW_ID}] Cleaned up temporary directory {extract_dir}.")
+                import shutil
+                shutil.rmtree(extract_dir, ignore_errors=True)
+            # If an unrecoverable error occurred (e.g., during search/download), break the loop
+            if download_result is None and extract_dir is None:
+                break
+    # --- Final Report after the loop is complete ---
+    print(f"[{FLOW_ID}] All processing loops complete for {course_name}.")
+    print(f"[{FLOW_ID}] Total files processed: {len(all_processed_files_log)}")
+    print(f"[{FLOW_ID}] List of processed files: {all_processed_files_log}")
+    # Report completion to manager
+    final_error_message = error_message if not global_success else None
+    # Assuming report_completion exists and is an async function
+    # await report_completion(course_name, global_success, final_error_message)
+    return global_success
 async def report_completion(course_name: str, success: bool, error_message: Optional[str] = None):
     """Reports the task result back to the Manager Server."""