Spaces:

Fred808
/

Flow

Paused

App Files Files Community

Fred808 committed on Oct 21, 2025

Commit

b5a54de

verified ·

1 Parent(s): eb57683

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -34

app.py CHANGED Viewed

@@ -7,6 +7,8 @@ from typing import Dict, List, Set, Optional
 from urllib.parse import quote, urljoin
 from datetime import datetime
 from pathlib import Path
 from fastapi import FastAPI, BackgroundTasks, HTTPException, status
 from fastapi.responses import JSONResponse
@@ -17,6 +19,13 @@ import uvicorn
 CAPTIONS_DIR = Path("captions_data")
 CAPTIONS_DIR.mkdir(exist_ok=True)
 def get_caption_file_path(course: str) -> Path:
     """Get the path to the JSON file for storing course captions"""
     safe_name = quote(course, safe='')
@@ -87,7 +96,6 @@ CAPTION_SERVERS = [
     "https://fredalone-fredalone-8h285h.hf.space/analyze"
 ]
 MODEL_TYPE = "Florence-2-large"  # Explicitly request large model
-DATA_COLLECTION_SERVER = "https://fred808-flow.hf.space"
 # FastAPI Models
 class CourseInfo(BaseModel):
@@ -401,35 +409,49 @@ async def process_image(server: CaptionServer, course: str, image: Dict) -> Dict
     finally:
         server.busy = False
-async def submit_to_dataset(course: str, metadata_list: List[Dict]):
-    """Submit course results to dataset collection server
-
-    POSTs one JSON payload for the course to ``{DATA_COLLECTION_SERVER}/submit``.
-
-    Args:
-        course: Course name; also used to derive the grouping key sent as "course".
-        metadata_list: Caption records, forwarded unchanged under "captions".
-
-    Returns:
-        The decoded JSON body of the response, or None if any exception was
-        raised while sending the request or decoding the reply.
-    """
-    # Group by parent folder
-    # A name containing "/" is grouped by its directory part; otherwise by the
-    # text before the first "_" (the whole name when there is no "_").
-    parent_folder = os.path.dirname(course) if '/' in course else course.split('_')[0]
-    # Prepare payload
-    payload = {
-        "text": f"Completed captions for course {course}. done",
-        "course": parent_folder,
-        "metadata": {
-            "course_name": course,
-            "image_count": len(metadata_list),
-            # Local time without timezone information.
-            "completed_at": datetime.now().isoformat()
-        },
-        "captions": metadata_list
-    }
-    # A fresh HTTP session is opened for this single request.
-    async with aiohttp.ClientSession() as session:
-        try:
-            async with session.post(
-                f"{DATA_COLLECTION_SERVER}/submit",
-                json=payload
-            ) as resp:
-                # NOTE(review): the HTTP status is not checked; whatever JSON the
-                # server returns is reported as the submission result.
-                result = await resp.json()
-                print(f"✓ Dataset submission result for {course}: {result}")
-                return result
-        except Exception as e:
-            # Best effort: failures are printed and reported to the caller as None.
-            print(f"✗ Error submitting to dataset: {e}")
-            return None
 async def process_course(course: str, servers: List[CaptionServer]):
     """Process all images in a course using available servers with proper retry logic"""
@@ -462,6 +484,12 @@ async def process_course(course: str, servers: List[CaptionServer]):
     if not pending_images:
         print(f"All images already processed or failed for course {course}")
         print(f"- Processed: {len(processed_images[course])}, Failed: {len(failed_images[course])}")
         return
     print(f"Images to process: {len(pending_images)} (already processed: {len(processed_images[course])}, failed: {len(failed_images[course])})")
@@ -545,9 +573,15 @@ async def process_course(course: str, servers: List[CaptionServer]):
             print(f"\n✓ Course {course} completed with {failed_count} failed images")
         else:
             print(f"\n✓ Course {course} fully completed")
-            # Submit to dataset only when fully completed
-            print(f"Submitting {len(course_captions[course])} captions to dataset...")
-            await submit_to_dataset(course, course_captions[course])
     else:
         print(f"\n→ Course {course} partially completed: {done}/{total} processed, {failed_count} failed")
@@ -662,7 +696,8 @@ async def startup_event():
     print("Caption Coordinator API started")
     print(f"Source server: {SOURCE_SERVER}")
     print(f"Caption servers: {len(CAPTION_SERVERS)}")
-    print(f"Dataset server: {DATA_COLLECTION_SERVER}")
     # Start processing automatically (like original main())
     if auto_start_processing:

 from urllib.parse import quote, urljoin
 from datetime import datetime
 from pathlib import Path
+from datasets import Dataset, DatasetDict
+import huggingface_hub
 from fastapi import FastAPI, BackgroundTasks, HTTPException, status
 from fastapi.responses import JSONResponse
 CAPTIONS_DIR = Path("captions_data")
 CAPTIONS_DIR.mkdir(exist_ok=True)
+# Hugging Face configuration
+# HF_TOKEN: access token read from the environment; there is no default.
+HF_TOKEN = os.getenv("HF_TOKEN")
+# HF_DATASET_ID: dataset repo that captions are pushed to; defaults to
+# "fred808/helium" when the variable is unset.
+HF_DATASET_ID = os.getenv("HF_DATASET_ID", "fred808/helium")
+# Module-level check: an unset or empty HF_TOKEN aborts loading of this module
+# instead of surfacing later at the first upload attempt.
+if not HF_TOKEN:
+    raise ValueError("HF_TOKEN environment variable is required")
 def get_caption_file_path(course: str) -> Path:
     """Get the path to the JSON file for storing course captions"""
     safe_name = quote(course, safe='')
     "https://fredalone-fredalone-8h285h.hf.space/analyze"
 ]
 MODEL_TYPE = "Florence-2-large"  # Explicitly request large model
 # FastAPI Models
 class CourseInfo(BaseModel):
     finally:
         server.busy = False
+async def upload_to_huggingface(course: str, metadata_list: List[Dict]):
+    """Upload one course's captions to the Hugging Face dataset repo.
+
+    Builds a column-oriented table from ``metadata_list`` and pushes it to
+    ``HF_DATASET_ID`` as the ``train`` split of a config named after the course.
+    The login and the push are blocking network calls, so they run in a worker
+    thread to keep the event loop free while the upload is in flight.
+
+    Args:
+        course: Course identifier. "/" and " " are replaced by "_" to form the
+            dataset config name.
+        metadata_list: Caption records. Every record must provide the keys
+            "image", "caption", "server", "processing_time" and "timestamp".
+
+    Returns:
+        True if the push succeeded, False on any error. Errors (including a
+        record with a missing key) are printed and never raised.
+    """
+    # Local import: the file's top-level import block is not part of this hunk.
+    import asyncio
+    try:
+        print(f"📤 Uploading {len(metadata_list)} captions for {course} to Hugging Face...")
+        # Dataset column name -> key of the caption record it is filled from.
+        # Insertion order fixes the column order of the uploaded table.
+        column_sources = {
+            "image_filename": "image",
+            "caption": "caption",
+            "processing_server": "server",
+            "processing_time": "processing_time",
+            "timestamp": "timestamp",
+        }
+        # Prepare data for Hugging Face dataset: one list per column
+        dataset_data = {"course": [course] * len(metadata_list)}
+        for column, key in column_sources.items():
+            dataset_data[column] = [metadata[key] for metadata in metadata_list]
+        # Create dataset
+        dataset = Dataset.from_dict(dataset_data)
+        config_name = course.replace("/", "_").replace(" ", "_")
+        commit_message = f"Add captions for course {course} - {len(metadata_list)} images"
+
+        def _login_and_push() -> None:
+            # Login to Hugging Face
+            huggingface_hub.login(token=HF_TOKEN)
+            # Push to hub; every course is stored as the "train" split of its own config
+            dataset.push_to_hub(
+                HF_DATASET_ID,
+                config_name=config_name,
+                split="train",
+                commit_message=commit_message
+            )
+
+        # Both calls above block on the network; off-load them so other
+        # coroutines and API requests keep being served during the upload.
+        await asyncio.get_running_loop().run_in_executor(None, _login_and_push)
+        print(f"✅ Successfully uploaded {len(metadata_list)} captions for {course} to {HF_DATASET_ID}")
+        return True
+    except Exception as e:
+        # Best effort: callers only look at the boolean result.
+        print(f"❌ Error uploading to Hugging Face: {e}")
+        return False
 async def process_course(course: str, servers: List[CaptionServer]):
     """Process all images in a course using available servers with proper retry logic"""
     if not pending_images:
         print(f"All images already processed or failed for course {course}")
         print(f"- Processed: {len(processed_images[course])}, Failed: {len(failed_images[course])}")
+        # If course is completed, upload to Hugging Face
+        if len(processed_images[course]) + len(failed_images[course]) >= len(images):
+            if course_captions[course]:
+                print(f"📤 Course {course} completed, uploading to Hugging Face...")
+                await upload_to_huggingface(course, course_captions[course])
         return
     print(f"Images to process: {len(pending_images)} (already processed: {len(processed_images[course])}, failed: {len(failed_images[course])})")
             print(f"\n✓ Course {course} completed with {failed_count} failed images")
         else:
             print(f"\n✓ Course {course} fully completed")
+        # Upload to Hugging Face when course is completed
+        if course_captions[course]:
+            print(f"📤 Uploading {len(course_captions[course])} captions to Hugging Face...")
+            success = await upload_to_huggingface(course, course_captions[course])
+            if success:
+                print(f"✅ Successfully uploaded {course} to Hugging Face")
+            else:
+                print(f"❌ Failed to upload {course} to Hugging Face")
     else:
         print(f"\n→ Course {course} partially completed: {done}/{total} processed, {failed_count} failed")
     print("Caption Coordinator API started")
     print(f"Source server: {SOURCE_SERVER}")
     print(f"Caption servers: {len(CAPTION_SERVERS)}")
+    print(f"Hugging Face dataset: {HF_DATASET_ID}")
+    print(f"HF Token: {'✅ Set' if HF_TOKEN else '❌ Missing'}")
     # Start processing automatically (like original main())
     if auto_start_processing: