Fred808 committed on
Commit
b5a54de
·
verified ·
1 Parent(s): eb57683

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -34
app.py CHANGED
@@ -7,6 +7,8 @@ from typing import Dict, List, Set, Optional
7
  from urllib.parse import quote, urljoin
8
  from datetime import datetime
9
  from pathlib import Path
 
 
10
 
11
  from fastapi import FastAPI, BackgroundTasks, HTTPException, status
12
  from fastapi.responses import JSONResponse
@@ -17,6 +19,13 @@ import uvicorn
17
  CAPTIONS_DIR = Path("captions_data")
18
  CAPTIONS_DIR.mkdir(exist_ok=True)
19
 
 
 
 
 
 
 
 
20
  def get_caption_file_path(course: str) -> Path:
21
  """Get the path to the JSON file for storing course captions"""
22
  safe_name = quote(course, safe='')
@@ -87,7 +96,6 @@ CAPTION_SERVERS = [
87
  "https://fredalone-fredalone-8h285h.hf.space/analyze"
88
  ]
89
  MODEL_TYPE = "Florence-2-large" # Explicitly request large model
90
- DATA_COLLECTION_SERVER = "https://fred808-flow.hf.space"
91
 
92
  # FastAPI Models
93
  class CourseInfo(BaseModel):
@@ -401,35 +409,49 @@ async def process_image(server: CaptionServer, course: str, image: Dict) -> Dict
401
  finally:
402
  server.busy = False
403
 
404
- async def submit_to_dataset(course: str, metadata_list: List[Dict]):
405
- """Submit course results to dataset collection server"""
406
- # Group by parent folder
407
- parent_folder = os.path.dirname(course) if '/' in course else course.split('_')[0]
408
-
409
- # Prepare payload
410
- payload = {
411
- "text": f"Completed captions for course {course}. done",
412
- "course": parent_folder,
413
- "metadata": {
414
- "course_name": course,
415
- "image_count": len(metadata_list),
416
- "completed_at": datetime.now().isoformat()
417
- },
418
- "captions": metadata_list
419
- }
420
-
421
- async with aiohttp.ClientSession() as session:
422
- try:
423
- async with session.post(
424
- f"{DATA_COLLECTION_SERVER}/submit",
425
- json=payload
426
- ) as resp:
427
- result = await resp.json()
428
- print(f"✓ Dataset submission result for {course}: {result}")
429
- return result
430
- except Exception as e:
431
- print(f"✗ Error submitting to dataset: {e}")
432
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
  async def process_course(course: str, servers: List[CaptionServer]):
435
  """Process all images in a course using available servers with proper retry logic"""
@@ -462,6 +484,12 @@ async def process_course(course: str, servers: List[CaptionServer]):
462
  if not pending_images:
463
  print(f"All images already processed or failed for course {course}")
464
  print(f"- Processed: {len(processed_images[course])}, Failed: {len(failed_images[course])}")
 
 
 
 
 
 
465
  return
466
 
467
  print(f"Images to process: {len(pending_images)} (already processed: {len(processed_images[course])}, failed: {len(failed_images[course])})")
@@ -545,9 +573,15 @@ async def process_course(course: str, servers: List[CaptionServer]):
545
  print(f"\n✓ Course {course} completed with {failed_count} failed images")
546
  else:
547
  print(f"\n✓ Course {course} fully completed")
548
- # Submit to dataset only when fully completed
549
- print(f"Submitting {len(course_captions[course])} captions to dataset...")
550
- await submit_to_dataset(course, course_captions[course])
 
 
 
 
 
 
551
  else:
552
  print(f"\n→ Course {course} partially completed: {done}/{total} processed, {failed_count} failed")
553
 
@@ -662,7 +696,8 @@ async def startup_event():
662
  print("Caption Coordinator API started")
663
  print(f"Source server: {SOURCE_SERVER}")
664
  print(f"Caption servers: {len(CAPTION_SERVERS)}")
665
- print(f"Dataset server: {DATA_COLLECTION_SERVER}")
 
666
 
667
  # Start processing automatically (like original main())
668
  if auto_start_processing:
 
7
  from urllib.parse import quote, urljoin
8
  from datetime import datetime
9
  from pathlib import Path
10
+ from datasets import Dataset, DatasetDict
11
+ import huggingface_hub
12
 
13
  from fastapi import FastAPI, BackgroundTasks, HTTPException, status
14
  from fastapi.responses import JSONResponse
 
19
  CAPTIONS_DIR = Path("captions_data")
20
  CAPTIONS_DIR.mkdir(exist_ok=True)
21
 
22
+ # Hugging Face configuration
23
+ HF_TOKEN = os.getenv("HF_TOKEN")
24
+ HF_DATASET_ID = os.getenv("HF_DATASET_ID", "fred808/helium")
25
+
26
+ if not HF_TOKEN:
27
+ raise ValueError("HF_TOKEN environment variable is required")
28
+
29
  def get_caption_file_path(course: str) -> Path:
30
  """Get the path to the JSON file for storing course captions"""
31
  safe_name = quote(course, safe='')
 
96
  "https://fredalone-fredalone-8h285h.hf.space/analyze"
97
  ]
98
  MODEL_TYPE = "Florence-2-large" # Explicitly request large model
 
99
 
100
  # FastAPI Models
101
  class CourseInfo(BaseModel):
 
409
  finally:
410
  server.busy = False
411
 
412
+ async def upload_to_huggingface(course: str, metadata_list: List[Dict]):
413
+ """Upload course captions to Hugging Face dataset"""
414
+ try:
415
+ print(f"📤 Uploading {len(metadata_list)} captions for {course} to Hugging Face...")
416
+
417
+ # Prepare data for Hugging Face dataset
418
+ dataset_data = {
419
+ "course": [],
420
+ "image_filename": [],
421
+ "caption": [],
422
+ "processing_server": [],
423
+ "processing_time": [],
424
+ "timestamp": []
425
+ }
426
+
427
+ for metadata in metadata_list:
428
+ dataset_data["course"].append(course)
429
+ dataset_data["image_filename"].append(metadata["image"])
430
+ dataset_data["caption"].append(metadata["caption"])
431
+ dataset_data["processing_server"].append(metadata["server"])
432
+ dataset_data["processing_time"].append(metadata["processing_time"])
433
+ dataset_data["timestamp"].append(metadata["timestamp"])
434
+
435
+ # Create dataset
436
+ dataset = Dataset.from_dict(dataset_data)
437
+
438
+ # Login to Hugging Face
439
+ huggingface_hub.login(token=HF_TOKEN)
440
+
441
+ # Push to hub
442
+ dataset.push_to_hub(
443
+ HF_DATASET_ID,
444
+ config_name=course.replace("/", "_").replace(" ", "_"),
445
+ split="train", # You can change this to "train", "validation", "test" as needed
446
+ commit_message=f"Add captions for course {course} - {len(metadata_list)} images"
447
+ )
448
+
449
+ print(f"✅ Successfully uploaded {len(metadata_list)} captions for {course} to {HF_DATASET_ID}")
450
+ return True
451
+
452
+ except Exception as e:
453
+ print(f"❌ Error uploading to Hugging Face: {e}")
454
+ return False
455
 
456
  async def process_course(course: str, servers: List[CaptionServer]):
457
  """Process all images in a course using available servers with proper retry logic"""
 
484
  if not pending_images:
485
  print(f"All images already processed or failed for course {course}")
486
  print(f"- Processed: {len(processed_images[course])}, Failed: {len(failed_images[course])}")
487
+
488
+ # If course is completed, upload to Hugging Face
489
+ if len(processed_images[course]) + len(failed_images[course]) >= len(images):
490
+ if course_captions[course]:
491
+ print(f"📤 Course {course} completed, uploading to Hugging Face...")
492
+ await upload_to_huggingface(course, course_captions[course])
493
  return
494
 
495
  print(f"Images to process: {len(pending_images)} (already processed: {len(processed_images[course])}, failed: {len(failed_images[course])})")
 
573
  print(f"\n✓ Course {course} completed with {failed_count} failed images")
574
  else:
575
  print(f"\n✓ Course {course} fully completed")
576
+
577
+ # Upload to Hugging Face when course is completed
578
+ if course_captions[course]:
579
+ print(f"📤 Uploading {len(course_captions[course])} captions to Hugging Face...")
580
+ success = await upload_to_huggingface(course, course_captions[course])
581
+ if success:
582
+ print(f"✅ Successfully uploaded {course} to Hugging Face")
583
+ else:
584
+ print(f"❌ Failed to upload {course} to Hugging Face")
585
  else:
586
  print(f"\n→ Course {course} partially completed: {done}/{total} processed, {failed_count} failed")
587
 
 
696
  print("Caption Coordinator API started")
697
  print(f"Source server: {SOURCE_SERVER}")
698
  print(f"Caption servers: {len(CAPTION_SERVERS)}")
699
+ print(f"Hugging Face dataset: {HF_DATASET_ID}")
700
+ print(f"HF Token: {'✅ Set' if HF_TOKEN else '❌ Missing'}")
701
 
702
  # Start processing automatically (like original main())
703
  if auto_start_processing: