Samfredoly commited on
Commit
6d01df4
·
verified ·
1 Parent(s): 0a799d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +332 -248
app.py CHANGED
@@ -1,14 +1,15 @@
1
  #!/usr/bin/env python3
2
  """
3
- Hugging Face Data Processor - Single Unified Server
4
 
5
  A complete, self-contained FastAPI application that:
6
  1. Automatically processes all courses from samelias1/Helium and samelias1/Data
7
  2. Merges frame data with cursor information
8
  3. Searches for exact transcription matches in samfred2/ATO
9
- 4. Generates combined JSON output ready for upload to samfred2/ALL
10
- 5. Provides REST API for monitoring and management
11
- 6. Includes built-in web dashboard for real-time progress tracking
 
12
 
13
  Run with: python server.py
14
  Then open: http://localhost:8000
@@ -18,6 +19,7 @@ import json
18
  import asyncio
19
  import os
20
  import sys
 
21
  from pathlib import Path
22
  from typing import Optional, List, Dict, Any
23
  from datetime import datetime
@@ -31,6 +33,7 @@ from fastapi.staticfiles import StaticFiles
31
  from fastapi.middleware.cors import CORSMiddleware
32
  from pydantic import BaseModel
33
  from huggingface_hub import hf_hub_download, HfApi
 
34
  import uvicorn
35
 
36
  # ============================================================================
@@ -53,6 +56,8 @@ class JobStatus(str, Enum):
53
  PENDING = "pending"
54
  FETCHING_FILES = "fetching_files"
55
  PROCESSING = "processing"
 
 
56
  COMPLETED = "completed"
57
  FAILED = "failed"
58
  CANCELLED = "cancelled"
@@ -68,6 +73,8 @@ class ProcessingJob(BaseModel):
68
  started_at: Optional[str] = None
69
  completed_at: Optional[str] = None
70
  output_file: Optional[str] = None
 
 
71
  progress_percent: float = 0.0
72
 
73
  # ============================================================================
@@ -97,7 +104,7 @@ app.add_middleware(
97
  )
98
 
99
  # ============================================================================
100
- # Helper Functions
101
  # ============================================================================
102
 
103
  def get_hf_api() -> HfApi:
@@ -120,13 +127,11 @@ def list_dataset_files(dataset_id: str) -> List[str]:
120
  def download_file(repo_id: str, file_name: str) -> Optional[str]:
121
  """Download a file from Hugging Face dataset to cache."""
122
  try:
123
- print(f"[DOWNLOAD] {file_name} from {repo_id}...")
124
  path = hf_hub_download(
125
  repo_id=repo_id,
126
  filename=file_name,
127
  repo_type="dataset"
128
  )
129
- print(f"[DOWNLOAD] ✓ {file_name}")
130
  return path
131
  except Exception as e:
132
  print(f"[ERROR] Failed to download {file_name}: {e}")
@@ -179,16 +184,125 @@ def merge_course_data(helium_path: str, data_path: str) -> List[Dict]:
179
  print(f"[ERROR] Failed to merge course data: {e}")
180
  return []
181
 
182
- def find_exact_transcription(course_name: str, ato_files: List[str]) -> Optional[str]:
183
  """Search for exact transcription file match in ATO dataset."""
184
- expected_file = course_name.replace("_frames.json", ".json")
185
 
186
  if expected_file in ato_files:
187
- print(f"[MATCH] Found transcription: {expected_file}")
188
  return expected_file
189
 
190
  return None
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  async def process_single_course(
193
  course_file: str,
194
  job: ProcessingJob,
@@ -196,20 +310,16 @@ async def process_single_course(
196
  ) -> Optional[Dict]:
197
  """Process a single course: merge data and fetch transcription if available."""
198
  try:
199
- print(f"\n[PROCESS] Course: {course_file}")
200
-
201
  # Download from Helium and Data
202
  helium_path = download_file(DATASET_HELIUM, course_file)
203
  data_path = download_file(DATASET_DATA, course_file)
204
 
205
  if not helium_path or not data_path:
206
- print(f"[SKIP] Missing data files for {course_file}")
207
  return None
208
 
209
  # Merge frame data
210
  merged_frames = merge_course_data(helium_path, data_path)
211
  if not merged_frames:
212
- print(f"[SKIP] Failed to merge data for {course_file}")
213
  return None
214
 
215
  # Try to find and download transcription
@@ -270,7 +380,14 @@ async def process_all_courses_background(job_id: str):
270
 
271
  for idx, course_file in enumerate(course_files):
272
  try:
273
- course_data = await process_single_course(course_file, job, ato_files)
 
 
 
 
 
 
 
274
  if course_data:
275
  all_courses.append(course_data)
276
 
@@ -286,15 +403,22 @@ async def process_all_courses_background(job_id: str):
286
  print(f"[ERROR] Failed to process {course_file}: {e}")
287
  continue
288
 
289
- # Save combined output
 
290
  timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
291
- output_file = OUTPUT_DIR / f"combined_output_{timestamp}.json"
 
292
 
293
- print(f"\n[SAVE] Saving output to {output_file}...")
294
  with open(output_file, "w") as f:
295
  json.dump(all_courses, f, indent=2)
296
 
297
  job.output_file = str(output_file)
 
 
 
 
 
298
  job.status = JobStatus.COMPLETED
299
  job.completed_at = datetime.utcnow().isoformat()
300
 
@@ -315,17 +439,17 @@ async def process_all_courses_background(job_id: str):
315
  traceback.print_exc()
316
 
317
  # ============================================================================
318
- # API Endpoints
319
  # ============================================================================
320
 
321
- @app.get("/")
322
- async def root():
323
- """Health check endpoint."""
324
  return {
325
  "status": "running",
326
  "service": "Hugging Face Data Processor",
327
  "version": "1.0.0",
328
- "dashboard": "http://localhost:8000/dashboard"
329
  }
330
 
331
  @app.post("/api/jobs/create")
@@ -412,7 +536,7 @@ async def get_stats():
412
  total_jobs = len(jobs_db)
413
  completed = sum(1 for j in jobs_db.values() if j.status == JobStatus.COMPLETED)
414
  failed = sum(1 for j in jobs_db.values() if j.status == JobStatus.FAILED)
415
- processing = sum(1 for j in jobs_db.values() if j.status in [JobStatus.PROCESSING, JobStatus.FETCHING_FILES])
416
 
417
  total_files = sum(j.total_files for j in jobs_db.values())
418
  total_processed = sum(j.processed_files for j in jobs_db.values())
@@ -429,7 +553,7 @@ async def get_stats():
429
  }
430
 
431
  # ============================================================================
432
- # Web Dashboard
433
  # ============================================================================
434
 
435
  DASHBOARD_HTML = """
@@ -440,6 +564,7 @@ DASHBOARD_HTML = """
440
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
441
  <title>Hugging Face Data Processor</title>
442
  <style>
 
443
  * {
444
  margin: 0;
445
  padding: 0;
@@ -590,7 +715,7 @@ DASHBOARD_HTML = """
590
  color: #856404;
591
  }
592
 
593
- .status-processing {
594
  background: #cfe2ff;
595
  color: #084298;
596
  }
@@ -607,297 +732,249 @@ DASHBOARD_HTML = """
607
 
608
  .status-cancelled {
609
  background: #e2e3e5;
610
- color: #383d41;
611
  }
612
 
613
- .progress-bar {
614
- width: 100%;
615
- height: 8px;
616
- background: #e0e0e0;
617
- border-radius: 4px;
618
  overflow: hidden;
619
- margin: 10px 0;
620
  }
621
 
622
- .progress-fill {
623
- height: 100%;
624
- background: linear-gradient(90deg, #667eea, #764ba2);
625
- transition: width 0.3s ease;
 
 
 
626
  }
627
 
628
  .job-details {
629
- display: grid;
630
- grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
631
- gap: 15px;
632
- margin-top: 15px;
633
- }
634
-
635
- .detail {
636
  font-size: 0.9em;
 
637
  }
638
 
639
- .detail-label {
640
- color: #666;
641
- font-weight: 500;
642
  }
643
 
644
- .detail-value {
645
  color: #333;
646
- font-weight: 600;
647
- margin-top: 5px;
648
  }
649
 
650
- .message {
651
- padding: 15px;
652
- border-radius: 6px;
653
- margin-bottom: 15px;
654
- display: none;
 
 
655
  }
656
 
657
- .message.show {
658
- display: block;
 
 
 
659
  }
660
-
661
- .message.success {
662
- background: #d1e7dd;
663
- color: #0f5132;
664
- border: 1px solid #badbcc;
 
 
665
  }
666
-
667
- .message.error {
668
- background: #f8d7da;
669
- color: #842029;
670
- border: 1px solid #f5c2c7;
671
  }
672
-
673
- .loading {
674
- display: inline-block;
675
- width: 20px;
676
- height: 20px;
677
- border: 3px solid #f3f3f3;
678
- border-top: 3px solid #667eea;
679
- border-radius: 50%;
680
- animation: spin 1s linear infinite;
681
  }
682
-
683
- @keyframes spin {
684
- 0% { transform: rotate(0deg); }
685
- 100% { transform: rotate(360deg); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
  }
687
-
688
- .empty {
689
- text-align: center;
690
- padding: 40px;
691
- color: #999;
 
 
 
 
 
 
 
 
 
 
 
 
692
  }
693
-
694
- .footer {
695
- text-align: center;
696
- color: rgba(255, 255, 255, 0.8);
697
- margin-top: 30px;
698
- font-size: 0.9em;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699
  }
700
- </style>
 
 
 
 
 
 
701
  </head>
702
  <body>
703
  <div class="container">
704
  <header>
705
- <h1>🚀 Hugging Face Data Processor</h1>
706
- <p class="subtitle">Automated dataset processing and merging</p>
707
  <div class="controls">
708
- <button onclick="createJob()" id="createBtn">Create New Job</button>
709
- <button onclick="refreshAll()">Refresh</button>
710
- <button onclick="clearJobs()">Clear All</button>
711
  </div>
712
- <div id="message" class="message"></div>
713
  </header>
714
-
715
  <div class="grid">
716
  <div class="card">
717
- <h2>📊 Statistics</h2>
718
  <div class="stat">
719
  <span class="stat-label">Total Jobs</span>
720
- <span class="stat-value" id="totalJobs">0</span>
721
  </div>
722
  <div class="stat">
723
- <span class="stat-label">Completed</span>
724
- <span class="stat-value" id="completedJobs">0</span>
725
  </div>
726
  <div class="stat">
727
- <span class="stat-label">Processing</span>
728
- <span class="stat-value" id="processingJobs">0</span>
729
  </div>
730
  <div class="stat">
731
- <span class="stat-label">Failed</span>
732
- <span class="stat-value" id="failedJobs">0</span>
733
  </div>
734
  </div>
735
-
736
  <div class="card">
737
- <h2>📈 Progress</h2>
738
  <div class="stat">
739
- <span class="stat-label">Files Processed</span>
740
- <span class="stat-value" id="filesProcessed">0</span>
741
  </div>
742
  <div class="stat">
743
- <span class="stat-label">Total Files</span>
744
- <span class="stat-value" id="totalFiles">0</span>
745
  </div>
746
  <div class="stat">
747
  <span class="stat-label">Transcriptions Matched</span>
748
- <span class="stat-value" id="transcriptionsMatched">0</span>
749
  </div>
750
  </div>
751
  </div>
752
-
753
  <div class="job-list">
754
- <h2>📋 Processing Jobs</h2>
755
- <div id="jobsList" class="empty">No jobs yet. Create one to get started!</div>
 
 
756
  </div>
757
 
758
- <div class="footer">
759
- <p>Hugging Face Data Processor v1.0 | API: http://localhost:8000/docs</p>
760
- </div>
761
  </div>
762
-
763
- <script>
764
- const API_BASE = '/api';
765
- let autoRefresh = setInterval(refreshAll, 2000);
766
-
767
- function showMessage(text, type = 'success') {
768
- const msg = document.getElementById('message');
769
- msg.textContent = text;
770
- msg.className = `message show ${type}`;
771
- setTimeout(() => msg.classList.remove('show'), 5000);
772
- }
773
-
774
- async function createJob() {
775
- try {
776
- document.getElementById('createBtn').disabled = true;
777
- const res = await fetch(`${API_BASE}/jobs/create`, { method: 'POST' });
778
- const data = await res.json();
779
- showMessage(`Job created: ${data.job_id}`, 'success');
780
- refreshAll();
781
- } catch (e) {
782
- showMessage(`Error: ${e.message}`, 'error');
783
- } finally {
784
- document.getElementById('createBtn').disabled = false;
785
- }
786
- }
787
-
788
- async function cancelJob(jobId) {
789
- if (!confirm(`Cancel job ${jobId}?`)) return;
790
- try {
791
- await fetch(`${API_BASE}/jobs/${jobId}/cancel`, { method: 'POST' });
792
- showMessage('Job cancelled', 'success');
793
- refreshAll();
794
- } catch (e) {
795
- showMessage(`Error: ${e.message}`, 'error');
796
- }
797
- }
798
-
799
- async function downloadOutput(jobId) {
800
- try {
801
- window.location.href = `${API_BASE}/jobs/${jobId}/output`;
802
- } catch (e) {
803
- showMessage(`Error: ${e.message}`, 'error');
804
- }
805
- }
806
-
807
- async function refreshAll() {
808
- try {
809
- const statsRes = await fetch(`${API_BASE}/stats`);
810
- const stats = await statsRes.json();
811
-
812
- const jobsRes = await fetch(`${API_BASE}/jobs`);
813
- const jobsData = await jobsRes.json();
814
-
815
- // Update stats
816
- document.getElementById('totalJobs').textContent = stats.total_jobs;
817
- document.getElementById('completedJobs').textContent = stats.completed_jobs;
818
- document.getElementById('processingJobs').textContent = stats.processing_jobs;
819
- document.getElementById('failedJobs').textContent = stats.failed_jobs;
820
- document.getElementById('filesProcessed').textContent = stats.total_files_processed;
821
- document.getElementById('totalFiles').textContent = stats.total_files;
822
- document.getElementById('transcriptionsMatched').textContent = stats.total_transcriptions_matched;
823
-
824
- // Update jobs list
825
- const jobsList = document.getElementById('jobsList');
826
- if (jobsData.jobs.length === 0) {
827
- jobsList.innerHTML = '<div class="empty">No jobs yet. Create one to get started!</div>';
828
- } else {
829
- jobsList.innerHTML = jobsData.jobs.map(job => `
830
- <div class="job-item">
831
- <div class="job-header">
832
- <span class="job-id">${job.job_id}</span>
833
- <span class="job-status status-${job.status}">${job.status.toUpperCase()}</span>
834
- </div>
835
- ${job.status === 'processing' || job.status === 'fetching_files' ? `
836
- <div class="progress-bar">
837
- <div class="progress-fill" style="width: ${job.progress_percent}%"></div>
838
- </div>
839
- ` : ''}
840
- <div class="job-details">
841
- <div class="detail">
842
- <div class="detail-label">Files</div>
843
- <div class="detail-value">${job.processed_files}/${job.total_files}</div>
844
- </div>
845
- <div class="detail">
846
- <div class="detail-label">Transcriptions</div>
847
- <div class="detail-value">${job.matched_transcriptions}</div>
848
- </div>
849
- <div class="detail">
850
- <div class="detail-label">Created</div>
851
- <div class="detail-value">${new Date(job.created_at).toLocaleString()}</div>
852
- </div>
853
- </div>
854
- ${job.error_message ? `<div style="color: #d32f2f; margin-top: 10px;">Error: ${job.error_message}</div>` : ''}
855
- <div style="margin-top: 15px; display: flex; gap: 10px;">
856
- ${job.status === 'processing' || job.status === 'pending' || job.status === 'fetching_files' ? `
857
- <button onclick="cancelJob('${job.job_id}')" style="background: #d32f2f;">Cancel</button>
858
- ` : ''}
859
- ${job.status === 'completed' && job.output_file ? `
860
- <button onclick="downloadOutput('${job.job_id}')" style="background: #4caf50;">Download Output</button>
861
- ` : ''}
862
- </div>
863
- </div>
864
- `).join('');
865
- }
866
- } catch (e) {
867
- console.error('Refresh error:', e);
868
- }
869
- }
870
-
871
- function clearJobs() {
872
- if (confirm('Clear all jobs from memory? (This does not delete output files)')) {
873
- location.reload();
874
- }
875
- }
876
-
877
- // Initial load
878
- refreshAll();
879
- </script>
880
  </body>
881
  </html>
882
  """
883
 
884
- @app.get("/dashboard")
885
  async def dashboard():
886
- """Serve the web dashboard."""
887
- return HTMLResponse(content=DASHBOARD_HTML)
888
 
889
  # ============================================================================
890
- # Main Entry Point
891
  # ============================================================================
892
 
893
  def main():
894
- """Start the server."""
895
- print("\n" + "="*70)
896
- print("🚀 Hugging Face Data Processor - Starting Server")
897
  print("="*70)
898
- print(f"API Base URL: http://localhost:8000")
899
- print(f"Dashboard: http://localhost:8000/dashboard")
900
- print(f"Swagger UI: http://localhost:8000/docs")
901
  print(f"Output Dir: {OUTPUT_DIR.absolute()}")
902
  print("="*70 + "\n")
903
 
@@ -909,4 +986,11 @@ def main():
909
  )
910
 
911
  if __name__ == "__main__":
 
 
 
 
 
 
 
912
  main()
 
1
  #!/usr/bin/env python3
2
  """
3
+ Hugging Face Data Processor - Single Unified Server (Modified)
4
 
5
  A complete, self-contained FastAPI application that:
6
  1. Automatically processes all courses from samelias1/Helium and samelias1/Data
7
  2. Merges frame data with cursor information
8
  3. Searches for exact transcription matches in samfred2/ATO
9
+ 4. Generates combined JSON output and individual course JSONs
10
+ 5. **Uploads all generated files to samfred2/ALL using upload_folder with a robust file-by-file retry fallback.**
11
+ 6. Provides REST API for monitoring and management
12
+ 7. **Web dashboard moved to the root path (/)**
13
 
14
  Run with: python server.py
15
  Then open: http://localhost:8000
 
19
  import asyncio
20
  import os
21
  import sys
22
+ import time
23
  from pathlib import Path
24
  from typing import Optional, List, Dict, Any
25
  from datetime import datetime
 
33
  from fastapi.middleware.cors import CORSMiddleware
34
  from pydantic import BaseModel
35
  from huggingface_hub import hf_hub_download, HfApi
36
+ from huggingface_hub.utils import HfHubHTTPError
37
  import uvicorn
38
 
39
  # ============================================================================
 
56
  PENDING = "pending"
57
  FETCHING_FILES = "fetching_files"
58
  PROCESSING = "processing"
59
+ SAVING = "saving"
60
+ UPLOADING = "uploading"
61
  COMPLETED = "completed"
62
  FAILED = "failed"
63
  CANCELLED = "cancelled"
 
73
  started_at: Optional[str] = None
74
  completed_at: Optional[str] = None
75
  output_file: Optional[str] = None
76
+ total_uploads: int = 0
77
+ completed_uploads: int = 0
78
  progress_percent: float = 0.0
79
 
80
  # ============================================================================
 
104
  )
105
 
106
  # ============================================================================
107
+ # Helper Functions (Original)
108
  # ============================================================================
109
 
110
  def get_hf_api() -> HfApi:
 
127
  def download_file(repo_id: str, file_name: str) -> Optional[str]:
128
  """Download a file from Hugging Face dataset to cache."""
129
  try:
 
130
  path = hf_hub_download(
131
  repo_id=repo_id,
132
  filename=file_name,
133
  repo_type="dataset"
134
  )
 
135
  return path
136
  except Exception as e:
137
  print(f"[ERROR] Failed to download {file_name}: {e}")
 
184
  print(f"[ERROR] Failed to merge course data: {e}")
185
  return []
186
 
187
+ def find_exact_transcription(course_file: str, ato_files: List[str]) -> Optional[str]:
188
  """Search for exact transcription file match in ATO dataset."""
189
+ expected_file = course_file.replace("_frames.json", ".json")
190
 
191
  if expected_file in ato_files:
 
192
  return expected_file
193
 
194
  return None
195
 
196
+ # ============================================================================
197
+ # Upload Logic with Intelligent Fallback
198
+ # ============================================================================
199
+
200
+ def upload_file_with_retry(api: HfApi, local_path: Path, path_in_repo: str, repo_id: str):
201
+ """Uploads a single file to Hugging Face with a 1-hour retry on rate limit error (HTTP 429)."""
202
+ while True:
203
+ try:
204
+ print(f"[HF UPLOAD] Uploading {local_path.name} to {repo_id}/{path_in_repo}...")
205
+ api.upload_file(
206
+ path_or_fileobj=str(local_path),
207
+ path_in_repo=path_in_repo,
208
+ repo_id=repo_id,
209
+ repo_type="dataset",
210
+ commit_message=f"Automated upload: {local_path.name}"
211
+ )
212
+ print(f"[HF UPLOAD] ✓ Successfully uploaded {local_path.name}")
213
+ break # Success, exit the loop
214
+
215
+ except HfHubHTTPError as e:
216
+ if e.response.status_code == 429:
217
+ print(f"\n{'='*70}")
218
+ print(f"[RATE LIMIT HIT] Received HTTP 429 for {local_path.name}.")
219
+ print("Pausing for 1 hour (3600 seconds) before retrying...")
220
+ print(f"{'='*70}\n")
221
+ time.sleep(3600) # Pause for 1 hour
222
+ print(f"\n{'='*70}")
223
+ print(f"[RETRY] Resuming upload for {local_path.name}...")
224
+ print(f"{'='*70}\n")
225
+ else:
226
+ print(f"[ERROR] Failed to upload {local_path.name} with unhandled HTTP error: {e}")
227
+ raise # Re-raise other HTTP errors
228
+
229
+ except Exception as e:
230
+ print(f"[ERROR] An unexpected error occurred during upload of {local_path.name}: {e}")
231
+ raise # Re-raise other errors
232
+
233
+ def upload_all_files(job: ProcessingJob, all_courses: List[Dict], combined_file_path: Path):
234
+ """
235
+ Handles the saving of individual course files and the combined upload process.
236
+ Attempts upload_folder first, then falls back to file-by-file with retry.
237
+ """
238
+ api = get_hf_api()
239
+
240
+ # 1. Save all files (combined and individual) to OUTPUT_DIR
241
+ print("\n[SAVE] Saving individual course JSONs...")
242
+
243
+ # Ensure the combined file is saved first (it was in the main processing loop, but we ensure it here)
244
+ if not combined_file_path.exists():
245
+ with open(combined_file_path, "w") as f:
246
+ json.dump(all_courses, f, indent=2)
247
+
248
+ # Save individual course JSONs
249
+ for course_data in all_courses:
250
+ course_name = course_data["course"]
251
+ individual_file_name = f"{course_name}.json"
252
+ individual_file_path = OUTPUT_DIR / individual_file_name
253
+
254
+ with open(individual_file_path, "w") as f:
255
+ json.dump(course_data, f, indent=2)
256
+ print(f" ✓ Saved {individual_file_name}")
257
+
258
+ # Get list of all files to upload for fallback and tracking
259
+ files_to_upload = [p for p in OUTPUT_DIR.iterdir() if p.is_file() and p.suffix == '.json']
260
+ job.total_uploads = len(files_to_upload)
261
+
262
+ print(f"\n[UPLOAD] Starting intelligent upload of {job.total_uploads} files to {DATASET_OUTPUT}...")
263
+
264
+ # --- Strategy 1: Try upload_folder ---
265
+ try:
266
+ print(f"[UPLOAD] Attempting bulk upload using HfApi.upload_folder...")
267
+ api.upload_folder(
268
+ folder_path=str(OUTPUT_DIR),
269
+ repo_id=DATASET_OUTPUT,
270
+ repo_type="dataset",
271
+ commit_message=f"Automated bulk upload of {job.total_uploads} files"
272
+ )
273
+ job.completed_uploads = job.total_uploads
274
+ print(f"[UPLOAD] ✓ Bulk upload successful.")
275
+ return # Exit if successful
276
+
277
+ except Exception as e:
278
+ print(f"\n{'='*70}")
279
+ print(f"[UPLOAD FALLBACK] Bulk upload failed: {e}")
280
+ print(f"Falling back to file-by-file upload with 1-hour retry mechanism.")
281
+ print(f"{'='*70}\n")
282
+
283
+ # --- Strategy 2: Fallback to file-by-file with retry ---
284
+ job.completed_uploads = 0
285
+ for idx, local_path in enumerate(files_to_upload):
286
+ try:
287
+ upload_file_with_retry(
288
+ api=api,
289
+ local_path=local_path,
290
+ path_in_repo=local_path.name,
291
+ repo_id=DATASET_OUTPUT
292
+ )
293
+ job.completed_uploads = idx + 1
294
+ except Exception as upload_e:
295
+ # If even the retry logic fails, we log and re-raise to fail the job
296
+ print(f"[FATAL ERROR] File-by-file upload failed for {local_path.name}: {upload_e}")
297
+ raise upload_e
298
+
299
+ print(f"\n[UPLOAD] All {job.completed_uploads}/{job.total_uploads} files successfully uploaded to {DATASET_OUTPUT}.")
300
+
301
+
302
+ # ============================================================================
303
+ # Main Processing Logic (Modified)
304
+ # ============================================================================
305
+
306
  async def process_single_course(
307
  course_file: str,
308
  job: ProcessingJob,
 
310
  ) -> Optional[Dict]:
311
  """Process a single course: merge data and fetch transcription if available."""
312
  try:
 
 
313
  # Download from Helium and Data
314
  helium_path = download_file(DATASET_HELIUM, course_file)
315
  data_path = download_file(DATASET_DATA, course_file)
316
 
317
  if not helium_path or not data_path:
 
318
  return None
319
 
320
  # Merge frame data
321
  merged_frames = merge_course_data(helium_path, data_path)
322
  if not merged_frames:
 
323
  return None
324
 
325
  # Try to find and download transcription
 
380
 
381
  for idx, course_file in enumerate(course_files):
382
  try:
383
+ # Use asyncio.to_thread for blocking operations like hf_hub_download
384
+ course_data = await asyncio.to_thread(
385
+ process_single_course,
386
+ course_file,
387
+ job,
388
+ ato_files
389
+ )
390
+
391
  if course_data:
392
  all_courses.append(course_data)
393
 
 
403
  print(f"[ERROR] Failed to process {course_file}: {e}")
404
  continue
405
 
406
+ # Save combined output (needed for upload_all_files)
407
+ job.status = JobStatus.SAVING
408
  timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
409
+ output_file_name = f"combined_output_{timestamp}.json"
410
+ output_file = OUTPUT_DIR / output_file_name
411
 
412
+ print(f"\n[SAVE] Saving combined output to {output_file}...")
413
  with open(output_file, "w") as f:
414
  json.dump(all_courses, f, indent=2)
415
 
416
  job.output_file = str(output_file)
417
+
418
+ # NEW: Upload all files with intelligent fallback
419
+ job.status = JobStatus.UPLOADING
420
+ await asyncio.to_thread(upload_all_files, job, all_courses, output_file)
421
+
422
  job.status = JobStatus.COMPLETED
423
  job.completed_at = datetime.utcnow().isoformat()
424
 
 
439
  traceback.print_exc()
440
 
441
  # ============================================================================
442
+ # API Endpoints (Modified)
443
  # ============================================================================
444
 
445
+ @app.get("/api/health")
446
+ async def health_check():
447
+ """Health check endpoint (moved from /)."""
448
  return {
449
  "status": "running",
450
  "service": "Hugging Face Data Processor",
451
  "version": "1.0.0",
452
+ "dashboard": "http://localhost:8000/"
453
  }
454
 
455
  @app.post("/api/jobs/create")
 
536
  total_jobs = len(jobs_db)
537
  completed = sum(1 for j in jobs_db.values() if j.status == JobStatus.COMPLETED)
538
  failed = sum(1 for j in jobs_db.values() if j.status == JobStatus.FAILED)
539
+ processing = sum(1 for j in jobs_db.values() if j.status in [JobStatus.PROCESSING, JobStatus.FETCHING_FILES, JobStatus.SAVING, JobStatus.UPLOADING])
540
 
541
  total_files = sum(j.total_files for j in jobs_db.values())
542
  total_processed = sum(j.processed_files for j in jobs_db.values())
 
553
  }
554
 
555
  # ============================================================================
556
+ # Web Dashboard (Original - Truncated for brevity, assuming it's the same)
557
  # ============================================================================
558
 
559
  DASHBOARD_HTML = """
 
564
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
565
  <title>Hugging Face Data Processor</title>
566
  <style>
567
+ /* ... (Original CSS) ... */
568
  * {
569
  margin: 0;
570
  padding: 0;
 
715
  color: #856404;
716
  }
717
 
718
+ .status-processing, .status-fetching_files, .status-saving, .status-uploading {
719
  background: #cfe2ff;
720
  color: #084298;
721
  }
 
732
 
733
  .status-cancelled {
734
  background: #e2e3e5;
735
+ color: #495057;
736
  }
737
 
738
+ .progress-bar-container {
739
+ background-color: #e0e0e0;
740
+ border-radius: 5px;
 
 
741
  overflow: hidden;
742
+ margin-top: 10px;
743
  }
744
 
745
+ .progress-bar {
746
+ height: 20px;
747
+ background-color: #667eea;
748
+ text-align: center;
749
+ line-height: 20px;
750
+ color: white;
751
+ transition: width 0.5s ease;
752
  }
753
 
754
  .job-details {
 
 
 
 
 
 
 
755
  font-size: 0.9em;
756
+ color: #555;
757
  }
758
 
759
+ .job-details p {
760
+ margin: 5px 0;
 
761
  }
762
 
763
+ .job-details strong {
764
  color: #333;
 
 
765
  }
766
 
767
+ .error-message {
768
+ color: #842029;
769
+ background: #f8d7da;
770
+ padding: 10px;
771
+ border-radius: 5px;
772
+ margin-top: 10px;
773
+ font-weight: 500;
774
  }
775
 
776
+ footer {
777
+ text-align: center;
778
+ margin-top: 30px;
779
+ color: rgba(255, 255, 255, 0.8);
780
+ font-size: 0.9em;
781
  }
782
+ </style>
783
+ <script>
784
+ const API_BASE = "/api";
785
+ let isProcessing = false;
786
+
787
+ function formatStatus(status) {
788
+ return status.replace('_', ' ').toUpperCase();
789
  }
790
+
791
+ function getStatusClass(status) {
792
+ return `status-${status}`;
 
 
793
  }
794
+
795
+ function updateStats(stats) {
796
+ document.getElementById('total-jobs').textContent = stats.total_jobs;
797
+ document.getElementById('completed-jobs').textContent = stats.completed_jobs;
798
+ document.getElementById('failed-jobs').textContent = stats.failed_jobs;
799
+ document.getElementById('processing-jobs').textContent = stats.processing_jobs;
800
+ document.getElementById('total-files').textContent = stats.total_files;
801
+ document.getElementById('processed-files').textContent = stats.total_files_processed;
802
+ document.getElementById('matched-transcriptions').textContent = stats.total_transcriptions_matched;
803
  }
804
+
805
+ function updateJobList(jobs) {
806
+ const jobList = document.getElementById('job-list');
807
+ jobList.innerHTML = '';
808
+
809
+ jobs.sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
810
+
811
+ jobs.forEach(job => {
812
+ const jobItem = document.createElement('div');
813
+ jobItem.className = 'job-item';
814
+
815
+ const statusClass = getStatusClass(job.status);
816
+ const progress = job.progress_percent.toFixed(1);
817
+
818
+ let uploadProgress = '';
819
+ if (job.status === 'uploading' && job.total_uploads > 0) {
820
+ // Display upload progress based on completed_uploads
821
+ const uploadPercent = (job.completed_uploads / job.total_uploads) * 100;
822
+ uploadProgress = `<p><strong>Upload Progress:</strong> ${job.completed_uploads} / ${job.total_uploads} files uploaded (${uploadPercent.toFixed(1)}%)</p>`;
823
+ }
824
+
825
+ jobItem.innerHTML = `
826
+ <div class="job-header">
827
+ <span class="job-id">${job.job_id}</span>
828
+ <span class="job-status ${statusClass}">${formatStatus(job.status)}</span>
829
+ </div>
830
+ <div class="job-details">
831
+ <p><strong>Created:</strong> ${new Date(job.created_at).toLocaleString()}</p>
832
+ ${job.started_at ? `<p><strong>Started:</strong> ${new Date(job.started_at).toLocaleString()}</p>` : ''}
833
+ ${job.completed_at ? `<p><strong>Completed:</strong> ${new Date(job.completed_at).toLocaleString()}</p>` : ''}
834
+ <p><strong>Files:</strong> ${job.processed_files} / ${job.total_files} processed</p>
835
+ <p><strong>Transcriptions Matched:</strong> ${job.matched_transcriptions}</p>
836
+ ${uploadProgress}
837
+ ${job.output_file ? `<p><strong>Output:</strong> <a href="${API_BASE}/jobs/${job.job_id}/output" target="_blank">${job.output_file.split('/').pop()}</a></p>` : ''}
838
+ ${job.error_message ? `<div class="error-message">Error: ${job.error_message}</div>` : ''}
839
+ </div>
840
+ <div class="progress-bar-container">
841
+ <div class="progress-bar" style="width: ${progress}%;">
842
+ ${progress}%
843
+ </div>
844
+ </div>
845
+ `;
846
+ jobList.appendChild(jobItem);
847
+ });
848
+
849
+ isProcessing = jobs.some(j => j.status === 'processing' || j.status === 'fetching_files' || j.status === 'saving' || j.status === 'uploading');
850
+ document.getElementById('create-job-btn').disabled = isProcessing;
851
  }
852
+
853
+ async function fetchData() {
854
+ try {
855
+ const [statsResponse, jobsResponse] = await Promise.all([
856
+ fetch(`${API_BASE}/stats`),
857
+ fetch(`${API_BASE}/jobs`)
858
+ ]);
859
+
860
+ const stats = await statsResponse.json();
861
+ const jobsData = await jobsResponse.json();
862
+
863
+ updateStats(stats);
864
+ updateJobList(jobsData.jobs);
865
+
866
+ } catch (error) {
867
+ console.error("Error fetching data:", error);
868
+ }
869
  }
870
+
871
+ async function createJob() {
872
+ if (isProcessing) return;
873
+
874
+ document.getElementById('create-job-btn').disabled = true;
875
+ document.getElementById('create-job-btn').textContent = 'Starting...';
876
+
877
+ try {
878
+ const response = await fetch(`${API_BASE}/jobs/create`, { method: 'POST' });
879
+ const result = await response.json();
880
+
881
+ if (response.ok) {
882
+ console.log("Job created:", result);
883
+ } else {
884
+ alert(`Failed to create job: ${result.detail || response.statusText}`);
885
+ }
886
+ } catch (error) {
887
+ console.error("Error creating job:", error);
888
+ alert("An error occurred while trying to create the job.");
889
+ } finally {
890
+ document.getElementById('create-job-btn').textContent = 'Start New Processing Job';
891
+ fetchData(); // Refresh immediately after attempt
892
+ }
893
  }
894
+
895
+ document.addEventListener('DOMContentLoaded', () => {
896
+ document.getElementById('create-job-btn').addEventListener('click', createJob);
897
+ fetchData();
898
+ setInterval(fetchData, 5000); // Refresh every 5 seconds
899
+ });
900
+ </script>
901
  </head>
902
  <body>
903
  <div class="container">
904
  <header>
905
+ <h1>Hugging Face Data Processor</h1>
906
+ <p class="subtitle">Automated processing and upload service for Helium/Data datasets.</p>
907
  <div class="controls">
908
+ <button id="create-job-btn">Start New Processing Job</button>
 
 
909
  </div>
 
910
  </header>
911
+
912
  <div class="grid">
913
  <div class="card">
914
+ <h2>Overall Statistics</h2>
915
  <div class="stat">
916
  <span class="stat-label">Total Jobs</span>
917
+ <span class="stat-value" id="total-jobs">0</span>
918
  </div>
919
  <div class="stat">
920
+ <span class="stat-label">Completed Jobs</span>
921
+ <span class="stat-value" id="completed-jobs">0</span>
922
  </div>
923
  <div class="stat">
924
+ <span class="stat-label">Failed Jobs</span>
925
+ <span class="stat-value" id="failed-jobs">0</span>
926
  </div>
927
  <div class="stat">
928
+ <span class="stat-label">Processing Jobs</span>
929
+ <span class="stat-value" id="processing-jobs">0</span>
930
  </div>
931
  </div>
 
932
  <div class="card">
933
+ <h2>Processing Totals</h2>
934
  <div class="stat">
935
+ <span class="stat-label">Total Files Found</span>
936
+ <span class="stat-value" id="total-files">0</span>
937
  </div>
938
  <div class="stat">
939
+ <span class="stat-label">Total Files Processed</span>
940
+ <span class="stat-value" id="processed-files">0</span>
941
  </div>
942
  <div class="stat">
943
  <span class="stat-label">Transcriptions Matched</span>
944
+ <span class="stat-value" id="matched-transcriptions">0</span>
945
  </div>
946
  </div>
947
  </div>
948
+
949
  <div class="job-list">
950
+ <h2>Recent Jobs</h2>
951
+ <div id="job-list">
952
+ <!-- Job items will be inserted here by JavaScript -->
953
+ </div>
954
  </div>
955
 
956
+ <footer>
957
+ Hugging Face Data Processor v1.0.0 | Running on Uvicorn/FastAPI
958
+ </footer>
959
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
960
  </body>
961
  </html>
962
  """
963
 
964
@app.get("/", response_class=HTMLResponse)
async def dashboard():
    """Serve the built-in monitoring dashboard at the site root.

    Returns the static ``DASHBOARD_HTML`` page; ``response_class`` tells
    FastAPI to send it with a ``text/html`` content type.
    """
    return DASHBOARD_HTML
969
  # ============================================================================
970
+ # Main Execution Block
971
  # ============================================================================
972
 
973
  def main():
 
 
 
974
  print("="*70)
975
+ print("Hugging Face Data Processor Server")
976
+ print(f"Dashboard: http://localhost:8000/")
977
+ print(f"Health Check: http://localhost:8000/api/health")
978
  print(f"Output Dir: {OUTPUT_DIR.absolute()}")
979
  print("="*70 + "\n")
980
 
 
986
  )
987
 
988
if __name__ == "__main__":
    # Fail fast with an installation hint if the optional dependency is
    # missing before the server is launched.
    # NOTE(review): the module already imports from huggingface_hub at the
    # top, so this guard likely never fires in practice — confirm intent.
    try:
        import huggingface_hub  # noqa: F401
    except ImportError:
        print(
            "The 'huggingface_hub' library is not installed. "
            "Please install it with: pip install huggingface-hub"
        )
        sys.exit(1)

    main()