Samfredoly committed on
Commit
0a799d9
·
verified ·
1 Parent(s): 1d50bd2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +912 -0
app.py ADDED
@@ -0,0 +1,912 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Hugging Face Data Processor - Single Unified Server
4
+
5
+ A complete, self-contained FastAPI application that:
6
+ 1. Automatically processes all courses from samelias1/Helium and samelias1/Data
7
+ 2. Merges frame data with cursor information
8
+ 3. Searches for exact transcription matches in samfred2/ATO
9
+ 4. Generates combined JSON output ready for upload to samfred2/ALL
10
+ 5. Provides REST API for monitoring and management
11
+ 6. Includes built-in web dashboard for real-time progress tracking
12
+
13
+ Run with: python app.py
14
+ Then open: http://localhost:8000/dashboard
15
+ """
16
+
17
+ import json
18
+ import asyncio
19
+ import os
20
+ import sys
21
+ from pathlib import Path
22
+ from typing import Optional, List, Dict, Any
23
+ from datetime import datetime
24
+ from enum import Enum
25
+ from collections import defaultdict
26
+ import traceback
27
+
28
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
29
+ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
30
+ from fastapi.staticfiles import StaticFiles
31
+ from fastapi.middleware.cors import CORSMiddleware
32
+ from pydantic import BaseModel
33
+ from huggingface_hub import hf_hub_download, HfApi
34
+ import uvicorn
35
+
36
+ # ============================================================================
37
+ # Configuration
38
+ # ============================================================================
39
+
40
# Hugging Face Hub dataset repositories used by the pipeline.
DATASET_HELIUM = "samelias1/Helium"  # frame data, one `*_frames.json` per course
DATASET_DATA = "samelias1/Data"  # cursor data, fetched under the same file name
DATASET_ATO = "samfred2/ATO"  # transcription files
DATASET_OUTPUT = "samfred2/ALL"  # upload target named in the module docstring

# Directory that receives the combined JSON output; created at import time.
OUTPUT_DIR = Path("./output")
OUTPUT_DIR.mkdir(exist_ok=True)
47
+
48
+ # ============================================================================
49
+ # Models & Enums
50
+ # ============================================================================
51
+
52
class JobStatus(str, Enum):
    """Lifecycle states of a processing job.

    Members are also ``str`` instances, so each one compares equal to its
    raw string value (the dashboard JavaScript relies on these strings).
    """

    # Created but the background task has not started yet.
    PENDING = "pending"
    # Listing the files of the source datasets.
    FETCHING_FILES = "fetching_files"
    # Iterating over the course files.
    PROCESSING = "processing"
    # Finished and the combined output was written.
    COMPLETED = "completed"
    # Aborted by an unexpected exception.
    FAILED = "failed"
    # Cancelled through the cancel endpoint.
    CANCELLED = "cancelled"
59
+
60
class ProcessingJob(BaseModel):
    """State record of one processing run, as stored in ``jobs_db``.

    Instances are mutated in place by the background task and returned
    directly by the job endpoints.
    """

    # Identifier chosen by ``create_job``.
    job_id: str
    # Current lifecycle state.
    status: JobStatus
    # Number of ``*_frames.json`` course files found for this run.
    total_files: int = 0
    # Number of course files handled so far (successfully or not).
    processed_files: int = 0
    # Number of courses for which a transcription was loaded.
    matched_transcriptions: int = 0
    # Failure or cancellation reason, if any.
    error_message: Optional[str] = None
    # ISO-8601 timestamps produced with ``datetime.utcnow().isoformat()``.
    created_at: str
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    # Path of the combined JSON file once the job has completed.
    output_file: Optional[str] = None
    # processed_files / total_files, expressed as a percentage.
    progress_percent: float = 0.0
72
+
73
+ # ============================================================================
74
+ # Global State
75
+ # ============================================================================
76
+
77
# In-memory registry of jobs keyed by job id; nothing is persisted.
jobs_db: Dict[str, ProcessingJob] = {}
# Held by create_job while it registers a new job in jobs_db.
jobs_lock = asyncio.Lock()
79
+
80
+ # ============================================================================
81
+ # FastAPI App Setup
82
+ # ============================================================================
83
+
84
app = FastAPI(
    title="Hugging Face Data Processor",
    description="Process and merge Hugging Face datasets automatically",
    version="1.0.0",
)

# CORS: the API may be called from any origin.
# Credentials stay disabled because wildcard origins/methods/headers must not
# be combined with allow_credentials=True. The API defines no cookie- or
# auth-based access, and the built-in dashboard is served same-origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
98
+
99
+ # ============================================================================
100
+ # Helper Functions
101
+ # ============================================================================
102
+
103
def get_hf_api() -> HfApi:
    """Return a newly constructed ``HfApi`` client with default arguments."""
    client = HfApi()
    return client
106
+
107
def list_dataset_files(dataset_id: str) -> List[str]:
    """Return the file names contained in a Hugging Face dataset repository.

    Args:
        dataset_id: Repository id of the dataset, e.g. ``"owner/name"``.

    Returns:
        The repository's file names, or an empty list when the listing
        fails for any reason (the error is printed, not raised).
    """
    try:
        print(f"[HF] Listing files from {dataset_id}...")
        names = list(
            get_hf_api().list_repo_files(repo_id=dataset_id, repo_type="dataset")
        )
        print(f"[HF] Found {len(names)} files in {dataset_id}")
    except Exception as exc:
        # Best effort: callers treat an empty list as "nothing to process".
        print(f"[ERROR] Failed to list files from {dataset_id}: {exc}")
        return []
    return names
119
+
120
def download_file(repo_id: str, file_name: str) -> Optional[str]:
    """Download one file of a Hugging Face dataset repository.

    Args:
        repo_id: Dataset repository id.
        file_name: Path of the file inside the repository.

    Returns:
        The local path returned by ``hf_hub_download``, or ``None`` when
        the download fails (the error is printed, not raised).
    """
    try:
        print(f"[DOWNLOAD] {file_name} from {repo_id}...")
        path = hf_hub_download(
            repo_id=repo_id,
            filename=file_name,
            repo_type="dataset",
        )
        # The check mark was stored mis-encoded; restored to U+2713.
        print(f"[DOWNLOAD] ✓ {file_name}")
        return path
    except Exception as e:
        # Best effort: a missing file is an expected outcome for callers.
        print(f"[ERROR] Failed to download {file_name}: {e}")
        return None
134
+
135
+ def load_json_file(file_path: str) -> Optional[Dict | List]:
136
+ """Load and parse a JSON file."""
137
+ try:
138
+ with open(file_path, "r") as f:
139
+ return json.load(f)
140
+ except Exception as e:
141
+ print(f"[ERROR] Failed to load JSON from {file_path}: {e}")
142
+ return None
143
+
144
def merge_course_data(helium_path: str, data_path: str) -> List[Dict]:
    """Combine the frame records of one course with their cursor records.

    Records are paired on the ``(course, image_path)`` values. Each output
    record is a copy of the Helium record, extended with the extra fields of
    the matching Data record when one exists. ``server_url`` and
    ``timestamp`` are then removed, and ``image_path`` is replaced by the
    record's 1-based position.

    Args:
        helium_path: Local path of the Helium JSON file.
        data_path: Local path of the Data JSON file.

    Returns:
        The merged records, or an empty list when either file is missing,
        empty or unparsable, or when merging raises.
    """
    pairing_fields = ("course", "image_path")
    try:
        frames = load_json_file(helium_path)
        cursors = load_json_file(data_path)
        if not (frames and cursors):
            return []

        # Index the cursor records by their pairing key, without the key fields.
        cursor_by_frame = {}
        for record in cursors:
            pairing_key = (record.get("course"), record.get("image_path"))
            cursor_by_frame[pairing_key] = {
                name: value
                for name, value in record.items()
                if name not in pairing_fields
            }

        combined = []
        for position, frame in enumerate(frames, start=1):
            pairing_key = (frame.get("course"), frame.get("image_path"))
            row = {**frame, **cursor_by_frame.get(pairing_key, {})}

            # Drop fields that are not wanted in the output.
            for unwanted in ("server_url", "timestamp"):
                row.pop(unwanted, None)

            # Frames are renumbered sequentially.
            row["image_path"] = position
            combined.append(row)

        return combined
    except Exception as exc:
        print(f"[ERROR] Failed to merge course data: {exc}")
        return []
181
+
182
def find_exact_transcription(course_name: str, ato_files: List[str]) -> Optional[str]:
    """Return the ATO transcription file that matches a course file exactly.

    The expected name is the course file name with its trailing
    ``_frames.json`` replaced by ``.json``. Only the suffix is rewritten, so
    the same text occurring earlier in the name (for example in a directory
    component) is left intact. A name without the suffix is looked up
    unchanged.

    Args:
        course_name: Course file name, normally ending in ``_frames.json``.
        ato_files: File names available in the ATO dataset.

    Returns:
        The matching file name, or ``None`` when there is no exact match.
    """
    suffix = "_frames.json"
    if course_name.endswith(suffix):
        expected_file = course_name[: -len(suffix)] + ".json"
    else:
        expected_file = course_name

    if expected_file in ato_files:
        print(f"[MATCH] Found transcription: {expected_file}")
        return expected_file

    return None
191
+
192
async def process_single_course(
    course_file: str,
    job: ProcessingJob,
    ato_files: List[str]
) -> Optional[Dict]:
    """Process one course: merge its data and attach a transcription if found.

    The blocking steps (hub downloads, JSON parsing, merging) run in worker
    threads via ``asyncio.to_thread`` so the event loop stays free to serve
    API requests while a job is running.

    Args:
        course_file: Name of the course's ``*_frames.json`` file.
        job: Job record; ``matched_transcriptions`` is incremented when a
            transcription is loaded.
        ato_files: File names available in the ATO dataset.

    Returns:
        A dict with ``course``, ``frames`` and ``transcription`` keys
        (``transcription`` is the string ``"none"`` when unavailable), or
        ``None`` when the course is skipped or processing fails.
    """
    try:
        print(f"\n[PROCESS] Course: {course_file}")

        # Download from Helium and Data
        helium_path = await asyncio.to_thread(download_file, DATASET_HELIUM, course_file)
        data_path = await asyncio.to_thread(download_file, DATASET_DATA, course_file)

        if not helium_path or not data_path:
            print(f"[SKIP] Missing data files for {course_file}")
            return None

        # Merge frame data
        merged_frames = await asyncio.to_thread(merge_course_data, helium_path, data_path)
        if not merged_frames:
            print(f"[SKIP] Failed to merge data for {course_file}")
            return None

        # Try to find and download transcription
        transcription_data = None
        expected_ato_file = find_exact_transcription(course_file, ato_files)

        if expected_ato_file:
            ato_path = await asyncio.to_thread(download_file, DATASET_ATO, expected_ato_file)
            if ato_path:
                transcription_data = await asyncio.to_thread(load_json_file, ato_path)
                if transcription_data:
                    job.matched_transcriptions += 1

        # Course name = file name without its trailing "_frames.json";
        # only the suffix is stripped, never an earlier occurrence.
        suffix = "_frames.json"
        if course_file.endswith(suffix):
            course_name = course_file[: -len(suffix)]
        else:
            course_name = course_file

        # Prepare output: frames + transcription (or "none")
        return {
            "course": course_name,
            "frames": merged_frames,
            "transcription": transcription_data if transcription_data else "none",
        }

    except Exception as e:
        print(f"[ERROR] Failed to process {course_file}: {e}")
        traceback.print_exc()
        return None
240
+
241
def _write_json_output(path: Path, payload: List[Dict]) -> None:
    """Write *payload* to *path* as indented JSON (blocking file I/O)."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


async def process_all_courses_background(job_id: str):
    """Run a complete processing job in the background.

    Lists the Helium and ATO datasets, processes every ``*_frames.json``
    course file, writes the combined result to ``OUTPUT_DIR``, and keeps the
    job record in ``jobs_db`` up to date.

    Cancellation is honoured: when the job's status has been set to
    ``CANCELLED`` (by the cancel endpoint), the run stops at the next
    checkpoint and the cancelled state is left untouched instead of being
    overwritten with ``COMPLETED``. Blocking hub and file operations run in
    worker threads so the API stays responsive.

    Args:
        job_id: Id of a job previously registered in ``jobs_db``; unknown
            ids are ignored.
    """
    job = jobs_db.get(job_id)
    if not job:
        return

    def cancelled() -> bool:
        return job.status == JobStatus.CANCELLED

    if cancelled():
        # Cancelled before the background task got to run.
        return

    rule = "=" * 70
    try:
        job.status = JobStatus.FETCHING_FILES
        job.started_at = datetime.utcnow().isoformat()

        print(f"\n{rule}")
        print(f"[JOB] Starting job: {job_id}")
        print(f"{rule}\n")

        # Fetch file lists from all datasets
        print("[INIT] Fetching file lists from datasets...")
        helium_files = await asyncio.to_thread(list_dataset_files, DATASET_HELIUM)
        ato_files = await asyncio.to_thread(list_dataset_files, DATASET_ATO)

        # Filter to only _frames.json files from Helium
        course_files = [f for f in helium_files if f.endswith("_frames.json")]
        job.total_files = len(course_files)

        print(f"[INIT] Found {len(course_files)} courses to process")
        print(f"[INIT] Found {len(ato_files)} files in ATO dataset\n")

        if cancelled():
            print(f"[JOB] Job {job_id} cancelled before processing started")
            return

        # Process each course
        job.status = JobStatus.PROCESSING
        all_courses = []

        for idx, course_file in enumerate(course_files):
            if cancelled():
                print(f"[JOB] Job {job_id} cancelled; stopping")
                return

            try:
                course_data = await process_single_course(course_file, job, ato_files)
                if course_data:
                    all_courses.append(course_data)

                job.processed_files = idx + 1
                job.progress_percent = (job.processed_files / job.total_files) * 100

                print(f"[PROGRESS] {job.processed_files}/{job.total_files} ({job.progress_percent:.1f}%)")

                # Small delay to avoid rate limiting
                await asyncio.sleep(0.05)

            except Exception as e:
                print(f"[ERROR] Failed to process {course_file}: {e}")
                continue

        if cancelled():
            print(f"[JOB] Job {job_id} cancelled; output not written")
            return

        # Save combined output
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        output_file = OUTPUT_DIR / f"combined_output_{timestamp}.json"

        print(f"\n[SAVE] Saving output to {output_file}...")
        await asyncio.to_thread(_write_json_output, output_file, all_courses)

        if cancelled():
            # Cancelled while the file was being written: keep that state.
            return

        job.output_file = str(output_file)
        job.status = JobStatus.COMPLETED
        job.completed_at = datetime.utcnow().isoformat()

        print(f"\n{rule}")
        print("[SUCCESS] Job completed!")
        print(rule)
        print(f"Total courses processed: {len(all_courses)}")
        print(f"Transcriptions matched: {job.matched_transcriptions}")
        print(f"Output file: {output_file}")
        print(f"File size: {output_file.stat().st_size / (1024*1024):.2f} MB")
        print(f"{rule}\n")

    except Exception as e:
        job.status = JobStatus.FAILED
        job.error_message = str(e)
        job.completed_at = datetime.utcnow().isoformat()
        print(f"\n[FAILED] Job failed: {e}")
        traceback.print_exc()
316
+
317
+ # ============================================================================
318
+ # API Endpoints
319
+ # ============================================================================
320
+
321
@app.get("/")
async def root():
    """Report that the service is running and where the dashboard lives."""
    info = dict(
        status="running",
        service="Hugging Face Data Processor",
        version="1.0.0",
        dashboard="http://localhost:8000/dashboard",
    )
    return info
330
+
331
@app.post("/api/jobs/create")
async def create_job(background_tasks: BackgroundTasks):
    """Create a processing job and schedule it to run in the background.

    The job id is derived from the current UTC time with one-second
    resolution. If a job with that id already exists (two requests within
    the same second), a numeric suffix (``_2``, ``_3``, ...) is appended so
    the earlier job is never overwritten in ``jobs_db``.

    Args:
        background_tasks: FastAPI task queue used to start the processing.

    Returns:
        A dict with the new ``job_id``, a ``status`` of ``"started"`` and a
        human-readable ``message``.
    """
    now = datetime.utcnow()
    base_id = f"job_{now.strftime('%Y%m%d_%H%M%S')}"

    # Pick the id and register the job under the same lock so concurrent
    # requests cannot claim the same id.
    async with jobs_lock:
        job_id = base_id
        attempt = 1
        while job_id in jobs_db:
            attempt += 1
            job_id = f"{base_id}_{attempt}"

        jobs_db[job_id] = ProcessingJob(
            job_id=job_id,
            status=JobStatus.PENDING,
            created_at=now.isoformat(),
        )

    # Start processing in background
    background_tasks.add_task(process_all_courses_background, job_id)

    return {
        "job_id": job_id,
        "status": "started",
        "message": "Processing job created and started"
    }
353
+
354
@app.get("/api/jobs/{job_id}")
async def get_job_status(job_id: str):
    """Return the stored record of one job.

    Raises:
        HTTPException: 404 when no job with ``job_id`` exists.
    """
    found = jobs_db.get(job_id)
    if found:
        return found
    raise HTTPException(status_code=404, detail="Job not found")
362
+
363
@app.get("/api/jobs")
async def list_jobs():
    """Return every known job together with the job count."""
    snapshot = list(jobs_db.values())
    return {"total_jobs": len(jobs_db), "jobs": snapshot}
370
+
371
@app.post("/api/jobs/{job_id}/cancel")
async def cancel_job(job_id: str):
    """Mark a job as cancelled.

    Only the job record is updated here (status, message, completion time).

    Raises:
        HTTPException: 404 when the job is unknown; 400 when it is already
            completed, failed or cancelled.
    """
    finished_states = (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED)

    target = jobs_db.get(job_id)
    if not target:
        raise HTTPException(status_code=404, detail="Job not found")
    if target.status in finished_states:
        raise HTTPException(status_code=400, detail="Cannot cancel completed or failed job")

    target.status = JobStatus.CANCELLED
    target.error_message = "Job cancelled by user"
    target.completed_at = datetime.utcnow().isoformat()

    return {"status": "cancelled", "job_id": job_id}
386
+
387
@app.get("/api/jobs/{job_id}/output")
async def get_job_output(job_id: str):
    """Download the combined output JSON of a completed job.

    Returns:
        A ``FileResponse`` streaming the job's output file.

    Raises:
        HTTPException: 404 when the job is unknown, has no output file
            recorded, or the file no longer exists on disk; 400 when the
            job has not completed.
    """
    job = jobs_db.get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.status != JobStatus.COMPLETED:
        raise HTTPException(status_code=400, detail="Job not completed yet")

    if not job.output_file:
        raise HTTPException(status_code=404, detail="Output file not found")

    # Constructing a FileResponse does not touch the file, so a missing file
    # has to be detected explicitly to return a clean 404.
    output_path = Path(job.output_file)
    if not output_path.is_file():
        raise HTTPException(status_code=404, detail="Output file not found")

    return FileResponse(
        path=job.output_file,
        filename=output_path.name,
        media_type="application/json"
    )
408
+
409
@app.get("/api/stats")
async def get_stats():
    """Aggregate counters over every job in ``jobs_db``.

    Returns:
        A dict with the number of jobs per state group (completed, failed,
        processing or fetching) and the summed file and transcription
        counters of all jobs.
    """
    active_states = (JobStatus.PROCESSING, JobStatus.FETCHING_FILES)
    completed = failed = processing = 0
    total_files = total_processed = total_matched = 0

    for job in jobs_db.values():
        if job.status == JobStatus.COMPLETED:
            completed += 1
        elif job.status == JobStatus.FAILED:
            failed += 1
        elif job.status in active_states:
            processing += 1

        total_files += job.total_files
        total_processed += job.processed_files
        total_matched += job.matched_transcriptions

    return {
        "total_jobs": len(jobs_db),
        "completed_jobs": completed,
        "failed_jobs": failed,
        "processing_jobs": processing,
        "total_files_processed": total_processed,
        "total_files": total_files,
        "total_transcriptions_matched": total_matched,
    }
430
+
431
+ # ============================================================================
432
+ # Web Dashboard
433
+ # ============================================================================
434
+
435
+ DASHBOARD_HTML = """
436
+ <!DOCTYPE html>
437
+ <html lang="en">
438
+ <head>
439
+ <meta charset="UTF-8">
440
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
441
+ <title>Hugging Face Data Processor</title>
442
+ <style>
443
+ * {
444
+ margin: 0;
445
+ padding: 0;
446
+ box-sizing: border-box;
447
+ }
448
+
449
+ body {
450
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
451
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
452
+ min-height: 100vh;
453
+ padding: 20px;
454
+ }
455
+
456
+ .container {
457
+ max-width: 1200px;
458
+ margin: 0 auto;
459
+ }
460
+
461
+ header {
462
+ background: rgba(255, 255, 255, 0.95);
463
+ padding: 30px;
464
+ border-radius: 12px;
465
+ margin-bottom: 30px;
466
+ box-shadow: 0 10px 40px rgba(0, 0, 0, 0.1);
467
+ }
468
+
469
+ h1 {
470
+ color: #333;
471
+ margin-bottom: 10px;
472
+ font-size: 2.5em;
473
+ }
474
+
475
+ .subtitle {
476
+ color: #666;
477
+ font-size: 1.1em;
478
+ }
479
+
480
+ .controls {
481
+ display: flex;
482
+ gap: 15px;
483
+ margin-top: 20px;
484
+ flex-wrap: wrap;
485
+ }
486
+
487
+ button {
488
+ background: #667eea;
489
+ color: white;
490
+ border: none;
491
+ padding: 12px 24px;
492
+ border-radius: 6px;
493
+ cursor: pointer;
494
+ font-size: 1em;
495
+ font-weight: 600;
496
+ transition: all 0.3s ease;
497
+ }
498
+
499
+ button:hover {
500
+ background: #764ba2;
501
+ transform: translateY(-2px);
502
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2);
503
+ }
504
+
505
+ button:disabled {
506
+ background: #ccc;
507
+ cursor: not-allowed;
508
+ transform: none;
509
+ }
510
+
511
+ .grid {
512
+ display: grid;
513
+ grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
514
+ gap: 20px;
515
+ margin-bottom: 30px;
516
+ }
517
+
518
+ .card {
519
+ background: rgba(255, 255, 255, 0.95);
520
+ padding: 25px;
521
+ border-radius: 12px;
522
+ box-shadow: 0 10px 40px rgba(0, 0, 0, 0.1);
523
+ }
524
+
525
+ .card h2 {
526
+ color: #333;
527
+ margin-bottom: 15px;
528
+ font-size: 1.3em;
529
+ }
530
+
531
+ .stat {
532
+ display: flex;
533
+ justify-content: space-between;
534
+ padding: 10px 0;
535
+ border-bottom: 1px solid #eee;
536
+ }
537
+
538
+ .stat:last-child {
539
+ border-bottom: none;
540
+ }
541
+
542
+ .stat-label {
543
+ color: #666;
544
+ font-weight: 500;
545
+ }
546
+
547
+ .stat-value {
548
+ color: #333;
549
+ font-weight: 700;
550
+ font-size: 1.1em;
551
+ }
552
+
553
+ .job-list {
554
+ background: rgba(255, 255, 255, 0.95);
555
+ padding: 25px;
556
+ border-radius: 12px;
557
+ box-shadow: 0 10px 40px rgba(0, 0, 0, 0.1);
558
+ }
559
+
560
+ .job-item {
561
+ padding: 20px;
562
+ border: 1px solid #eee;
563
+ border-radius: 8px;
564
+ margin-bottom: 15px;
565
+ background: #f9f9f9;
566
+ }
567
+
568
+ .job-header {
569
+ display: flex;
570
+ justify-content: space-between;
571
+ align-items: center;
572
+ margin-bottom: 15px;
573
+ }
574
+
575
+ .job-id {
576
+ font-family: monospace;
577
+ color: #667eea;
578
+ font-weight: 600;
579
+ }
580
+
581
+ .job-status {
582
+ padding: 6px 12px;
583
+ border-radius: 20px;
584
+ font-size: 0.9em;
585
+ font-weight: 600;
586
+ }
587
+
588
+ .status-pending {
589
+ background: #fff3cd;
590
+ color: #856404;
591
+ }
592
+
593
+ .status-processing {
594
+ background: #cfe2ff;
595
+ color: #084298;
596
+ }
597
+
598
+ .status-completed {
599
+ background: #d1e7dd;
600
+ color: #0f5132;
601
+ }
602
+
603
+ .status-failed {
604
+ background: #f8d7da;
605
+ color: #842029;
606
+ }
607
+
608
+ .status-cancelled {
609
+ background: #e2e3e5;
610
+ color: #383d41;
611
+ }
612
+
613
+ .progress-bar {
614
+ width: 100%;
615
+ height: 8px;
616
+ background: #e0e0e0;
617
+ border-radius: 4px;
618
+ overflow: hidden;
619
+ margin: 10px 0;
620
+ }
621
+
622
+ .progress-fill {
623
+ height: 100%;
624
+ background: linear-gradient(90deg, #667eea, #764ba2);
625
+ transition: width 0.3s ease;
626
+ }
627
+
628
+ .job-details {
629
+ display: grid;
630
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
631
+ gap: 15px;
632
+ margin-top: 15px;
633
+ }
634
+
635
+ .detail {
636
+ font-size: 0.9em;
637
+ }
638
+
639
+ .detail-label {
640
+ color: #666;
641
+ font-weight: 500;
642
+ }
643
+
644
+ .detail-value {
645
+ color: #333;
646
+ font-weight: 600;
647
+ margin-top: 5px;
648
+ }
649
+
650
+ .message {
651
+ padding: 15px;
652
+ border-radius: 6px;
653
+ margin-bottom: 15px;
654
+ display: none;
655
+ }
656
+
657
+ .message.show {
658
+ display: block;
659
+ }
660
+
661
+ .message.success {
662
+ background: #d1e7dd;
663
+ color: #0f5132;
664
+ border: 1px solid #badbcc;
665
+ }
666
+
667
+ .message.error {
668
+ background: #f8d7da;
669
+ color: #842029;
670
+ border: 1px solid #f5c2c7;
671
+ }
672
+
673
+ .loading {
674
+ display: inline-block;
675
+ width: 20px;
676
+ height: 20px;
677
+ border: 3px solid #f3f3f3;
678
+ border-top: 3px solid #667eea;
679
+ border-radius: 50%;
680
+ animation: spin 1s linear infinite;
681
+ }
682
+
683
+ @keyframes spin {
684
+ 0% { transform: rotate(0deg); }
685
+ 100% { transform: rotate(360deg); }
686
+ }
687
+
688
+ .empty {
689
+ text-align: center;
690
+ padding: 40px;
691
+ color: #999;
692
+ }
693
+
694
+ .footer {
695
+ text-align: center;
696
+ color: rgba(255, 255, 255, 0.8);
697
+ margin-top: 30px;
698
+ font-size: 0.9em;
699
+ }
700
+ </style>
701
+ </head>
702
+ <body>
703
+ <div class="container">
704
+ <header>
705
+ <h1>🚀 Hugging Face Data Processor</h1>
706
+ <p class="subtitle">Automated dataset processing and merging</p>
707
+ <div class="controls">
708
+ <button onclick="createJob()" id="createBtn">Create New Job</button>
709
+ <button onclick="refreshAll()">Refresh</button>
710
+ <button onclick="clearJobs()">Clear All</button>
711
+ </div>
712
+ <div id="message" class="message"></div>
713
+ </header>
714
+
715
+ <div class="grid">
716
+ <div class="card">
717
+ <h2>📊 Statistics</h2>
718
+ <div class="stat">
719
+ <span class="stat-label">Total Jobs</span>
720
+ <span class="stat-value" id="totalJobs">0</span>
721
+ </div>
722
+ <div class="stat">
723
+ <span class="stat-label">Completed</span>
724
+ <span class="stat-value" id="completedJobs">0</span>
725
+ </div>
726
+ <div class="stat">
727
+ <span class="stat-label">Processing</span>
728
+ <span class="stat-value" id="processingJobs">0</span>
729
+ </div>
730
+ <div class="stat">
731
+ <span class="stat-label">Failed</span>
732
+ <span class="stat-value" id="failedJobs">0</span>
733
+ </div>
734
+ </div>
735
+
736
+ <div class="card">
737
+ <h2>📈 Progress</h2>
738
+ <div class="stat">
739
+ <span class="stat-label">Files Processed</span>
740
+ <span class="stat-value" id="filesProcessed">0</span>
741
+ </div>
742
+ <div class="stat">
743
+ <span class="stat-label">Total Files</span>
744
+ <span class="stat-value" id="totalFiles">0</span>
745
+ </div>
746
+ <div class="stat">
747
+ <span class="stat-label">Transcriptions Matched</span>
748
+ <span class="stat-value" id="transcriptionsMatched">0</span>
749
+ </div>
750
+ </div>
751
+ </div>
752
+
753
+ <div class="job-list">
754
+ <h2>📋 Processing Jobs</h2>
755
+ <div id="jobsList" class="empty">No jobs yet. Create one to get started!</div>
756
+ </div>
757
+
758
+ <div class="footer">
759
+ <p>Hugging Face Data Processor v1.0 | API: http://localhost:8000/docs</p>
760
+ </div>
761
+ </div>
762
+
763
+ <script>
764
+ const API_BASE = '/api';
765
+ let autoRefresh = setInterval(refreshAll, 2000);
766
+
767
+ function showMessage(text, type = 'success') {
768
+ const msg = document.getElementById('message');
769
+ msg.textContent = text;
770
+ msg.className = `message show ${type}`;
771
+ setTimeout(() => msg.classList.remove('show'), 5000);
772
+ }
773
+
774
+ async function createJob() {
775
+ try {
776
+ document.getElementById('createBtn').disabled = true;
777
+ const res = await fetch(`${API_BASE}/jobs/create`, { method: 'POST' });
778
+ const data = await res.json();
779
+ showMessage(`Job created: ${data.job_id}`, 'success');
780
+ refreshAll();
781
+ } catch (e) {
782
+ showMessage(`Error: ${e.message}`, 'error');
783
+ } finally {
784
+ document.getElementById('createBtn').disabled = false;
785
+ }
786
+ }
787
+
788
+ async function cancelJob(jobId) {
789
+ if (!confirm(`Cancel job ${jobId}?`)) return;
790
+ try {
791
+ await fetch(`${API_BASE}/jobs/${jobId}/cancel`, { method: 'POST' });
792
+ showMessage('Job cancelled', 'success');
793
+ refreshAll();
794
+ } catch (e) {
795
+ showMessage(`Error: ${e.message}`, 'error');
796
+ }
797
+ }
798
+
799
+ async function downloadOutput(jobId) {
800
+ try {
801
+ window.location.href = `${API_BASE}/jobs/${jobId}/output`;
802
+ } catch (e) {
803
+ showMessage(`Error: ${e.message}`, 'error');
804
+ }
805
+ }
806
+
807
+ async function refreshAll() {
808
+ try {
809
+ const statsRes = await fetch(`${API_BASE}/stats`);
810
+ const stats = await statsRes.json();
811
+
812
+ const jobsRes = await fetch(`${API_BASE}/jobs`);
813
+ const jobsData = await jobsRes.json();
814
+
815
+ // Update stats
816
+ document.getElementById('totalJobs').textContent = stats.total_jobs;
817
+ document.getElementById('completedJobs').textContent = stats.completed_jobs;
818
+ document.getElementById('processingJobs').textContent = stats.processing_jobs;
819
+ document.getElementById('failedJobs').textContent = stats.failed_jobs;
820
+ document.getElementById('filesProcessed').textContent = stats.total_files_processed;
821
+ document.getElementById('totalFiles').textContent = stats.total_files;
822
+ document.getElementById('transcriptionsMatched').textContent = stats.total_transcriptions_matched;
823
+
824
+ // Update jobs list
825
+ const jobsList = document.getElementById('jobsList');
826
+ if (jobsData.jobs.length === 0) {
827
+ jobsList.innerHTML = '<div class="empty">No jobs yet. Create one to get started!</div>';
828
+ } else {
829
+ jobsList.innerHTML = jobsData.jobs.map(job => `
830
+ <div class="job-item">
831
+ <div class="job-header">
832
+ <span class="job-id">${job.job_id}</span>
833
+ <span class="job-status status-${job.status}">${job.status.toUpperCase()}</span>
834
+ </div>
835
+ ${job.status === 'processing' || job.status === 'fetching_files' ? `
836
+ <div class="progress-bar">
837
+ <div class="progress-fill" style="width: ${job.progress_percent}%"></div>
838
+ </div>
839
+ ` : ''}
840
+ <div class="job-details">
841
+ <div class="detail">
842
+ <div class="detail-label">Files</div>
843
+ <div class="detail-value">${job.processed_files}/${job.total_files}</div>
844
+ </div>
845
+ <div class="detail">
846
+ <div class="detail-label">Transcriptions</div>
847
+ <div class="detail-value">${job.matched_transcriptions}</div>
848
+ </div>
849
+ <div class="detail">
850
+ <div class="detail-label">Created</div>
851
+ <div class="detail-value">${new Date(job.created_at).toLocaleString()}</div>
852
+ </div>
853
+ </div>
854
+ ${job.error_message ? `<div style="color: #d32f2f; margin-top: 10px;">Error: ${job.error_message}</div>` : ''}
855
+ <div style="margin-top: 15px; display: flex; gap: 10px;">
856
+ ${job.status === 'processing' || job.status === 'pending' || job.status === 'fetching_files' ? `
857
+ <button onclick="cancelJob('${job.job_id}')" style="background: #d32f2f;">Cancel</button>
858
+ ` : ''}
859
+ ${job.status === 'completed' && job.output_file ? `
860
+ <button onclick="downloadOutput('${job.job_id}')" style="background: #4caf50;">Download Output</button>
861
+ ` : ''}
862
+ </div>
863
+ </div>
864
+ `).join('');
865
+ }
866
+ } catch (e) {
867
+ console.error('Refresh error:', e);
868
+ }
869
+ }
870
+
871
+ function clearJobs() {
872
+ if (confirm('Clear all jobs from memory? (This does not delete output files)')) {
873
+ location.reload();
874
+ }
875
+ }
876
+
877
+ // Initial load
878
+ refreshAll();
879
+ </script>
880
+ </body>
881
+ </html>
882
+ """
883
+
884
@app.get("/dashboard")
async def dashboard():
    """Return the embedded dashboard page as an HTML response."""
    page = HTMLResponse(content=DASHBOARD_HTML)
    return page
888
+
889
+ # ============================================================================
890
+ # Main Entry Point
891
+ # ============================================================================
892
+
893
def main():
    """Print the startup banner and run the server with uvicorn.

    The server listens on all interfaces (``0.0.0.0``) on port 8000.
    """
    rule = "=" * 70
    print("\n" + rule)
    # The rocket emoji was stored mis-encoded; restored to U+1F680.
    print("🚀 Hugging Face Data Processor - Starting Server")
    print(rule)
    print("API Base URL: http://localhost:8000")
    print("Dashboard: http://localhost:8000/dashboard")
    print("Swagger UI: http://localhost:8000/docs")
    print(f"Output Dir: {OUTPUT_DIR.absolute()}")
    print(rule + "\n")

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        log_level="info",
    )


if __name__ == "__main__":
    main()