factorstudios committed on
Commit
e30b533
·
verified ·
1 Parent(s): a31ce2e

Create transcription_server.py

Browse files
Files changed (1) hide show
  1. transcription_server.py +492 -0
transcription_server.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import tempfile
4
+ import shutil
5
+ from pathlib import Path
6
+ from datetime import datetime
7
+ from dotenv import load_dotenv
8
+
9
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
10
+ from fastapi.responses import HTMLResponse, JSONResponse
11
+ from fastapi.staticfiles import StaticFiles
12
+ from pydantic import BaseModel
13
+ import uvicorn
14
+
15
# Heavy optional dependencies are imported behind a guard so a missing package
# produces a one-line message instead of a traceback.
try:
    from huggingface_hub import hf_hub_download, upload_file, list_repo_files
    import whisper
except ImportError as e:
    # SystemExit with a string argument writes the message to stderr and exits
    # with status 1. The bare exit() helper is injected by the `site` module
    # and is not guaranteed to exist (e.g. `python -S`, frozen builds).
    raise SystemExit(f"Missing dependency: {e}")

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Without a token neither the download nor the upload call can
    # authenticate, so refuse to start.
    raise SystemExit("Error: HF_TOKEN not found in .env file")

app = FastAPI(title="Movie Transcription Service")

# In-memory job tracking: job_id -> dict with at least a "status" key.
# Contents are lost when the process restarts.
jobs = {}
33
+
34
class TranscriptionRequest(BaseModel):
    """JSON body accepted by the POST /transcribe endpoint."""

    # Reference to the file to transcribe: either a huggingface.co dataset URL
    # or the short "owner/repo/path" form understood by extract_dataset_info().
    dataset_link: str
    # Model name handed to whisper.load_model(); "small" when the client omits it.
    model_size: str = "small"
37
+
38
def format_timestamp(seconds: float) -> str:
    """Render a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are dropped (not rounded). Hours are not wrapped at 24,
    so long durations simply produce a wider hour field.
    """
    fields = (
        seconds // 3600,        # whole hours
        seconds % 3600 // 60,   # whole minutes left within the hour
        seconds % 60,           # seconds left within the minute
    )
    return ":".join(f"{int(field):02d}" for field in fields)
44
+
45
def transcribe_with_timestamps(video_path: str, model_size: str) -> str:
    """Transcribe a media file with Whisper and return a timestamped transcript.

    Args:
        video_path: Path of the file passed to the model's ``transcribe`` call.
        model_size: Model name passed to ``whisper.load_model``.

    Returns:
        A newline-joined document: a banner header with the generation time,
        followed by one ``[HH:MM:SS] text`` line per non-empty segment. When
        the result carries no "segments" key, the plain "text" value is
        appended instead.
    """
    print(f"Loading Whisper model: {model_size}")
    whisper_model = whisper.load_model(model_size)

    print(f"Transcribing audio from: {video_path}")
    outcome = whisper_model.transcribe(video_path)

    # Header block of the transcript document.
    rule = "=" * 80
    lines = [
        rule,
        "MOVIE TRANSCRIPTION WITH TIMESTAMPS",
        rule,
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
    ]

    if "segments" not in outcome:
        # No per-segment timing available: fall back to the flat text.
        lines.append(outcome.get("text", ""))
        return "\n".join(lines)

    for seg in outcome["segments"]:
        stamp = format_timestamp(seg["start"])
        spoken = seg["text"].strip()
        if not spoken:
            continue  # skip segments that contain only whitespace
        lines.append(f"[{stamp}] {spoken}")

    return "\n".join(lines)
72
+
73
def extract_dataset_info(dataset_link: str) -> tuple:
    """Extract ``(repo_id, filename)`` from a dataset link.

    Accepted forms:
        * ``https://huggingface.co/datasets/<owner>/<repo>/blob/<rev>/<path>``
        * ``https://huggingface.co/datasets/<owner>/<repo>/resolve/<rev>/<path>``
        * ``<owner>/<repo>/<path>``

    For URLs, the query string and fragment are discarded and percent-escapes
    are decoded, so a link copied from a browser download button works as-is.
    A URL with neither ``blob`` nor ``resolve`` after the repository falls
    back to its last path component.

    Args:
        dataset_link: URL or short reference; surrounding whitespace is ignored.

    Returns:
        ``(repo_id, filename)`` where ``repo_id`` is ``"owner/repo"`` and
        ``filename`` is the path of the file inside the repository.

    Raises:
        ValueError: If the link cannot be split into owner, repo and a
            non-empty filename.
    """
    from urllib.parse import unquote, urlsplit

    link = dataset_link.strip()

    if "huggingface.co" in link:
        # Work on the path only: "?download=true" and "#..." are not part of
        # the filename, and "%20" etc. must be decoded to the real name.
        url_path = unquote(urlsplit(link).path)
        parts = [part for part in url_path.split("/") if part]
        if "datasets" in parts:
            idx = parts.index("datasets")
            owner_repo = parts[idx + 1:idx + 3]
            rest = parts[idx + 3:]
            if rest[:1] in (["blob"], ["resolve"]):
                # rest == [marker, revision, *file path]; the revision is
                # assumed to be a single path component (e.g. "main").
                filename = "/".join(rest[2:])
            else:
                filename = rest[-1] if rest else ""
            # Slicing never raises, so a truncated URL lands here and is
            # reported as ValueError below rather than IndexError.
            if len(owner_repo) == 2 and filename:
                return "/".join(owner_repo), filename
    else:
        # Assume it's in format: owner/repo/filename
        parts = link.strip("/").split("/")
        if len(parts) >= 3 and parts[1]:
            return f"{parts[0]}/{parts[1]}", "/".join(parts[2:])

    raise ValueError(f"Cannot parse dataset link: {link}")
105
+
106
def _run_transcription_job(job_id: str, dataset_link: str, model_size: str) -> None:
    """Blocking implementation of one job: download, transcribe, upload.

    Progress and results are written into ``jobs[job_id]``. Any exception is
    recorded there as ``status="failed"`` plus an ``error`` string rather
    than propagated.
    """
    # setdefault keeps the error path below from raising KeyError when the
    # job id was never registered.
    job = jobs.setdefault(job_id, {})
    try:
        job["status"] = "extracting_info"

        # Parse dataset link
        repo_id, filename = extract_dataset_info(dataset_link)
        job["repo_id"] = repo_id
        job["filename"] = filename
        base_name = os.path.basename(filename)

        # Scratch directory for the video copy and the transcript file.
        temp_dir = tempfile.mkdtemp()
        try:
            job["status"] = "downloading"
            print(f"Downloading {filename} from {repo_id}...")

            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                repo_type="dataset",
                token=HF_TOKEN,
            )

            # Resolve symlink if needed so the real file is what gets copied.
            if os.path.islink(local_path):
                local_path = os.path.realpath(local_path)

            # Copy to temp location
            video_path = os.path.join(temp_dir, base_name)
            shutil.copy2(local_path, video_path)

            job["status"] = "transcribing"
            print("Starting transcription...")
            transcript = transcribe_with_timestamps(video_path, model_size)

            # "<video name without extension>.transcript.txt"
            transcript_filename = os.path.splitext(base_name)[0] + ".transcript.txt"
            transcript_path = os.path.join(temp_dir, transcript_filename)
            with open(transcript_path, "w", encoding="utf-8") as f:
                f.write(transcript)

            job["status"] = "uploading"
            print("Uploading transcript to dataset...")

            # Transcripts live under transcriptions/ in the same dataset repo.
            repo_upload_path = f"transcriptions/{transcript_filename}"
            upload_file(
                path_or_fileobj=transcript_path,
                path_in_repo=repo_upload_path,
                repo_id=repo_id,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"Add transcription for {base_name}",
            )

            job["status"] = "completed"
            job["transcript_path"] = repo_upload_path
            print(f"✓ Transcription completed and uploaded to {repo_upload_path}")
        finally:
            # Best-effort cleanup; a failure to delete must not mask the result.
            shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        # Top-level boundary of the job: surface the failure through the job
        # record, which is what the /status endpoint returns.
        job["status"] = "failed"
        job["error"] = str(e)
        print(f"✗ Error: {e}")


async def process_transcription(job_id: str, dataset_link: str, model_size: str):
    """Background task to process transcription and upload.

    Every step of the job is blocking (network download, file copy, Whisper
    inference, upload). Running it directly inside this coroutine would
    occupy the event loop for the whole job and stall all other requests,
    including the /status polling the UI depends on. The work is therefore
    handed to the loop's default thread-pool executor and merely awaited here.

    Args:
        job_id: Key of the job record in ``jobs``.
        dataset_link: Link understood by ``extract_dataset_info``.
        model_size: Whisper model name.
    """
    import asyncio

    loop = asyncio.get_running_loop()
    await loop.run_in_executor(
        None, _run_transcription_job, job_id, dataset_link, model_size
    )
178
+
179
@app.get("/", response_class=HTMLResponse)
async def serve_ui():
    """Serve the single-page transcription UI.

    The page posts the form to ``/transcribe``, then polls
    ``/status/<job_id>`` every two seconds until the job reports
    ``completed`` or ``failed``.

    Status messages are rendered with DOM text nodes, never ``innerHTML``:
    error strings and paths returned by the server can contain text the user
    typed (for example an unparsable dataset link) and must not be
    interpreted as markup.
    """
    return """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Movie Transcription Service</title>
        <style>
            * {
                margin: 0;
                padding: 0;
                box-sizing: border-box;
            }

            body {
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                min-height: 100vh;
                display: flex;
                align-items: center;
                justify-content: center;
                padding: 20px;
            }

            .container {
                background: white;
                border-radius: 12px;
                box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
                max-width: 600px;
                width: 100%;
                padding: 40px;
            }

            .header {
                text-align: center;
                margin-bottom: 30px;
            }

            .header h1 {
                color: #333;
                font-size: 28px;
                margin-bottom: 10px;
            }

            .header p {
                color: #666;
                font-size: 14px;
            }

            .form-group {
                margin-bottom: 20px;
            }

            label {
                display: block;
                margin-bottom: 8px;
                color: #333;
                font-weight: 500;
                font-size: 14px;
            }

            input, select {
                width: 100%;
                padding: 12px;
                border: 2px solid #e0e0e0;
                border-radius: 6px;
                font-size: 14px;
                transition: border-color 0.3s;
            }

            input:focus, select:focus {
                outline: none;
                border-color: #667eea;
            }

            button {
                width: 100%;
                padding: 12px;
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                color: white;
                border: none;
                border-radius: 6px;
                font-size: 16px;
                font-weight: 600;
                cursor: pointer;
                transition: transform 0.2s;
            }

            button:hover {
                transform: translateY(-2px);
            }

            button:disabled {
                opacity: 0.6;
                cursor: not-allowed;
                transform: none;
            }

            .status-section {
                margin-top: 30px;
                padding-top: 30px;
                border-top: 2px solid #f0f0f0;
            }

            .status-item {
                display: none;
                padding: 16px;
                border-radius: 6px;
                margin-bottom: 12px;
                font-size: 14px;
            }

            .status-item.active {
                display: block;
            }

            .status-item.info {
                background: #e3f2fd;
                color: #1976d2;
                border-left: 4px solid #1976d2;
            }

            .status-item.success {
                background: #e8f5e9;
                color: #388e3c;
                border-left: 4px solid #388e3c;
            }

            .status-item.error {
                background: #ffebee;
                color: #d32f2f;
                border-left: 4px solid #d32f2f;
            }

            .spinner {
                display: inline-block;
                width: 12px;
                height: 12px;
                border: 2px solid #ccc;
                border-top-color: #1976d2;
                border-radius: 50%;
                animation: spin 0.6s linear infinite;
                margin-right: 8px;
            }

            @keyframes spin {
                to { transform: rotate(360deg); }
            }

            .job-id {
                font-family: 'Courier New', monospace;
                font-size: 12px;
                color: #999;
                margin-top: 8px;
                word-break: break-all;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>🎬 Movie Transcription Service</h1>
                <p>Download, transcribe, and upload movie transcriptions with timestamps</p>
            </div>

            <form id="transcriptionForm">
                <div class="form-group">
                    <label for="datasetLink">Dataset Link or URL</label>
                    <input
                        type="text"
                        id="datasetLink"
                        placeholder="e.g., https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv"
                        required
                    >
                </div>

                <div class="form-group">
                    <label for="modelSize">Whisper Model Size</label>
                    <select id="modelSize">
                        <option value="tiny">Tiny (Fast)</option>
                        <option value="base">Base</option>
                        <option value="small" selected>Small (Recommended)</option>
                        <option value="medium">Medium</option>
                        <option value="large">Large (Slow but Accurate)</option>
                    </select>
                </div>

                <button type="submit" id="submitBtn">Start Transcription</button>
            </form>

            <div class="status-section" id="statusSection" style="display: none;">
                <div id="statusMessages"></div>
                <div class="job-id" id="jobId"></div>
            </div>
        </div>

        <script>
            const form = document.getElementById('transcriptionForm');
            const statusSection = document.getElementById('statusSection');
            const statusMessages = document.getElementById('statusMessages');
            const jobId = document.getElementById('jobId');
            const submitBtn = document.getElementById('submitBtn');

            form.addEventListener('submit', async (e) => {
                e.preventDefault();

                const datasetLink = document.getElementById('datasetLink').value;
                const modelSize = document.getElementById('modelSize').value;

                submitBtn.disabled = true;
                statusSection.style.display = 'block';
                statusMessages.innerHTML = '';

                try {
                    // Submit transcription request
                    const response = await fetch('/transcribe', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({
                            dataset_link: datasetLink,
                            model_size: modelSize
                        })
                    });

                    if (!response.ok) {
                        throw new Error(await response.text());
                    }

                    const data = await response.json();
                    const currentJobId = data.job_id;
                    jobId.textContent = `Job ID: ${currentJobId}`;

                    addStatus('info', 'Transcription started...', { replace: true, spinner: true });

                    // Poll for status updates
                    let completed = false;
                    while (!completed) {
                        await new Promise(resolve => setTimeout(resolve, 2000));

                        const statusResponse = await fetch(`/status/${currentJobId}`);
                        if (!statusResponse.ok) {
                            // e.g. 404 after a server restart wiped the in-memory jobs
                            throw new Error(`Status check failed (HTTP ${statusResponse.status})`);
                        }
                        const statusData = await statusResponse.json();

                        const status = statusData.status;

                        if (status === 'completed') {
                            addStatus('success', '✓ Transcription completed and uploaded!');
                            addStatus('info', `📁 File: ${statusData.transcript_path}`);
                            completed = true;
                        } else if (status === 'failed') {
                            addStatus('error', `✗ Error: ${statusData.error}`);
                            completed = true;
                        } else {
                            const statusText = status.charAt(0).toUpperCase() + status.slice(1).replace(/_/g, ' ');
                            addStatus('info', `${statusText}...`, { replace: true, spinner: true });
                        }
                    }
                } catch (error) {
                    addStatus('error', `✗ Error: ${error.message}`);
                } finally {
                    submitBtn.disabled = false;
                }
            });

            // Append a status line. `text` is always inserted as a text node so
            // server-provided strings can never be parsed as HTML.
            function addStatus(type, text, { replace = false, spinner = false } = {}) {
                if (replace) {
                    statusMessages.innerHTML = '';
                }
                const div = document.createElement('div');
                div.className = `status-item active ${type}`;
                if (spinner) {
                    const spin = document.createElement('span');
                    spin.className = 'spinner';
                    div.appendChild(spin);
                }
                div.appendChild(document.createTextNode(text));
                statusMessages.appendChild(div);
                statusMessages.parentElement.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
            }
        </script>
    </body>
    </html>
    """
459
+
460
@app.post("/transcribe")
async def start_transcription(request: TranscriptionRequest, background_tasks: BackgroundTasks):
    """Register a new transcription job and schedule it as a background task.

    Stores a "queued" record in ``jobs`` under a fresh UUID4 string and
    returns ``{"job_id": <id>}`` immediately; the client follows progress
    through the status endpoint.
    """
    from uuid import uuid4

    link, size = request.dataset_link, request.model_size

    job_id = str(uuid4())
    jobs[job_id] = dict(status="queued", dataset_link=link, model_size=size)

    # The task is queued on the response's BackgroundTasks, not run inline.
    background_tasks.add_task(process_transcription, job_id, link, size)

    return JSONResponse({"job_id": job_id})
480
+
481
@app.get("/status/{job_id}")
async def get_status(job_id: str):
    """Return the stored record of a transcription job as JSON.

    Raises:
        HTTPException: 404 with detail "Job not found" when ``job_id`` is
            not a key of ``jobs``.
    """
    try:
        job_record = jobs[job_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Job not found") from None

    return JSONResponse(job_record)
488
+
489
if __name__ == "__main__":
    # Script entry point: print a short banner, then serve the app with
    # uvicorn on port 7860, listening on all interfaces.
    for banner_line in (
        "Starting Movie Transcription Service...",
        "Open http://localhost:7860 in your browser",
    ):
        print(banner_line)
    uvicorn.run(app, host="0.0.0.0", port=7860)