Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| import os | |
| import tempfile | |
| import shutil | |
| from pathlib import Path | |
| from datetime import datetime | |
| from dotenv import load_dotenv | |
| from fastapi import FastAPI, HTTPException, BackgroundTasks | |
| from fastapi.responses import HTMLResponse, JSONResponse | |
| from fastapi.staticfiles import StaticFiles | |
| from pydantic import BaseModel | |
| import uvicorn | |
# Heavy third-party dependencies: fail fast with a readable message instead of
# a bare traceback when one of them is not installed.
try:
    from huggingface_hub import hf_hub_download, upload_file, list_repo_files
    import whisper
except ImportError as e:
    print(f"Missing dependency: {e}")
    # The bare `exit` name is injected by the `site` module for interactive
    # sessions and is not guaranteed to exist (e.g. `python -S`, frozen
    # builds). Raising SystemExit terminates with the same status code.
    raise SystemExit(1)
# Load environment variables (values from a local .env file are merged into
# the process environment before HF_TOKEN is read).
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("Error: HF_TOKEN not found in .env file")
    # SystemExit instead of the interactive-only `exit` helper, which is
    # provided by the `site` module and may be absent.
    raise SystemExit(1)

app = FastAPI(title="Movie Transcription Service")

# In-memory job tracking: maps job_id -> dict of job fields ("status",
# "dataset_link", "model_size", plus fields added while the job runs).
# This is a plain module-level dict, so its contents live only as long as
# the process does.
jobs = {}
class TranscriptionRequest(BaseModel):
    # Payload describing one transcription job to start.

    # Hugging Face dataset URL, or a bare "owner/repo/filename" path.
    dataset_link: str

    # Whisper model name passed to whisper.load_model(); "small" when omitted.
    model_size: str = "small"
def format_timestamp(seconds: float) -> str:
    """Render a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated rather than rounded, and the hour field
    is not wrapped at 24 (it simply grows past two digits when needed).
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    fields = (whole_hours, within_hour // 60, seconds % 60)
    return ":".join(f"{int(field):02d}" for field in fields)
def transcribe_with_timestamps(video_path: str, model_size: str) -> str:
    """Run Whisper on a media file and return a timestamped text transcript.

    The returned text starts with a banner and a "Generated:" line, followed
    by one ``[HH:MM:SS] text`` line per non-empty segment. When the Whisper
    result carries no "segments" key, the plain "text" value is appended
    instead.

    Args:
        video_path: Path of the media file handed to ``model.transcribe``.
        model_size: Model name handed to ``whisper.load_model``.

    Returns:
        The full transcript as a single newline-joined string.
    """
    print(f"Loading Whisper model: {model_size}")
    whisper_model = whisper.load_model(model_size)

    print(f"Transcribing audio from: {video_path}")
    result = whisper_model.transcribe(video_path)

    # Header block placed above the transcript body.
    banner = "=" * 80
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    lines = [
        banner,
        "MOVIE TRANSCRIPTION WITH TIMESTAMPS",
        banner,
        f"Generated: {generated_at}",
        "",
    ]

    if "segments" not in result:
        # No per-segment timing available: fall back to the plain text.
        lines.append(result.get("text", ""))
        return "\n".join(lines)

    # Pair each segment's start time with its stripped text, then keep only
    # the segments that actually contain text.
    stamped = (
        (format_timestamp(segment["start"]), segment["text"].strip())
        for segment in result["segments"]
    )
    lines.extend(f"[{stamp}] {spoken}" for stamp, spoken in stamped if spoken)
    return "\n".join(lines)
def extract_dataset_info(dataset_link: str) -> tuple:
    """Split a dataset link into ``(repo_id, filename)``.

    Accepted forms:

    * ``https://huggingface.co/datasets/<owner>/<repo>/blob/<rev>/<path>``
      (``resolve`` is accepted in place of ``blob``; ``<path>`` may contain
      sub-directories and percent-encoded characters)
    * ``https://huggingface.co/datasets/<owner>/<repo>/<file>``
    * ``<owner>/<repo>/<path>``

    Args:
        dataset_link: URL or path typed by the user; surrounding whitespace
            is ignored.

    Returns:
        A ``(repo_id, filename)`` tuple where ``repo_id`` is
        ``"<owner>/<repo>"`` and ``filename`` is the path inside the repo.

    Raises:
        ValueError: If the link is empty, contains a rejected marker, or
            cannot be split into owner, repo and a non-empty filename.
    """
    # Function-scope import: only the URL branch needs it.
    from urllib.parse import unquote

    link = dataset_link.strip()

    # Validate input
    if not link:
        raise ValueError("Dataset link cannot be empty")
    # Presumably guards against pasted credentials / env-file lines. The match
    # is a case-sensitive substring test, so a file name that contains
    # "API" or "TOKEN" in upper case is rejected as well.
    rejected_markers = ("=", "\n", "\r", "DASHSCOPE", "API", "TOKEN")
    if any(marker in link for marker in rejected_markers):
        raise ValueError(
            "Invalid dataset link format. Please provide a valid Hugging Face dataset URL or path.\n"
            "Examples:\n"
            "  https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv\n"
            "  factorstudios/movs/movie.mkv"
        )

    parts = link.split("/")

    if "huggingface.co" in link:
        if "datasets" in parts:
            idx = parts.index("datasets")
            owner_repo = parts[idx + 1:idx + 3]
            if len(owner_repo) != 2 or not all(owner_repo):
                raise ValueError(
                    "Invalid Hugging Face dataset URL format: "
                    "missing owner or repository name"
                )

            # Only segments AFTER the repo name can belong to the file path.
            # Looking at the whole URL would mistake the repo name for a
            # file when nothing follows it, or trip over an owner/repo that
            # happens to be called "blob".
            tail = parts[idx + 3:]
            if tail and tail[0] in ("blob", "resolve"):
                # tail == [viewer, revision, *path_segments]
                filename = "/".join(tail[2:])
            else:
                filename = tail[-1] if tail else ""

            # URLs copied from a browser percent-encode spaces and other
            # characters; the Hub API expects the decoded repo path.
            filename = unquote(filename)
            if not filename:
                raise ValueError(
                    "Invalid Hugging Face dataset URL format: No filename found in URL"
                )
            return "/".join(owner_repo), filename
    else:
        # Assume it's in format: owner/repo/filename
        if len(parts) >= 3 and parts[0] and parts[1]:
            filename = "/".join(parts[2:])
            if not filename:
                raise ValueError("No filename found in path")
            return f"{parts[0]}/{parts[1]}", filename

    raise ValueError(
        "Cannot parse dataset link. Please use:\n"
        "  https://huggingface.co/datasets/owner/repo/blob/main/file.mkv\n"
        "  or: owner/repo/file.mkv"
    )
def _run_transcription_job(job_id: str, dataset_link: str, model_size: str) -> None:
    """Blocking pipeline: parse link, download, transcribe, upload.

    Progress is recorded in ``jobs[job_id]["status"]`` as the job moves
    through "extracting_info", "downloading", "transcribing", "uploading"
    and finally "completed". Any exception is caught, stored under
    ``"error"`` and the status is set to "failed".
    """
    job = jobs[job_id]
    try:
        job["status"] = "extracting_info"

        # Parse and validate dataset link
        try:
            repo_id, filename = extract_dataset_info(dataset_link)
        except ValueError as e:
            # Chain the cause so the original parsing error stays visible.
            raise ValueError(f"Invalid dataset link: {e}") from e

        job["repo_id"] = repo_id
        job["filename"] = filename

        # Scratch directory for the video copy and the transcript file.
        temp_dir = tempfile.mkdtemp()
        try:
            job["status"] = "downloading"
            print(f"Downloading {filename} from {repo_id}...")

            # Download video
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                repo_type="dataset",
                token=HF_TOKEN,
            )

            # Resolve symlink if needed
            if os.path.islink(local_path):
                local_path = os.path.realpath(local_path)

            # Copy to temp location
            base_name = os.path.basename(filename)
            video_path = os.path.join(temp_dir, base_name)
            shutil.copy2(local_path, video_path)

            job["status"] = "transcribing"
            print("Starting transcription...")

            # Transcribe with timestamps
            transcript = transcribe_with_timestamps(video_path, model_size)

            # Prepare transcript file
            transcript_filename = os.path.splitext(base_name)[0] + ".transcript.txt"
            transcript_path = os.path.join(temp_dir, transcript_filename)
            with open(transcript_path, "w", encoding="utf-8") as f:
                f.write(transcript)

            job["status"] = "uploading"
            print("Uploading transcript to dataset...")

            # Upload transcript to transcriptions folder
            repo_upload_path = f"transcriptions/{transcript_filename}"
            upload_file(
                path_or_fileobj=transcript_path,
                path_in_repo=repo_upload_path,
                repo_id=repo_id,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"Add transcription for {base_name}",
            )

            job["status"] = "completed"
            job["transcript_path"] = repo_upload_path
            print(f"✓ Transcription completed and uploaded to {repo_upload_path}")
        finally:
            # Cleanup temp directory (best effort; errors are ignored).
            shutil.rmtree(temp_dir, ignore_errors=True)
    except Exception as e:
        # Top-level boundary of the job: report the failure through the job
        # record instead of letting it escape the background task.
        job["status"] = "failed"
        job["error"] = str(e)
        print(f"✗ Error: {e}")


async def process_transcription(job_id: str, dataset_link: str, model_size: str):
    """Background task to process transcription and upload.

    The download, Whisper inference and upload are all blocking calls.
    Because this coroutine is awaited on the server's event loop, running
    them inline would freeze every other request (including status polling)
    until the job finished, so the work is delegated to a worker thread.

    Args:
        job_id: Key of the job record in ``jobs``; the record must exist.
        dataset_link: Dataset URL or ``owner/repo/filename`` path.
        model_size: Whisper model name.
    """
    import asyncio

    loop = asyncio.get_running_loop()
    await loop.run_in_executor(
        None, _run_transcription_job, job_id, dataset_link, model_size
    )
@app.get("/", response_class=HTMLResponse)
async def serve_ui():
    """Serve the transcription UI.

    Returns the single-page HTML/JS front end as a string. The page posts
    the form to ``/transcribe`` and then polls ``/status/<job_id>`` every
    two seconds until the job reports "completed" or "failed".

    Server-provided text (error messages, transcript path) is HTML-escaped
    in the page script before being inserted with ``innerHTML``, because
    those values can contain text derived from the submitted link.
    """
    return """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Movie Transcription Service</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
        }
        .container {
            background: white;
            border-radius: 12px;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
            max-width: 600px;
            width: 100%;
            padding: 40px;
        }
        .header {
            text-align: center;
            margin-bottom: 30px;
        }
        .header h1 {
            color: #333;
            font-size: 28px;
            margin-bottom: 10px;
        }
        .header p {
            color: #666;
            font-size: 14px;
        }
        .form-group {
            margin-bottom: 20px;
        }
        label {
            display: block;
            margin-bottom: 8px;
            color: #333;
            font-weight: 500;
            font-size: 14px;
        }
        input, select {
            width: 100%;
            padding: 12px;
            border: 2px solid #e0e0e0;
            border-radius: 6px;
            font-size: 14px;
            transition: border-color 0.3s;
        }
        input:focus, select:focus {
            outline: none;
            border-color: #667eea;
        }
        button {
            width: 100%;
            padding: 12px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            border-radius: 6px;
            font-size: 16px;
            font-weight: 600;
            cursor: pointer;
            transition: transform 0.2s;
        }
        button:hover {
            transform: translateY(-2px);
        }
        button:disabled {
            opacity: 0.6;
            cursor: not-allowed;
            transform: none;
        }
        .status-section {
            margin-top: 30px;
            padding-top: 30px;
            border-top: 2px solid #f0f0f0;
        }
        .status-item {
            display: none;
            padding: 16px;
            border-radius: 6px;
            margin-bottom: 12px;
            font-size: 14px;
        }
        .status-item.active {
            display: block;
        }
        .status-item.info {
            background: #e3f2fd;
            color: #1976d2;
            border-left: 4px solid #1976d2;
        }
        .status-item.success {
            background: #e8f5e9;
            color: #388e3c;
            border-left: 4px solid #388e3c;
        }
        .status-item.error {
            background: #ffebee;
            color: #d32f2f;
            border-left: 4px solid #d32f2f;
        }
        .spinner {
            display: inline-block;
            width: 12px;
            height: 12px;
            border: 2px solid #ccc;
            border-top-color: #1976d2;
            border-radius: 50%;
            animation: spin 0.6s linear infinite;
            margin-right: 8px;
        }
        @keyframes spin {
            to { transform: rotate(360deg); }
        }
        .job-id {
            font-family: 'Courier New', monospace;
            font-size: 12px;
            color: #999;
            margin-top: 8px;
            word-break: break-all;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>🎬 Movie Transcription Service</h1>
            <p>Download, transcribe, and upload movie transcriptions with timestamps</p>
        </div>
        <form id="transcriptionForm">
            <div class="form-group">
                <label for="datasetLink">Dataset Link or URL</label>
                <input
                    type="text"
                    id="datasetLink"
                    placeholder="https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv"
                    title="Enter a Hugging Face dataset URL or path (owner/repo/filename.mkv)"
                    required
                >
                <small style="display: block; margin-top: 6px; color: #999; font-size: 12px;">
                    Format: https://huggingface.co/datasets/owner/repo/blob/main/filename.mkv<br>
                    or: owner/repo/filename.mkv
                </small>
            </div>
            <div class="form-group">
                <label for="modelSize">Whisper Model Size</label>
                <select id="modelSize">
                    <option value="tiny">Tiny (Fast)</option>
                    <option value="base">Base</option>
                    <option value="small" selected>Small (Recommended)</option>
                    <option value="medium">Medium</option>
                    <option value="large">Large (Slow but Accurate)</option>
                </select>
            </div>
            <button type="submit" id="submitBtn">Start Transcription</button>
        </form>
        <div class="status-section" id="statusSection" style="display: none;">
            <div id="statusMessages"></div>
            <div class="job-id" id="jobId"></div>
        </div>
    </div>
    <script>
        const form = document.getElementById('transcriptionForm');
        const statusSection = document.getElementById('statusSection');
        const statusMessages = document.getElementById('statusMessages');
        const jobId = document.getElementById('jobId');
        const submitBtn = document.getElementById('submitBtn');

        form.addEventListener('submit', async (e) => {
            e.preventDefault();
            const datasetLink = document.getElementById('datasetLink').value;
            const modelSize = document.getElementById('modelSize').value;
            submitBtn.disabled = true;
            statusSection.style.display = 'block';
            statusMessages.innerHTML = '';
            try {
                // Submit transcription request
                const response = await fetch('/transcribe', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        dataset_link: datasetLink,
                        model_size: modelSize
                    })
                });
                if (!response.ok) {
                    throw new Error(await response.text());
                }
                const data = await response.json();
                const currentJobId = data.job_id;
                jobId.textContent = `Job ID: ${currentJobId}`;
                addStatus('info', '<span class="spinner"></span>Transcription started...', true);

                // Poll for status updates
                let completed = false;
                while (!completed) {
                    await new Promise(resolve => setTimeout(resolve, 2000));
                    const statusResponse = await fetch(`/status/${currentJobId}`);
                    if (!statusResponse.ok) {
                        // e.g. the job record is gone; stop polling with a clear message
                        throw new Error(`Status check failed (HTTP ${statusResponse.status})`);
                    }
                    const statusData = await statusResponse.json();
                    const status = statusData.status;
                    if (status === 'completed') {
                        addStatus('success', '✓ Transcription completed and uploaded!');
                        addStatus('info', `📁 File: ${escapeHtml(statusData.transcript_path)}`);
                        completed = true;
                    } else if (status === 'failed') {
                        addStatus('error', `✗ Error: ${escapeHtml(statusData.error)}`);
                        completed = true;
                    } else {
                        const statusText = status.charAt(0).toUpperCase() + status.slice(1).replace(/_/g, ' ');
                        addStatus('info', `<span class="spinner"></span>${escapeHtml(statusText)}...`, true);
                    }
                }
            } catch (error) {
                addStatus('error', `✗ Error: ${escapeHtml(error.message)}`);
            } finally {
                submitBtn.disabled = false;
            }
        });

        // Escape dynamic text before it is placed into innerHTML.
        function escapeHtml(value) {
            const holder = document.createElement('div');
            holder.textContent = String(value);
            return holder.innerHTML;
        }

        function addStatus(type, message, replace = false) {
            if (replace) {
                statusMessages.innerHTML = '';
            }
            const div = document.createElement('div');
            div.className = `status-item active ${type}`;
            div.innerHTML = message;
            statusMessages.appendChild(div);
            statusMessages.parentElement.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
        }
    </script>
</body>
</html>
"""
@app.post("/transcribe")
async def start_transcription(request: TranscriptionRequest, background_tasks: BackgroundTasks):
    """Start a transcription job.

    Registered as ``POST /transcribe``, the endpoint the bundled UI submits
    its form to. Creates a job record with status "queued", schedules
    ``process_transcription`` as a background task and returns immediately.

    Args:
        request: Parsed request body (dataset link and Whisper model size).
        background_tasks: Task collector supplied by FastAPI; the scheduled
            task runs after the response has been sent.

    Returns:
        A JSON response of the form ``{"job_id": "<uuid4>"}``.
    """
    import uuid

    job_id = str(uuid.uuid4())
    jobs[job_id] = {
        "status": "queued",
        "dataset_link": request.dataset_link,
        "model_size": request.model_size,
    }
    background_tasks.add_task(
        process_transcription,
        job_id,
        request.dataset_link,
        request.model_size,
    )
    return JSONResponse({"job_id": job_id})
@app.get("/status/{job_id}")
async def get_status(job_id: str):
    """Get the status of a transcription job.

    Registered as ``GET /status/{job_id}``, the endpoint the bundled UI
    polls while a job is running.

    Args:
        job_id: Identifier returned when the job was started.

    Returns:
        A JSON response containing the job's record from ``jobs``.

    Raises:
        HTTPException: 404 when no job with this id is known.
    """
    job = jobs.get(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return JSONResponse(job)
if __name__ == "__main__":
    # Script entry point: print the startup hints, then hand control to
    # uvicorn, which serves `app` on all interfaces at port 7860.
    for startup_line in (
        "Starting Movie Transcription Service...",
        "Open http://localhost:7860 in your browser",
    ):
        print(startup_line)
    uvicorn.run(app, host="0.0.0.0", port=7860)