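# NOTE: this file appears to have been captured from a hosted "Spaces" page
# whose status banner read "Paused"; the banner is not part of the program.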
import asyncio
import os
import re
import time
import uuid
from typing import Dict, List, Optional
from urllib.parse import urlsplit

import httpx
from fastapi import FastAPI, BackgroundTasks, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
# ------------------------------------------
# CONFIGURATION
# ------------------------------------------
def _fetch_cloud_name():
    """Fetch the ``cloud_name`` value from the media proxy's config endpoint.

    Makes up to three HTTPS GET requests to ``https://media.toolxp.org/config``
    and returns the first non-empty ``cloud_name`` found in the JSON reply.

    Raises:
        RuntimeError: if no attempt yields a non-empty ``cloud_name``.
    """
    # Imports are kept local so the helper does not add module-level names.
    import json
    import ssl
    import urllib.request

    ssl_context = ssl.create_default_context()
    request = urllib.request.Request(
        "https://media.toolxp.org/config",
        headers={"User-Agent": "Mozilla/5.0"},
    )

    attempt = 0
    while attempt < 3:
        attempt += 1
        try:
            with urllib.request.urlopen(request, timeout=10, context=ssl_context) as response:
                payload = json.loads(response.read().decode())
            cloud_name = payload["cloud_name"]
            if cloud_name:
                print(f"[config] cloud_name={cloud_name}")
                return cloud_name
            # An empty value falls through to the next attempt without a log line.
        except Exception as error:
            # Best-effort: any failure (network, TLS, JSON, missing key) just
            # consumes one attempt.
            print(f"[config] attempt {attempt} failed: {error}")

    raise RuntimeError("[config] FATAL: could not fetch cloud_name after 3 attempts")
# Resolved once at import time; a failure here aborts start-up (RuntimeError).
# NOTE(review): CLOUD_NAME is not referenced anywhere else in the code visible
# here — confirm whether it is still needed.
CLOUD_NAME = _fetch_cloud_name()

# Media proxy hides Cloudinary origin from end-users.
# Route: media.toolxp.org → res.cloudinary.com/doxoms9hd (via Cloudflare Worker)
CLOUDINARY_BASE = "https://media.toolxp.org/video/upload"

# ------------------------------------------
# IN-MEMORY JOB STORE
# ------------------------------------------
# Maps job_id -> job record (status / progress / result / error / created_at).
# Entries are added by submit_job and never removed in the code visible here,
# so the store grows for the lifetime of the process.
JOBS: Dict[str, dict] = {}

# ------------------------------------------
# APP SETUP
# ------------------------------------------
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    # Credentials must not be combined with wildcard origins (see the FastAPI
    # CORS documentation). The API uses no cookies or auth, so they stay off.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
class VideoRequest(BaseModel):
    """Request body accepted when a processing job is submitted."""

    # URL of the source clip; parse_cloudinary_url reads the video id and the
    # optional so_X / du_Y (start offset / duration) components from it.
    video_url: str
# ------------------------------------------
# URL PARSING HELPERS
# ------------------------------------------
def parse_cloudinary_url(url: str) -> dict:
    """
    Parse a Cloudinary URL to extract video_id, start_time, and duration.

    Expected format: https://res.cloudinary.com/.../so_55,du_30/.../video_id.mp4
    or: https://res.cloudinary.com/.../so_55,du_30/fl_getinfo/video_id.jpg

    Only the path component is inspected, so a query string or fragment
    (e.g. an SDK-appended ``?_a=...``) does not defeat the extension match.

    Returns:
        dict with keys ``video_id`` (str, or None when the URL does not end in
        a supported extension), ``start_time`` (0 when ``so_`` is absent),
        ``duration`` (30 when ``du_`` is absent) and ``end_time``
        (``start_time + duration``).
    """
    path = urlsplit(url.strip()).path

    # Video ID = last path segment without its extension (case-insensitive).
    video_id_match = re.search(r'/([^/]+)\.(?:mp4|jpg|webm|mov)$', path, re.IGNORECASE)
    video_id = video_id_match.group(1) if video_id_match else None

    # so_X / du_X must start a transformation component (after "/" or ","),
    # otherwise a public id such as "also_12" would be read as so_12.
    start_match = re.search(r'(?:^|[/,])so_(\d+(?:\.\d+)?)', path)
    start_time = float(start_match.group(1)) if start_match else 0

    duration_match = re.search(r'(?:^|[/,])du_(\d+(?:\.\d+)?)', path)
    duration = float(duration_match.group(1)) if duration_match else 30

    return {
        "video_id": video_id,
        "start_time": start_time,
        "duration": duration,
        "end_time": start_time + duration,
    }
def get_face_info_url(video_id: str, time_sec: float) -> str:
    """
    Build the URL used to request face information for one frame.

    The transformation seeks to ``time_sec``, renders a face-centred JPG
    thumbnail (450px wide) and adds ``fl_getinfo``; fetch_face_data expects
    the response to be JSON containing ``landmarks`` and ``input``.
    """
    transformation = f"so_{time_sec},f_jpg/c_thumb,g_face,w_450/fl_getinfo"
    return f"{CLOUDINARY_BASE}/{transformation}/{video_id}.jpg"
async def fetch_face_data(client: httpx.AsyncClient, video_id: str, time_sec: float) -> dict:
    """
    Fetch face detection data for a specific timestamp.

    Args:
        client: shared HTTP client used for the GET request.
        video_id: id of the video to inspect.
        time_sec: frame timestamp in seconds.

    Returns:
        dict with ``time``, ``face_count``, ``landmarks`` (always a list) and
        the source dimensions ``source_w`` / ``source_h``. On any failure
        (non-200 status, network error, malformed JSON) a zero-face record
        with 1920x1080 default dimensions is returned instead of raising.
    """
    no_faces = {"time": time_sec, "face_count": 0, "landmarks": [], "source_w": 1920, "source_h": 1080}
    url = get_face_info_url(video_id, time_sec)
    try:
        response = await client.get(url, timeout=10.0)
        if response.status_code != 200:
            # Previously silent: a failing endpoint looked like "no faces".
            print(f"Face data request at {time_sec}s returned HTTP {response.status_code}")
            return no_faces
        data = response.json()
        # `or` also covers keys that are present but null.
        landmarks = data.get("landmarks") or [[]]
        input_info = data.get("input") or {}
        # landmarks[0] is the array of face objects; normalise null/empty to []
        # so callers can always iterate it.
        faces = landmarks[0] or []
        return {
            "time": time_sec,
            "face_count": len(faces),
            "landmarks": faces,
            "source_w": input_info.get("width", 1920),
            "source_h": input_info.get("height", 1080),
        }
    except Exception as e:
        # Best-effort per frame: one bad frame must not fail the whole job.
        print(f"Error fetching face data at {time_sec}s: {e}")
    return no_faces
def find_multi_face_segments(frame_data: List[dict]) -> List[dict]:
    """
    Group consecutive frames that show 2+ real faces into segments.

    Ghost-face filtering is applied per frame BEFORE the face count is
    evaluated, so a spurious detection cannot open a split-screen segment.

    Each returned segment has ``start``/``end`` times plus ``top_face`` and
    ``bottom_face`` crop boxes from compute_face_crops. A segment ends at the
    time of the first frame with fewer than two real faces, or at the last
    frame's time if it is still open when the data runs out.
    """
    # Source dimensions come from the first frame record (defaults if absent).
    first_frame = frame_data[0] if frame_data else {}
    source_w = first_frame.get("source_w", 1920)
    source_h = first_frame.get("source_h", 1080)

    def _make_segment(start, end, landmarks_per_frame):
        # Crops are computed once per segment from all collected frames.
        top_crop, bottom_crop = compute_face_crops(landmarks_per_frame, source_w, source_h)
        return {
            "start": start,
            "end": end,
            "top_face": top_crop,
            "bottom_face": bottom_crop,
        }

    segments = []
    active = False
    active_start = None
    active_landmarks = []  # raw per-frame landmark lists of the open segment

    for frame in frame_data:
        # Face centres for this frame, then ghost filtering, then the count.
        centers = [
            center
            for center in (_extract_face_center(face) for face in frame.get("landmarks", []))
            if center
        ]
        has_two_real_faces = len(_filter_ghost_faces(centers)) >= 2

        if has_two_real_faces:
            if not active:
                active = True
                active_start = frame["time"]
                active_landmarks = []
            active_landmarks.append(frame["landmarks"])
        elif active:
            active = False
            segments.append(_make_segment(active_start, frame["time"], active_landmarks))

    # A segment still open here ends at the last analysed frame.
    if active and active_start is not None:
        segments.append(_make_segment(active_start, frame_data[-1]["time"], active_landmarks))

    return segments
| def _extract_face_center(face: dict) -> dict: | |
| """ | |
| Extract the geometric center (cx, cy_eyes) of a face from Cloudinary landmarks. | |
| Also computes 'span' — the diagonal of the landmark bounding box — used to | |
| detect and reject ghost/fake face detections. | |
| cy_eyes = eye-level Y, which is the most reliable vertical anchor. | |
| Works for both frontal and profile views. | |
| """ | |
| pts = [v for v in face.values() if isinstance(v, dict) and 'x' in v and 'y' in v] | |
| if not pts: | |
| return None | |
| xs = [p['x'] for p in pts] | |
| ys = [p['y'] for p in pts] | |
| cx = sum(xs) / len(xs) | |
| # Use the topmost Y coordinate as the eye-level reference | |
| # (eyes are always the highest landmarks returned) | |
| cy_eyes = min(ys) | |
| # Landmark bounding box diagonal — measures "face size on screen" | |
| # Real faces: 80-300px diagonal. Ghost faces (hands, objects): 10-40px. | |
| span_x = max(xs) - min(xs) | |
| span_y = max(ys) - min(ys) | |
| span = (span_x ** 2 + span_y ** 2) ** 0.5 | |
| return {'cx': cx, 'cy_eyes': cy_eyes, 'span': span} | |
| def _filter_ghost_faces(processed_faces: list) -> list: | |
| """ | |
| Filter out ghost/fake face detections from a single frame. | |
| Ghost faces are typically: | |
| - Hands, fingers, or objects misidentified as faces | |
| - Very small landmark span compared to real faces in the same frame | |
| - Landmark span < 40% of the largest face → rejected | |
| - Absolute minimum span of 30px (any face smaller than this is too tiny to be real) | |
| """ | |
| if len(processed_faces) < 2: | |
| return processed_faces | |
| # Find the largest face in this frame | |
| max_span = max(f['span'] for f in processed_faces) | |
| # Reject faces whose span is less than 40% of the largest face | |
| # Also reject faces with absolute span < 30px (too small to be a real face) | |
| MIN_RELATIVE_SPAN = 0.40 | |
| MIN_ABSOLUTE_SPAN = 30.0 | |
| filtered = [ | |
| f for f in processed_faces | |
| if f['span'] >= max_span * MIN_RELATIVE_SPAN and f['span'] >= MIN_ABSOLUTE_SPAN | |
| ] | |
| return filtered if filtered else processed_faces[:1] # Always keep at least the biggest face | |
def compute_face_crops(segment_faces_data: List[List[dict]], source_w: int, source_h: int) -> tuple[dict, dict]:
    """
    Compute the two crop boxes (left speaker, right speaker) for one segment.

    Crop SIZE is derived from the source dimensions; landmarks are used only
    to POSITION the crops.

    Steps:
      1. Per frame: extract face centres, drop ghost faces, sort left-to-right;
         frames with 2+ faces contribute their leftmost and rightmost face.
      2. Average each speaker's centre over the segment (fallback positions at
         25% / 75% of the width and 40% of the height when nothing was collected).
      3. Crop width starts at 50% of the source width.
      4. Anti-overlap: shrink to 92% of the gap between the faces when that is
         narrower, but only if the result stays above 200.
      5. Height follows from a 9:8 aspect ratio (1080x960), capped at the
         source height.
      6. Each crop is centred horizontally on its face with the eye line 35%
         from the top, then clamped to the source bounds.

    Returns:
        ``(left_crop, right_crop)``, each ``{"x", "y", "w", "h"}`` in ints.
    """
    target_aspect = 1080 / 960  # 9:8 = 1.125
    base_crop_ratio = 0.50
    eye_line_ratio = 0.35

    # --- Collect the leftmost / rightmost real face of every frame ---
    left_centers = []
    right_centers = []
    for frame_faces in segment_faces_data:
        centers = [c for c in map(_extract_face_center, frame_faces) if c]
        ordered = sorted(_filter_ghost_faces(centers), key=lambda f: f['cx'])
        if len(ordered) < 2:
            continue
        left_centers.append(ordered[0])
        right_centers.append(ordered[-1])

    # --- Temporal smoothing: mean position per speaker ---
    def mean_position(centers, default_xy):
        if not centers:
            return default_xy
        count = len(centers)
        mean_x = sum(c['cx'] for c in centers) / count
        mean_y = sum(c['cy_eyes'] for c in centers) / count
        return mean_x, mean_y

    left_cx, left_cy = mean_position(left_centers, (source_w * 0.25, source_h * 0.40))
    right_cx, right_cy = mean_position(right_centers, (source_w * 0.75, source_h * 0.40))

    # --- Crop width: base size, reduced when the two faces sit close together ---
    crop_w = int(source_w * base_crop_ratio)
    overlap_limit = int(abs(right_cx - left_cx) * 0.92)  # leaves an 8% gap
    if 200 < overlap_limit < crop_w:
        crop_w = overlap_limit

    # --- Crop height from the 9:8 ratio, capped at the source height ---
    crop_h = int(crop_w / target_aspect)
    if crop_h > source_h:
        crop_h = source_h
        crop_w = int(crop_h * target_aspect)

    # --- Position and clamp each crop ---
    def place(face_cx, face_cy_eyes):
        left_edge = int(face_cx - crop_w / 2)
        top_edge = int(face_cy_eyes - crop_h * eye_line_ratio)
        left_edge = max(0, min(left_edge, source_w - crop_w))
        top_edge = max(0, min(top_edge, source_h - crop_h))
        return {"x": left_edge, "y": top_edge, "w": crop_w, "h": crop_h}

    return place(left_cx, left_cy), place(right_cx, right_cy)
def build_final_url(video_id: str, start_time: float, end_time: float, multi_face_segments: List[dict]) -> str:
    """
    Build the final Cloudinary URL with layers for multi-face segments.

    Base: the ``start_time``..``end_time`` range as a 1080x1920 fill with
    ``g_auto:face``. For every segment of at least 1 second, two video
    overlays of the same ``video_id`` are added: the ``top_face`` crop at
    ``g_north`` and the ``bottom_face`` crop at ``g_south``, each scaled to
    1080x960 and applied over the segment's offsets in the OUTPUT timeline.

    Args:
        video_id: id used for both the base video and the overlays.
        start_time: clip start in source seconds.
        end_time: clip end in source seconds.
        multi_face_segments: segments as returned by find_multi_face_segments.

    Returns:
        The complete ``.mp4`` delivery URL.
    """
    default_box = {"x": 0, "y": 0, "w": 300, "h": 300}

    # Base transformation: 9:16 vertical with face tracking fallback
    base = f"so_{start_time},eo_{end_time}/w_1080,h_1920,c_fill,g_auto:face"

    layers = []
    for segment in multi_face_segments:
        seg_start = round(segment["start"], 2)
        seg_end = round(segment["end"], 2)
        seg_duration = round(seg_end - seg_start, 2)

        # Skip segments shorter than 1 second
        if seg_duration < 1:
            continue

        # Offsets in the OUTPUT video timeline (relative to the clip start)
        layer_start_offset = round(seg_start - start_time, 2)
        layer_end_offset = round(seg_end - start_time, 2)

        # Top = left speaker, bottom = right speaker; same template for both:
        #   1. c_crop extracts the pre-computed face box
        #   2. c_fill scales that box to 1080x960
        for face_key, gravity in (("top_face", "g_north"), ("bottom_face", "g_south")):
            box = segment.get(face_key, default_box)
            layers.append(
                f"l_video:{video_id},"
                f"so_{seg_start},eo_{seg_end},du_{seg_duration},ac_none/"
                f"c_crop,w_{box['w']},h_{box['h']},x_{box['x']},y_{box['y']}/"
                f"c_fill,w_1080,h_960/"
                f"fl_layer_apply,{gravity},so_{layer_start_offset},eo_{layer_end_offset}"
            )

    transformations = "/".join([base, *layers])
    return f"{CLOUDINARY_BASE}/{transformations}/{video_id}.mp4"
# ------------------------------------------
# BACKGROUND WORKER
# ------------------------------------------
def process_video_sync(job_id: str, video_url: str):
    """
    Blocking entry point for the job worker.

    Runs process_video_async to completion on a fresh event loop created by
    ``asyncio.run``; submit_job schedules this function as a background task.
    """
    job_coroutine = process_video_async(job_id, video_url)
    asyncio.run(job_coroutine)
async def process_video_async(job_id: str, video_url: str):
    """
    Main video processing logic:
    1. Parse URL to get video_id and time range
    2. Fetch face data for each frame (500ms intervals)
    3. Find multi-face segments
    4. Build final URL with layers

    Progress and outcome are written into ``JOBS[job_id]``. Any exception is
    caught and recorded on the job (status ``failed``) rather than propagated.
    The ``result`` / ``error`` payload is stored BEFORE ``status`` flips, so a
    concurrent poller never sees a finished job without its payload.

    Raises:
        KeyError: if ``job_id`` is not present in ``JOBS``.
    """
    print(f"[{job_id}] Starting job: {video_url}")
    job = JOBS[job_id]
    job["status"] = "processing"
    job["progress"] = "Parsing video URL..."
    try:
        # 1. Parse URL
        parsed = parse_cloudinary_url(video_url)
        video_id = parsed["video_id"]
        start_time = parsed["start_time"]
        end_time = parsed["end_time"]
        duration = parsed["duration"]
        if not video_id:
            raise ValueError("Could not extract video ID from URL")
        job["progress"] = f"Analyzing {duration}s of video..."
        print(f"[{job_id}] Video: {video_id}, Range: {start_time}s - {end_time}s")

        # 2. Fetch face data for each frame (500ms intervals)
        # NOTE(review): the frame count is unbounded — a very large du_ value
        # in the submitted URL yields a very large number of requests.
        frame_times = []
        t = start_time
        while t <= end_time:
            frame_times.append(round(t, 1))
            t += 0.5
        total_frames = len(frame_times)
        job["progress"] = f"Fetching face data for {total_frames} frames..."

        frame_data = []
        async with httpx.AsyncClient() as client:
            # Process in batches of 10 to avoid overwhelming the API
            batch_size = 10
            for i in range(0, total_frames, batch_size):
                batch = frame_times[i:i + batch_size]
                tasks = [fetch_face_data(client, video_id, ts) for ts in batch]
                # gather preserves input order, so frame_data stays chronological
                frame_data.extend(await asyncio.gather(*tasks))
                progress_pct = min(100, int((i + batch_size) / total_frames * 100))
                job["progress"] = f"Analyzing frames... {progress_pct}%"

        # 3. Find multi-face segments
        job["progress"] = "Detecting multi-face segments..."
        multi_face_segments = find_multi_face_segments(frame_data)
        print(f"[{job_id}] Found {len(multi_face_segments)} multi-face segments")

        # 4. Build final URL
        job["progress"] = "Building final video URL..."
        final_url = build_final_url(video_id, start_time, end_time, multi_face_segments)

        # 5. Complete — payload first, status last (see docstring)
        job["result"] = {
            "video_url": final_url,
            "video_id": video_id,
            "start_time": start_time,
            "end_time": end_time,
            "multi_face_segments": multi_face_segments,
            "total_frames_analyzed": total_frames,
        }
        job["progress"] = "Done"
        job["status"] = "completed"
        print(f"[{job_id}] Completed: {final_url}")
    except Exception as e:
        # Worker boundary: record the failure on the job instead of raising.
        print(f"[{job_id}] FAILED: {str(e)}")
        job["error"] = str(e)
        job["progress"] = "Failed"
        job["status"] = "failed"
# ------------------------------------------
# API ENDPOINTS
# ------------------------------------------
# Path matches the embedded client, which POSTs JSON to `${API_BASE}/jobs`.
@app.post("/jobs")
def submit_job(req: VideoRequest, background_tasks: BackgroundTasks):
    """
    Create a job record and schedule its processing in the background.

    Args:
        req: request body carrying ``video_url``.
        background_tasks: FastAPI task queue; process_video_sync is added to it.

    Returns:
        ``{"job_id": <uuid4 string>, "status": "queued"}``.
    """
    job_id = str(uuid.uuid4())
    JOBS[job_id] = {
        "status": "queued",
        "progress": "Waiting in queue...",
        "result": None,
        "error": None,
        "created_at": time.time(),
    }
    background_tasks.add_task(process_video_sync, job_id, req.video_url)
    return {"job_id": job_id, "status": "queued"}
# Path matches the embedded client, which polls `${API_BASE}/jobs/${jobId}`.
@app.get("/jobs/{job_id}")
def get_job_status(job_id: str):
    """
    Return the stored record for ``job_id``.

    Returns:
        The job dict (status, progress, result, error, created_at).

    Raises:
        HTTPException: 404 when no job with this id exists.
    """
    job = JOBS.get(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return job
def home():
    """Return a small static payload indicating that the API is running."""
    # NOTE(review): no route decorator is visible for this handler and its
    # path cannot be determined from this file — confirm how it is registered.
    status_payload = {"message": "Magic Cut API is Running", "version": "1.0"}
    return status_payload
def serve_client():
    """
    Serve the embedded HTML client.

    Returns the complete single-page client (markup, CSS and JS) as a string.
    The page submits jobs with POST ``/jobs`` and polls GET ``/jobs/<id>``
    every 2 seconds until the job is ``completed`` or ``failed``.

    The result view inserts the generated video URL through DOM properties
    (``textContent`` / ``src`` / a click listener) rather than interpolating it
    into markup, so quotes or angle brackets in the URL cannot break the page.
    """
    # NOTE(review): no route decorator is visible for this handler; it returns
    # a plain str, so it presumably needs response_class=HTMLResponse (imported
    # at the top of the file) to be delivered as HTML — confirm.
    # The markup is kept flush-left exactly as it appears in the source.
    html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Magic Cut - Video Face Splitter</title>
<style>
:root {
--primary: #a855f7;
--bg: #0f0f1a;
--surface: #1a1a2e;
--text: #f3f4f6;
}
body {
font-family: 'Inter', system-ui, sans-serif;
background: var(--bg);
color: var(--text);
display: flex;
justify-content: center;
align-items: center;
min-height: 100vh;
margin: 0;
padding: 1rem;
}
.container {
background: var(--surface);
padding: 2rem;
border-radius: 16px;
width: 100%;
max-width: 600px;
box-shadow: 0 20px 40px rgba(0,0,0,0.4);
border: 1px solid #2a2a4a;
}
h2 {
margin-top: 0;
text-align: center;
background: linear-gradient(135deg, #a855f7, #ec4899);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 1.8rem;
}
h4 {
margin: 0;
color: #9ca3af;
text-align: center;
font-weight: 400;
margin-bottom: 1.5rem;
}
.form-group {
margin-bottom: 1.5rem;
}
label {
display: block;
margin-bottom: 0.5rem;
font-size: 0.9rem;
color: #d1d5db;
}
input, textarea {
width: 100%;
padding: 0.75rem;
background: #0f0f1a;
border: 1px solid #374151;
border-radius: 8px;
color: white;
box-sizing: border-box;
font-family: inherit;
}
input:focus, textarea:focus {
outline: 2px solid var(--primary);
border-color: transparent;
}
button {
width: 100%;
padding: 0.875rem;
background: linear-gradient(135deg, #a855f7, #ec4899);
color: white;
border: none;
border-radius: 8px;
font-weight: 700;
cursor: pointer;
transition: all 0.2s;
font-size: 1rem;
}
button:hover {
transform: translateY(-2px);
box-shadow: 0 10px 20px rgba(168, 85, 247, 0.3);
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;
transform: none;
box-shadow: none;
}
#statusBox {
margin-top: 2rem;
display: none;
background: #0f0f1a;
padding: 1.5rem;
border-radius: 12px;
border: 1px solid #374151;
}
.status-badge {
display: inline-block;
padding: 6px 14px;
border-radius: 99px;
font-size: 0.8rem;
font-weight: 600;
margin-bottom: 1rem;
}
.status-badge.queued { background: #f59e0b; color: black; }
.status-badge.processing { background: #3b82f6; color: white; }
.status-badge.completed { background: #10b981; color: black; }
.status-badge.failed { background: #ef4444; color: white; }
#progressText {
color: #d1d5db;
margin-bottom: 1rem;
font-size: 0.95rem;
}
.result-box {
background: #1a1a2e;
padding: 1rem;
border-radius: 8px;
margin-top: 1rem;
}
.result-url {
word-break: break-all;
font-size: 0.85rem;
color: var(--primary);
margin-bottom: 0.5rem;
}
.copy-btn {
background: #374151;
border: none;
color: white;
padding: 8px 16px;
border-radius: 6px;
cursor: pointer;
font-size: 0.85rem;
width: auto;
margin-top: 0.5rem;
}
.copy-btn:hover {
background: #4b5563;
transform: none;
box-shadow: none;
}
.spinner {
border: 4px solid #374151;
border-top: 4px solid var(--primary);
border-radius: 50%;
width: 30px;
height: 30px;
animation: spin 1s linear infinite;
margin: 0 auto 1rem auto;
display: none;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.info-box {
background: rgba(168, 85, 247, 0.1);
border: 1px solid rgba(168, 85, 247, 0.3);
border-radius: 8px;
padding: 1rem;
margin-bottom: 1.5rem;
font-size: 0.85rem;
color: #d1d5db;
}
.segments-info {
margin-top: 1rem;
font-size: 0.85rem;
color: #9ca3af;
}
video {
width: 100%;
max-height: 400px;
border-radius: 8px;
margin-top: 1rem;
}
</style>
</head>
<body>
<div class="container">
<h2>✂️ Magic Cut</h2>
<h4>Transform 16:9 videos into vertical shorts with face tracking</h4>
<div class="info-box">
<strong>How it works:</strong><br>
1. Paste your Cloudinary video URL with <code>so_X,du_Y</code> (start time, duration)<br>
2. We analyze each frame for faces (every 500ms)<br>
3. When 2+ faces detected → split-screen layout<br>
4. Get your final 9:16 video URL!
</div>
<div class="form-group">
<label>Cloudinary Video URL</label>
<textarea id="videoUrl" rows="3" placeholder="https://res.cloudinary.com/doxoms9hd/video/upload/so_55,du_30/fl_getinfo/video_id.jpg"></textarea>
<small style="color: #6b7280; display: block; margin-top: 4px;">
Format: so_X,du_Y (start at X seconds, duration Y seconds)
</small>
</div>
<button id="processBtn" onclick="submitJob()">🎬 Process Video</button>
<div id="statusBox">
<div id="spinner" class="spinner"></div>
<span id="statusBadge" class="status-badge">Waiting</span>
<div id="progressText">Initializing...</div>
<div id="resultBox"></div>
</div>
</div>
<script>
const API_BASE = window.location.origin;
let pollInterval = null;
async function submitJob() {
const videoUrl = document.getElementById('videoUrl').value.trim();
const btn = document.getElementById('processBtn');
const statusBox = document.getElementById('statusBox');
if (!videoUrl) {
alert("Please enter a video URL");
return;
}
btn.disabled = true;
statusBox.style.display = 'block';
document.getElementById('resultBox').innerHTML = '';
updateStatus("queued", "Submitting job...");
try {
const response = await fetch(`${API_BASE}/jobs`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ video_url: videoUrl })
});
const data = await response.json();
if (data.job_id) {
console.log("Job Submitted:", data.job_id);
startPolling(data.job_id);
} else {
updateStatus("failed", "Failed to get Job ID");
btn.disabled = false;
}
} catch (error) {
console.error(error);
updateStatus("failed", "Connection Error. Check URL.");
btn.disabled = false;
}
}
function startPolling(jobId) {
if (pollInterval) clearInterval(pollInterval);
pollInterval = setInterval(async () => {
try {
const res = await fetch(`${API_BASE}/jobs/${jobId}`);
const job = await res.json();
updateStatus(job.status, job.progress);
if (job.status === 'completed') {
clearInterval(pollInterval);
showResults(job.result);
document.getElementById('processBtn').disabled = false;
}
if (job.status === 'failed') {
clearInterval(pollInterval);
document.getElementById('progressText').innerText = "Error: " + job.error;
document.getElementById('processBtn').disabled = false;
}
} catch (e) {
console.error("Polling error", e);
}
}, 2000);
}
function updateStatus(status, message) {
const badge = document.getElementById('statusBadge');
const spinner = document.getElementById('spinner');
const text = document.getElementById('progressText');
badge.className = `status-badge ${status}`;
badge.innerText = status.toUpperCase();
text.innerText = message || "Processing...";
if (status === 'processing' || status === 'queued') {
spinner.style.display = 'block';
} else {
spinner.style.display = 'none';
}
}
function showResults(result) {
const box = document.getElementById('resultBox');
const segments = result.multi_face_segments || [];
let segmentsHtml = '';
if (segments.length > 0) {
segmentsHtml = `
<div class="segments-info">
<strong>🎭 Multi-face segments found:</strong><br>
${segments.map((s, i) => `Segment ${i+1}: ${s.start}s - ${s.end}s`).join('<br>')}
</div>
`;
} else {
segmentsHtml = `<div class="segments-info">No multi-face segments detected (single speaker throughout)</div>`;
}
box.innerHTML = `
<div class="result-box">
<div style="margin-bottom: 0.5rem; color: #10b981; font-weight: 600;">✅ Video Ready!</div>
<div class="result-url"></div>
<button class="copy-btn">
📋 Copy URL
</button>
${segmentsHtml}
<div class="segments-info">
<strong>📊 Stats:</strong> ${result.total_frames_analyzed} frames analyzed
</div>
<video controls></video>
</div>
`;
// The URL is set through DOM properties, never through markup, so quotes or
// angle brackets in it cannot break the page or the copy handler.
box.querySelector('.result-url').textContent = result.video_url;
box.querySelector('video').src = result.video_url;
const copyBtn = box.querySelector('.copy-btn');
copyBtn.addEventListener('click', () => {
navigator.clipboard.writeText(result.video_url).then(() => { copyBtn.innerText = 'Copied!'; });
});
}
</script>
</body>
</html>
"""
    return html_content