magic_cut

Sleeping

App Files Files Community

magic_cut / app.py

ADXabhi

Upload app.py

3600cce verified 2 months ago

raw

history blame contribute delete

31.7 kB

	import os
	import re
	import uuid
	import time
	import asyncio
	from typing import Dict, List, Optional
	from fastapi import FastAPI, BackgroundTasks, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import HTMLResponse
	from pydantic import BaseModel
	import httpx

	# ------------------------------------------
	# CONFIGURATION
	# ------------------------------------------
	def _fetch_cloud_name():
	    """Fetch the ``cloud_name`` value from the remote config endpoint.

	    Issues up to three HTTPS GET requests to
	    ``https://media.toolxp.org/config`` (10 second timeout each) and returns
	    the first non-empty ``cloud_name`` found in the JSON response.

	    Returns:
	        The ``cloud_name`` value from the config payload.

	    Raises:
	        RuntimeError: If none of the three attempts yields a non-empty
	            ``cloud_name``. The last underlying error, if any, is attached
	            as the exception's cause.
	    """
	    import json as _j
	    import ssl as _ssl
	    import urllib.request as _ur

	    ctx = _ssl.create_default_context()
	    req = _ur.Request("https://media.toolxp.org/config", headers={"User-Agent": "Mozilla/5.0"})
	    last_error = None
	    for _i in range(3):
	        try:
	            with _ur.urlopen(req, timeout=10, context=ctx) as r:
	                name = _j.loads(r.read().decode())["cloud_name"]
	            if name:
	                print(f"[config] cloud_name={name}")
	                return name
	            # An empty value counts as a failed attempt and is reported like one.
	            print(f"[config] attempt {_i+1} failed: empty cloud_name in response")
	        except Exception as _e:
	            # Best-effort retry: any failure (network, JSON, missing key) is
	            # logged and the next attempt is made.
	            last_error = _e
	            print(f"[config] attempt {_i+1} failed: {_e}")
	    raise RuntimeError("[config] FATAL: could not fetch cloud_name after 3 attempts") from last_error
	# Resolved once at import time; _fetch_cloud_name raises RuntimeError if the
	# config endpoint cannot be read, which aborts module import.
	CLOUD_NAME = _fetch_cloud_name()
	# Media proxy hides Cloudinary origin from end-users.
	# Route: media.toolxp.org → res.cloudinary.com/doxoms9hd (via Cloudflare Worker)
	# No interpolation is needed here, so this is a plain string literal.
	CLOUDINARY_BASE = "https://media.toolxp.org/video/upload"

	# ------------------------------------------
	# IN-MEMORY JOB STORE
	# ------------------------------------------
	# Job records keyed by job id (a UUID string created in submit_job).
	# Held only in this process's memory; nothing in this file removes entries.
	JOBS: Dict[str, dict] = {}

	# ------------------------------------------
	# APP SETUP
	# ------------------------------------------
	app = FastAPI()

	# NOTE(review): wildcard origins are combined with allow_credentials=True.
	# The FastAPI CORS documentation says origins, methods and headers must be
	# listed explicitly when credentials are allowed. No endpoint in this file
	# reads cookies or auth headers, so credentials look unnecessary; confirm
	# against real clients before tightening.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)


	class VideoRequest(BaseModel):
	    """JSON body accepted by the ``POST /jobs`` endpoint."""

	    # URL of the clip to process; it is handed to the background worker as-is.
	    video_url: str


	# ------------------------------------------
	# URL PARSING HELPERS
	# ------------------------------------------
	def parse_cloudinary_url(url: str) -> dict:
	    """
	    Parse a Cloudinary URL to extract video_id, start_time, and duration.

	    Expected format: https://res.cloudinary.com/.../so_55,du_30/.../video_id.mp4
	    or: https://res.cloudinary.com/.../so_55,du_30/fl_getinfo/video_id.jpg

	    Args:
	        url: The URL to inspect.

	    Returns:
	        A dict with:
	        - "video_id": last path segment without its extension, or None when
	          the URL does not end in .mp4, .jpg, .webm or .mov
	        - "start_time": value of the first so_X parameter (0 if absent)
	        - "duration": value of the first du_X parameter (30 if absent)
	        - "end_time": start_time + duration
	    """
	    # Extract video ID (last segment before extension).
	    # The alternation must use bare "|"; an escaped "\|" matches a literal
	    # pipe character and would never match a real file extension.
	    video_id_match = re.search(r'/([^/]+)\.(?:mp4|jpg|webm|mov)$', url)
	    video_id = video_id_match.group(1) if video_id_match else None

	    # Extract start offset (so_X)
	    start_match = re.search(r'so_(\d+(?:\.\d+)?)', url)
	    start_time = float(start_match.group(1)) if start_match else 0

	    # Extract duration (du_X)
	    duration_match = re.search(r'du_(\d+(?:\.\d+)?)', url)
	    duration = float(duration_match.group(1)) if duration_match else 30

	    return {
	        "video_id": video_id,
	        "start_time": start_time,
	        "duration": duration,
	        "end_time": start_time + duration
	    }


	def get_face_info_url(video_id: str, time_sec: float) -> str:
	    """
	    Compose the face-info URL for one frame of a video.

	    The URL selects the frame at ``time_sec`` as a JPEG, applies a
	    face-gravity thumbnail crop 450 wide, and adds ``fl_getinfo``.
	    fetch_face_data parses the response as JSON and reads its "landmarks"
	    and "input" entries.
	    """
	    frame_transform = f"so_{time_sec},f_jpg/c_thumb,g_face,w_450/fl_getinfo"
	    return f"{CLOUDINARY_BASE}/{frame_transform}/{video_id}.jpg"


	async def fetch_face_data(client: httpx.AsyncClient, video_id: str, time_sec: float) -> dict:
	    """
	    Fetch face detection data for a specific timestamp.

	    Args:
	        client: Shared HTTP client used for the request (10 second timeout).
	        video_id: Video identifier inserted into the face-info URL.
	        time_sec: Timestamp of the frame to analyse, in seconds.

	    Returns:
	        A dict with "time", "face_count", "landmarks" (always a list) and the
	        source dimensions "source_w" / "source_h". On any failure (non-200
	        status, network error, unparsable body) a zero-face record with
	        1920x1080 default dimensions is returned; this function does not
	        raise.
	    """
	    url = get_face_info_url(video_id, time_sec)
	    try:
	        response = await client.get(url, timeout=10.0)
	        if response.status_code == 200:
	            data = response.json()
	            landmarks = data.get("landmarks", [[]])
	            input_info = data.get("input", {})
	            # landmarks[0] is array of face objects. Normalise a missing or
	            # null entry to an empty list so callers can always iterate it.
	            faces = (landmarks[0] if landmarks else None) or []
	            return {
	                "time": time_sec,
	                "face_count": len(faces),
	                "landmarks": faces,
	                "source_w": input_info.get("width", 1920),
	                "source_h": input_info.get("height", 1080)
	            }
	        # A non-200 answer is treated as "no faces", but it is logged.
	        print(f"Error fetching face data at {time_sec}s: HTTP {response.status_code}")
	    except Exception as e:
	        # Deliberate best-effort: one bad frame must not abort the whole job.
	        print(f"Error fetching face data at {time_sec}s: {e}")

	    return {"time": time_sec, "face_count": 0, "landmarks": [], "source_w": 1920, "source_h": 1080}


	def find_multi_face_segments(frame_data: List[dict]) -> List[dict]:
	    """
	    Group consecutive frames that show two or more real faces into segments.

	    Each frame's landmarks are reduced to face centers and passed through
	    the ghost-face filter before the face count is checked, so rejected
	    detections never open a segment.

	    Returns a list of dicts with "start", "end", "top_face" and
	    "bottom_face". "end" is the time of the first frame that no longer
	    qualifies, or the last frame's time when the data ends inside a
	    segment. "top_face" / "bottom_face" are the first and second crop boxes
	    returned by compute_face_crops.
	    """
	    # Source dimensions are taken from the first frame only.
	    if frame_data:
	        first_frame = frame_data[0]
	        source_w = first_frame.get("source_w", 1920)
	        source_h = first_frame.get("source_h", 1080)
	    else:
	        source_w, source_h = 1920, 1080

	    def make_segment(start, end, collected_landmarks):
	        top_crop, bottom_crop = compute_face_crops(collected_landmarks, source_w, source_h)
	        return {
	            "start": start,
	            "end": end,
	            "top_face": top_crop,
	            "bottom_face": bottom_crop
	        }

	    segments = []
	    segment_open = False
	    open_since = None
	    collected = []  # raw landmarks of every frame in the open segment

	    for frame in frame_data:
	        centers = []
	        for candidate in frame.get("landmarks", []):
	            center = _extract_face_center(candidate)
	            if center:
	                centers.append(center)

	        # Count faces only after ghost detections have been removed.
	        has_two_real_faces = len(_filter_ghost_faces(centers)) >= 2

	        if has_two_real_faces:
	            if not segment_open:
	                segment_open = True
	                open_since = frame["time"]
	                collected = []
	            collected.append(frame["landmarks"])
	            continue

	        if segment_open:
	            segment_open = False
	            segments.append(make_segment(open_since, frame["time"], collected))

	    # The data ended while a segment was still running.
	    if segment_open and open_since is not None:
	        final_time = frame_data[-1]["time"] if frame_data else open_since
	        segments.append(make_segment(open_since, final_time, collected))

	    return segments


	def _extract_face_center(face: dict) -> dict:
	"""
	Extract the geometric center (cx, cy_eyes) of a face from Cloudinary landmarks.
	Also computes 'span' — the diagonal of the landmark bounding box — used to
	detect and reject ghost/fake face detections.

	cy_eyes = eye-level Y, which is the most reliable vertical anchor.
	Works for both frontal and profile views.
	"""
	pts = [v for v in face.values() if isinstance(v, dict) and 'x' in v and 'y' in v]
	if not pts:
	return None

	xs = [p['x'] for p in pts]
	ys = [p['y'] for p in pts]

	cx = sum(xs) / len(xs)

	# Use the topmost Y coordinate as the eye-level reference
	# (eyes are always the highest landmarks returned)
	cy_eyes = min(ys)

	# Landmark bounding box diagonal — measures "face size on screen"
	# Real faces: 80-300px diagonal. Ghost faces (hands, objects): 10-40px.
	span_x = max(xs) - min(xs)
	span_y = max(ys) - min(ys)
	span = (span_x 2 + span_y 2) ** 0.5

	return {'cx': cx, 'cy_eyes': cy_eyes, 'span': span}


	def _filter_ghost_faces(processed_faces: list) -> list:
	"""
	Filter out ghost/fake face detections from a single frame.

	Ghost faces are typically:
	- Hands, fingers, or objects misidentified as faces
	- Very small landmark span compared to real faces in the same frame
	- Landmark span < 40% of the largest face → rejected
	- Absolute minimum span of 30px (any face smaller than this is too tiny to be real)
	"""
	if len(processed_faces) < 2:
	return processed_faces

	# Find the largest face in this frame
	max_span = max(f['span'] for f in processed_faces)

	# Reject faces whose span is less than 40% of the largest face
	# Also reject faces with absolute span < 30px (too small to be a real face)
	MIN_RELATIVE_SPAN = 0.40
	MIN_ABSOLUTE_SPAN = 30.0

	filtered = [
	f for f in processed_faces
	if f['span'] >= max_span * MIN_RELATIVE_SPAN and f['span'] >= MIN_ABSOLUTE_SPAN
	]

	return filtered if filtered else processed_faces[:1] # Always keep at least the biggest face


	def compute_face_crops(segment_faces_data: List[List[dict]], source_w: int, source_h: int) -> tuple[dict, dict]:
	    """
	    Compute one crop box per speaker for a multi-face segment.

	    Crop SIZE is derived from the source video dimensions; the landmarks
	    are used only to decide where each box is placed.

	    Steps:
	    1. Per frame, turn landmarks into face centers and drop ghost faces.
	    2. Sort the remaining faces by horizontal position; in frames with two
	       or more faces, record the leftmost and rightmost one.
	    3. Average each side's center over all recorded frames. If nothing was
	       recorded, fall back to 25% / 75% of the width at 40% of the height.
	    4. Start from a crop width of 50% of the source width.
	    5. Anti-overlap: if 92% of the horizontal distance between the two
	       averaged centers is smaller than that width (and above 200), use it.
	    6. Derive the height from a 1080:960 aspect ratio and shrink the box if
	       it is taller than the source.
	    7. Place each box centered horizontally on its face with the eye line
	       35% below the top edge, then clamp it to the source frame.

	    Returns:
	        (left_crop, right_crop), each a dict with "x", "y", "w", "h".
	    """
	    aspect = 1080 / 960      # width / height of one split-screen half
	    base_ratio = 0.50        # base crop width as a share of source width
	    eye_line = 0.35          # eye level as a share of crop height from the top

	    # --- Collect the leftmost / rightmost real face of every usable frame ---
	    lefts = []
	    rights = []
	    for faces_in_frame in segment_faces_data:
	        centers = [
	            c for c in (_extract_face_center(face) for face in faces_in_frame) if c
	        ]
	        ordered = sorted(_filter_ghost_faces(centers), key=lambda item: item['cx'])
	        if len(ordered) < 2:
	            continue
	        lefts.append(ordered[0])
	        rights.append(ordered[-1])

	    def mean_position(samples, default_x, default_y):
	        # Temporal smoothing: plain mean over all sampled frames.
	        if samples:
	            count = len(samples)
	            mean_x = sum(s['cx'] for s in samples) / count
	            mean_y = sum(s['cy_eyes'] for s in samples) / count
	            return mean_x, mean_y
	        return default_x, default_y

	    left_cx, left_cy = mean_position(lefts, source_w * 0.25, source_h * 0.40)
	    right_cx, right_cy = mean_position(rights, source_w * 0.75, source_h * 0.40)

	    # --- Crop size ---
	    crop_w = int(source_w * base_ratio)

	    # Leave an 8% gap between the two boxes when the faces are close together.
	    overlap_limit = int(abs(right_cx - left_cx) * 0.92)
	    if 200 < overlap_limit < crop_w:
	        crop_w = overlap_limit

	    crop_h = int(crop_w / aspect)
	    if crop_h > source_h:
	        # Too tall for the source: pin the height and recompute the width.
	        crop_h = source_h
	        crop_w = int(crop_h * aspect)

	    # --- Placement ---
	    def place(face_cx, face_cy_eyes):
	        left_edge = int(face_cx - crop_w / 2)
	        top_edge = int(face_cy_eyes - crop_h * eye_line)

	        # Keep the whole box inside the source frame.
	        left_edge = max(0, min(left_edge, source_w - crop_w))
	        top_edge = max(0, min(top_edge, source_h - crop_h))

	        return {"x": left_edge, "y": top_edge, "w": crop_w, "h": crop_h}

	    return place(left_cx, left_cy), place(right_cx, right_cy)



	def build_final_url(video_id: str, start_time: float, end_time: float, multi_face_segments: List[dict]) -> str:
	    """
	    Build the final Cloudinary URL with layers for multi-face segments.

	    Base: Full 9:16 video (1080x1920, c_fill) with g_auto:face.
	    Layers: for every multi-face segment of at least one second, two video
	    overlays of the same video — the segment's "top_face" crop anchored
	    g_north and its "bottom_face" crop anchored g_south, each scaled to
	    1080x960.

	    Args:
	        video_id: Identifier used for the base video and for each overlay.
	        start_time: Start of the clip in source-video seconds.
	        end_time: End of the clip in source-video seconds.
	        multi_face_segments: Segment dicts with "start", "end" and optional
	            "top_face" / "bottom_face" boxes ({"x", "y", "w", "h"}).

	    Returns:
	        The complete .mp4 delivery URL.
	    """
	    # Base transformation: 9:16 vertical with face tracking fallback
	    base = f"so_{start_time},eo_{end_time}/w_1080,h_1920,c_fill,g_auto:face"

	    # Used when a segment carries no pre-calculated bounding box.
	    fallback_box = {"x": 0, "y": 0, "w": 300, "h": 300}

	    def overlay(box, gravity, seg_start, seg_end, seg_duration, out_start, out_end):
	        # 1. c_crop extracts just the speaker's box
	        # 2. c_fill scales that box to 1080x960
	        # 3. fl_layer_apply places it for the segment's span of the output
	        return (
	            f"l_video:{video_id},"
	            f"so_{seg_start},eo_{seg_end},du_{seg_duration},ac_none/"
	            f"c_crop,w_{box['w']},h_{box['h']},x_{box['x']},y_{box['y']}/"
	            f"c_fill,w_1080,h_960/"
	            f"fl_layer_apply,{gravity},so_{out_start},eo_{out_end}"
	        )

	    parts = [base]
	    for segment in multi_face_segments:
	        seg_start = round(segment["start"], 2)
	        seg_end = round(segment["end"], 2)
	        seg_duration = round(seg_end - seg_start, 2)

	        # Skip segments shorter than 1 second
	        if seg_duration < 1:
	            continue

	        # Offsets in the OUTPUT video timeline (relative to the clip start)
	        out_start = round(seg_start - start_time, 2)
	        out_end = round(seg_end - start_time, 2)

	        timing = (seg_start, seg_end, seg_duration, out_start, out_end)
	        # Top half: left speaker. Bottom half: right speaker.
	        parts.append(overlay(segment.get("top_face", fallback_box), "g_north", *timing))
	        parts.append(overlay(segment.get("bottom_face", fallback_box), "g_south", *timing))

	    return f"{CLOUDINARY_BASE}/{'/'.join(parts)}/{video_id}.mp4"


	# ------------------------------------------
	# BACKGROUND WORKER
	# ------------------------------------------
	def process_video_sync(job_id: str, video_url: str):
	    """
	    Blocking entry point for the async pipeline.

	    submit_job registers this function as the background task; it drives
	    process_video_async to completion with asyncio.run.
	    """
	    pipeline = process_video_async(job_id, video_url)
	    asyncio.run(pipeline)


	async def process_video_async(job_id: str, video_url: str):
	    """
	    Main video processing logic:
	    1. Parse URL to get video_id and time range
	    2. Fetch face data for each frame (500ms intervals)
	    3. Find multi-face segments
	    4. Build final URL with layers

	    Progress, result and error are written into ``JOBS[job_id]``; nothing is
	    returned. Failures inside the pipeline are caught, logged and recorded
	    on the job with status "failed".

	    Raises:
	        KeyError: If ``job_id`` has no entry in ``JOBS`` (checked before the
	            guarded section starts).
	    """
	    print(f"[{job_id}] Starting job: {video_url}")
	    job = JOBS[job_id]
	    job["status"] = "processing"
	    job["progress"] = "Parsing video URL..."

	    try:
	        # 1. Parse URL
	        parsed = parse_cloudinary_url(video_url)
	        video_id = parsed["video_id"]
	        start_time = parsed["start_time"]
	        end_time = parsed["end_time"]
	        duration = parsed["duration"]

	        if not video_id:
	            raise ValueError("Could not extract video ID from URL")

	        job["progress"] = f"Analyzing {duration}s of video..."
	        print(f"[{job_id}] Video: {video_id}, Range: {start_time}s - {end_time}s")

	        # 2. Fetch face data for each frame (500ms intervals).
	        # Timestamps come from an integer index, not from repeated float
	        # addition, so accumulated rounding cannot push the last sample
	        # past end_time and silently drop it.
	        # NOTE(review): duration is taken straight from the submitted URL
	        # and is not capped, so it directly controls the request count.
	        frame_interval = 0.5
	        frame_count = int(duration / frame_interval) + 1
	        frame_times = [
	            round(start_time + index * frame_interval, 1)
	            for index in range(frame_count)
	        ]

	        total_frames = len(frame_times)
	        job["progress"] = f"Fetching face data for {total_frames} frames..."

	        frame_data = []
	        async with httpx.AsyncClient() as client:
	            # Process in batches of 10 to avoid overwhelming the API
	            batch_size = 10
	            for i in range(0, total_frames, batch_size):
	                batch = frame_times[i:i + batch_size]
	                tasks = [fetch_face_data(client, video_id, t) for t in batch]
	                # gather returns results in task order, so frame_data stays
	                # in timestamp order.
	                results = await asyncio.gather(*tasks)
	                frame_data.extend(results)

	                progress_pct = min(100, int((i + batch_size) / total_frames * 100))
	                job["progress"] = f"Analyzing frames... {progress_pct}%"

	        # 3. Find multi-face segments
	        job["progress"] = "Detecting multi-face segments..."
	        multi_face_segments = find_multi_face_segments(frame_data)
	        print(f"[{job_id}] Found {len(multi_face_segments)} multi-face segments")

	        # 4. Build final URL
	        job["progress"] = "Building final video URL..."
	        final_url = build_final_url(video_id, start_time, end_time, multi_face_segments)

	        # 5. Complete
	        job["status"] = "completed"
	        job["progress"] = "Done"
	        job["result"] = {
	            "video_url": final_url,
	            "video_id": video_id,
	            "start_time": start_time,
	            "end_time": end_time,
	            "multi_face_segments": multi_face_segments,
	            "total_frames_analyzed": total_frames
	        }
	        print(f"[{job_id}] Completed: {final_url}")

	    except Exception as e:
	        # Top-level boundary of the background task: record the failure on
	        # the job so the polling client can display it.
	        print(f"[{job_id}] FAILED: {str(e)}")
	        job["status"] = "failed"
	        job["error"] = str(e)
	        job["progress"] = "Failed"


	# ------------------------------------------
	# API ENDPOINTS
	# ------------------------------------------

	@app.post("/jobs")
	def submit_job(req: VideoRequest, background_tasks: BackgroundTasks):
	    """
	    Register a new processing job and schedule it as a background task.

	    Stores a fresh "queued" record in JOBS under a random UUID, hands the
	    id and the submitted URL to process_video_sync, and returns the id so
	    the caller can poll GET /jobs/{job_id}.
	    """
	    new_id = str(uuid.uuid4())

	    record = {
	        "status": "queued",
	        "progress": "Waiting in queue...",
	        "result": None,
	        "error": None,
	        "created_at": time.time()
	    }
	    JOBS[new_id] = record

	    background_tasks.add_task(process_video_sync, new_id, req.video_url)

	    response = {"job_id": new_id, "status": "queued"}
	    return response


	@app.get("/jobs/{job_id}")
	def get_job_status(job_id: str):
	    """
	    Return the stored record for ``job_id``.

	    Responds with HTTP 404 ("Job not found") when no record exists.
	    """
	    record = JOBS.get(job_id)
	    if record:
	        return record
	    raise HTTPException(status_code=404, detail="Job not found")


	@app.get("/")
	def home():
	    """Root endpoint: returns a fixed status message and version string."""
	    banner = {"message": "Magic Cut API is Running", "version": "1.0"}
	    return banner


	@app.get("/client", response_class=HTMLResponse)
	def serve_client():
	    """
	    Serve the embedded HTML client.

	    Returns a self-contained page (inline CSS and JavaScript) that posts the
	    entered URL to ``/jobs`` on the same origin, polls ``/jobs/{job_id}``
	    every two seconds, and renders the resulting video URL.

	    The script's logical-OR operators must be written as a bare ``||``; a
	    backslash-escaped form is passed through to the browser verbatim and
	    stops the whole script from parsing.
	    """
	    html_content = """
	<!DOCTYPE html>
	<html lang="en">
	<head>
	    <meta charset="UTF-8">
	    <meta name="viewport" content="width=device-width, initial-scale=1.0">
	    <title>Magic Cut - Video Face Splitter</title>
	    <style>
	        :root {
	            --primary: #a855f7;
	            --bg: #0f0f1a;
	            --surface: #1a1a2e;
	            --text: #f3f4f6;
	        }
	        body {
	            font-family: 'Inter', system-ui, sans-serif;
	            background: var(--bg);
	            color: var(--text);
	            display: flex;
	            justify-content: center;
	            align-items: center;
	            min-height: 100vh;
	            margin: 0;
	            padding: 1rem;
	        }
	        .container {
	            background: var(--surface);
	            padding: 2rem;
	            border-radius: 16px;
	            width: 100%;
	            max-width: 600px;
	            box-shadow: 0 20px 40px rgba(0,0,0,0.4);
	            border: 1px solid #2a2a4a;
	        }
	        h2 {
	            margin-top: 0;
	            text-align: center;
	            background: linear-gradient(135deg, #a855f7, #ec4899);
	            -webkit-background-clip: text;
	            -webkit-text-fill-color: transparent;
	            font-size: 1.8rem;
	        }
	        h4 {
	            margin: 0;
	            color: #9ca3af;
	            text-align: center;
	            font-weight: 400;
	            margin-bottom: 1.5rem;
	        }
	        .form-group {
	            margin-bottom: 1.5rem;
	        }
	        label {
	            display: block;
	            margin-bottom: 0.5rem;
	            font-size: 0.9rem;
	            color: #d1d5db;
	        }
	        input, textarea {
	            width: 100%;
	            padding: 0.75rem;
	            background: #0f0f1a;
	            border: 1px solid #374151;
	            border-radius: 8px;
	            color: white;
	            box-sizing: border-box;
	            font-family: inherit;
	        }
	        input:focus, textarea:focus {
	            outline: 2px solid var(--primary);
	            border-color: transparent;
	        }
	        button {
	            width: 100%;
	            padding: 0.875rem;
	            background: linear-gradient(135deg, #a855f7, #ec4899);
	            color: white;
	            border: none;
	            border-radius: 8px;
	            font-weight: 700;
	            cursor: pointer;
	            transition: all 0.2s;
	            font-size: 1rem;
	        }
	        button:hover {
	            transform: translateY(-2px);
	            box-shadow: 0 10px 20px rgba(168, 85, 247, 0.3);
	        }
	        button:disabled {
	            opacity: 0.5;
	            cursor: not-allowed;
	            transform: none;
	            box-shadow: none;
	        }
	        #statusBox {
	            margin-top: 2rem;
	            display: none;
	            background: #0f0f1a;
	            padding: 1.5rem;
	            border-radius: 12px;
	            border: 1px solid #374151;
	        }
	        .status-badge {
	            display: inline-block;
	            padding: 6px 14px;
	            border-radius: 99px;
	            font-size: 0.8rem;
	            font-weight: 600;
	            margin-bottom: 1rem;
	        }
	        .status-badge.queued { background: #f59e0b; color: black; }
	        .status-badge.processing { background: #3b82f6; color: white; }
	        .status-badge.completed { background: #10b981; color: black; }
	        .status-badge.failed { background: #ef4444; color: white; }
	        #progressText {
	            color: #d1d5db;
	            margin-bottom: 1rem;
	            font-size: 0.95rem;
	        }
	        .result-box {
	            background: #1a1a2e;
	            padding: 1rem;
	            border-radius: 8px;
	            margin-top: 1rem;
	        }
	        .result-url {
	            word-break: break-all;
	            font-size: 0.85rem;
	            color: var(--primary);
	            margin-bottom: 0.5rem;
	        }
	        .copy-btn {
	            background: #374151;
	            border: none;
	            color: white;
	            padding: 8px 16px;
	            border-radius: 6px;
	            cursor: pointer;
	            font-size: 0.85rem;
	            width: auto;
	            margin-top: 0.5rem;
	        }
	        .copy-btn:hover {
	            background: #4b5563;
	            transform: none;
	            box-shadow: none;
	        }
	        .spinner {
	            border: 4px solid #374151;
	            border-top: 4px solid var(--primary);
	            border-radius: 50%;
	            width: 30px;
	            height: 30px;
	            animation: spin 1s linear infinite;
	            margin: 0 auto 1rem auto;
	            display: none;
	        }
	        @keyframes spin {
	            0% { transform: rotate(0deg); }
	            100% { transform: rotate(360deg); }
	        }
	        .info-box {
	            background: rgba(168, 85, 247, 0.1);
	            border: 1px solid rgba(168, 85, 247, 0.3);
	            border-radius: 8px;
	            padding: 1rem;
	            margin-bottom: 1.5rem;
	            font-size: 0.85rem;
	            color: #d1d5db;
	        }
	        .segments-info {
	            margin-top: 1rem;
	            font-size: 0.85rem;
	            color: #9ca3af;
	        }
	        video {
	            width: 100%;
	            max-height: 400px;
	            border-radius: 8px;
	            margin-top: 1rem;
	        }
	    </style>
	</head>
	<body>
	    <div class="container">
	        <h2>✂️ Magic Cut</h2>
	        <h4>Transform 16:9 videos into vertical shorts with face tracking</h4>

	        <div class="info-box">
	            <strong>How it works:</strong><br>
	            1. Paste your Cloudinary video URL with <code>so_X,du_Y</code> (start time, duration)<br>
	            2. We analyze each frame for faces (every 500ms)<br>
	            3. When 2+ faces detected → split-screen layout<br>
	            4. Get your final 9:16 video URL!
	        </div>

	        <div class="form-group">
	            <label>Cloudinary Video URL</label>
	            <textarea id="videoUrl" rows="3" placeholder="https://res.cloudinary.com/doxoms9hd/video/upload/so_55,du_30/fl_getinfo/video_id.jpg"></textarea>
	            <small style="color: #6b7280; display: block; margin-top: 4px;">
	                Format: so_X,du_Y (start at X seconds, duration Y seconds)
	            </small>
	        </div>

	        <button id="processBtn" onclick="submitJob()">🎬 Process Video</button>

	        <div id="statusBox">
	            <div id="spinner" class="spinner"></div>
	            <span id="statusBadge" class="status-badge">Waiting</span>
	            <div id="progressText">Initializing...</div>
	            <div id="resultBox"></div>
	        </div>
	    </div>

	    <script>
	        const API_BASE = window.location.origin;
	        let pollInterval = null;

	        async function submitJob() {
	            const videoUrl = document.getElementById('videoUrl').value.trim();
	            const btn = document.getElementById('processBtn');
	            const statusBox = document.getElementById('statusBox');

	            if (!videoUrl) {
	                alert("Please enter a video URL");
	                return;
	            }

	            btn.disabled = true;
	            statusBox.style.display = 'block';
	            document.getElementById('resultBox').innerHTML = '';
	            updateStatus("queued", "Submitting job...");

	            try {
	                const response = await fetch(`${API_BASE}/jobs`, {
	                    method: 'POST',
	                    headers: { 'Content-Type': 'application/json' },
	                    body: JSON.stringify({ video_url: videoUrl })
	                });

	                const data = await response.json();

	                if (data.job_id) {
	                    console.log("Job Submitted:", data.job_id);
	                    startPolling(data.job_id);
	                } else {
	                    updateStatus("failed", "Failed to get Job ID");
	                    btn.disabled = false;
	                }

	            } catch (error) {
	                console.error(error);
	                updateStatus("failed", "Connection Error. Check URL.");
	                btn.disabled = false;
	            }
	        }

	        function startPolling(jobId) {
	            if (pollInterval) clearInterval(pollInterval);

	            pollInterval = setInterval(async () => {
	                try {
	                    const res = await fetch(`${API_BASE}/jobs/${jobId}`);
	                    const job = await res.json();

	                    updateStatus(job.status, job.progress);

	                    if (job.status === 'completed') {
	                        clearInterval(pollInterval);
	                        showResults(job.result);
	                        document.getElementById('processBtn').disabled = false;
	                    }

	                    if (job.status === 'failed') {
	                        clearInterval(pollInterval);
	                        document.getElementById('progressText').innerText = "Error: " + job.error;
	                        document.getElementById('processBtn').disabled = false;
	                    }

	                } catch (e) {
	                    console.error("Polling error", e);
	                }
	            }, 2000);
	        }

	        function updateStatus(status, message) {
	            const badge = document.getElementById('statusBadge');
	            const spinner = document.getElementById('spinner');
	            const text = document.getElementById('progressText');

	            badge.className = `status-badge ${status}`;
	            badge.innerText = status.toUpperCase();
	            text.innerText = message || "Processing...";

	            if (status === 'processing' || status === 'queued') {
	                spinner.style.display = 'block';
	            } else {
	                spinner.style.display = 'none';
	            }
	        }

	        function showResults(result) {
	            const box = document.getElementById('resultBox');
	            const segments = result.multi_face_segments || [];

	            let segmentsHtml = '';
	            if (segments.length > 0) {
	                segmentsHtml = `
	                    <div class="segments-info">
	                        <strong>🎭 Multi-face segments found:</strong><br>
	                        ${segments.map((s, i) => `Segment ${i+1}: ${s.start}s - ${s.end}s`).join('<br>')}
	                    </div>
	                `;
	            } else {
	                segmentsHtml = `<div class="segments-info">No multi-face segments detected (single speaker throughout)</div>`;
	            }

	            box.innerHTML = `
	                <div class="result-box">
	                    <div style="margin-bottom: 0.5rem; color: #10b981; font-weight: 600;">✅ Video Ready!</div>
	                    <div class="result-url">${result.video_url}</div>
	                    <button class="copy-btn" onclick="navigator.clipboard.writeText('${result.video_url}').then(() => this.innerText = 'Copied!')">
	                        📋 Copy URL
	                    </button>
	                    ${segmentsHtml}
	                    <div class="segments-info">
	                        <strong>📊 Stats:</strong> ${result.total_frames_analyzed} frames analyzed
	                    </div>
	                    <video controls src="${result.video_url}"></video>
	                </div>
	            `;
	        }
	    </script>
	</body>
	</html>
	"""
	    return html_content