magic_cut

Sleeping

App Files Files Community

ADXabhi committed on Feb 16

Commit

a3655da

verified ·

1 Parent(s): 623f8c3

Upload 5 files

Browse files

Files changed (2) hide show

app.py +75 -244
client.html +6 -16

app.py CHANGED Viewed

@@ -71,269 +71,119 @@ def parse_cloudinary_url(url: str) -> dict:
 def get_face_info_url(video_id: str, time_sec: float) -> str:
     """
     Build URL to fetch face data for a specific frame.
-    Uses g_faces (plural) to trigger detection of ALL faces.
-    fl_getinfo returns face coordinates relative to the INPUT image,
-    so we get positions in original frame space.
     """
-    return f"{CLOUDINARY_BASE}/so_{time_sec},f_jpg/c_thumb,g_faces,w_450/fl_getinfo/{video_id}.jpg"
 async def fetch_face_data(client: httpx.AsyncClient, video_id: str, time_sec: float) -> dict:
     """
     Fetch face detection data for a specific timestamp.
-    Returns face count, bounding boxes [x, y, w, h], and image dimensions.
     """
     url = get_face_info_url(video_id, time_sec)
     try:
         response = await client.get(url, timeout=10.0)
         if response.status_code == 200:
             data = response.json()
-            # Get image dimensions from the input info (original frame before cropping)
-            input_info = data.get("input", {})
-            img_width = input_info.get("width", 1920)
-            img_height = input_info.get("height", 1080)
-            # Cloudinary returns face data in 'landmarks' with g_face/g_faces
-            # landmarks[0] is the array of face coordinate arrays
-            # Each face is [x, y, width, height] relative to the INPUT image
-            faces_raw = []
             landmarks = data.get("landmarks", [[]])
-            if landmarks and landmarks[0]:
-                faces_raw = landmarks[0]
-            # Also check 'faces' key as fallback
-            if not faces_raw:
-                faces_raw = data.get("faces", [])
-            face_count = len(faces_raw) if faces_raw else 0
-            # Debug: log first few frames
-            if time_sec <= 2.0 or face_count >= 2:
-                print(f"  [face_data] t={time_sec}s: {face_count} faces, img={img_width}x{img_height}, raw={faces_raw[:3] if faces_raw else '[]'}")
-            # Parse face bounding boxes
-            faces = []
-            for face in faces_raw:
-                if isinstance(face, (list, tuple)) and len(face) >= 4:
-                    faces.append({
-                        "x": face[0],
-                        "y": face[1],
-                        "w": face[2],
-                        "h": face[3],
-                        "center_x": face[0] + face[2] / 2
-                    })
-                elif isinstance(face, dict):
-                    fx = face.get("x", 0)
-                    fy = face.get("y", 0)
-                    fw = face.get("width", face.get("w", 0))
-                    fh = face.get("height", face.get("h", 0))
-                    faces.append({
-                        "x": fx, "y": fy, "w": fw, "h": fh,
-                        "center_x": fx + fw / 2
-                    })
             return {
                 "time": time_sec,
                 "face_count": face_count,
-                "faces": faces,
-                "img_width": img_width,
-                "img_height": img_height
             }
     except Exception as e:
         print(f"Error fetching face data at {time_sec}s: {e}")
-    return {"time": time_sec, "face_count": 0, "faces": [], "img_width": 1920, "img_height": 1080}
-# ------------------------------------------
-# LAYOUT MODES
-# ------------------------------------------
-LAYOUT_LETTERBOX = "LETTERBOX"        # 0 faces - full frame with blurred bars
-LAYOUT_SINGLE_TRACK = "SINGLE_TRACK"  # 1 face - track and crop on face
-LAYOUT_SPLIT_SCREEN = "SPLIT_SCREEN"  # 2 faces far apart - top/bottom split
-LAYOUT_DUAL_TRACK = "DUAL_TRACK"      # 2 faces close together - crop around both
-LAYOUT_GROUP_SHOT = "GROUP_SHOT"      # 3+ faces - crop to fit group
-# Threshold: if two faces' centers are more than 40% of frame width apart, split screen
-FACE_DISTANCE_THRESHOLD = 0.40
-# Minimum segment duration in seconds (segments shorter than this get merged)
-MIN_SEGMENT_DURATION = 1.5
-def classify_frame(frame: dict) -> str:
-    """
-    Classify a frame into one of the 5 layout modes based on face data.
-    """
-    face_count = frame["face_count"]
-    faces = frame["faces"]
-    img_width = frame["img_width"]
-    if face_count == 0:
-        return LAYOUT_LETTERBOX
-    if face_count == 1:
-        return LAYOUT_SINGLE_TRACK
-    if face_count == 2:
-        # Check distance between the two faces
-        if len(faces) >= 2 and img_width > 0:
-            distance = abs(faces[0]["center_x"] - faces[1]["center_x"])
-            relative_distance = distance / img_width
-            if relative_distance > FACE_DISTANCE_THRESHOLD:
-                return LAYOUT_SPLIT_SCREEN
-            else:
-                return LAYOUT_DUAL_TRACK
-        # Fallback if face position data is missing
-        return LAYOUT_SPLIT_SCREEN
-    # 3+ faces
-    return LAYOUT_GROUP_SHOT
-def build_layout_segments(frame_data: List[dict]) -> List[dict]:
     """
-    Build layout segments from classified frame data.
-    1. Classify each frame
-    2. Group consecutive frames with same mode into segments
-    3. Smooth: merge segments shorter than MIN_SEGMENT_DURATION into neighbors
     """
-    if not frame_data:
-        return []
-    # Step 1: Classify all frames
     for frame in frame_data:
-        frame["layout"] = classify_frame(frame)
-    # Step 2: Group consecutive frames with same layout into segments
-    raw_segments = []
-    current_mode = frame_data[0]["layout"]
-    segment_start = frame_data[0]["time"]
-    for i in range(1, len(frame_data)):
-        if frame_data[i]["layout"] != current_mode:
-            raw_segments.append({
-                "start": segment_start,
-                "end": frame_data[i]["time"],
-                "mode": current_mode
-            })
-            current_mode = frame_data[i]["layout"]
-            segment_start = frame_data[i]["time"]
-    # Close final segment
-    raw_segments.append({
-        "start": segment_start,
-        "end": frame_data[-1]["time"],
-        "mode": current_mode
-    })
-    # Step 3: Smooth - merge short segments into their neighbors
-    if len(raw_segments) <= 1:
-        return raw_segments
-    smoothed = [raw_segments[0]]
-    for seg in raw_segments[1:]:
-        seg_duration = seg["end"] - seg["start"]
-        if seg_duration < MIN_SEGMENT_DURATION:
-            # Merge into previous segment (extend previous)
-            smoothed[-1]["end"] = seg["end"]
         else:
-            # Check if previous segment is too short, merge it into this one
-            prev_duration = smoothed[-1]["end"] - smoothed[-1]["start"]
-            if prev_duration < MIN_SEGMENT_DURATION and len(smoothed) > 1:
-                # Extend the one before that
-                smoothed[-2]["end"] = smoothed[-1]["end"]
-                smoothed[-1] = seg
-            else:
-                smoothed.append(seg)
-    # Final pass: merge any remaining tiny segments
-    final = [smoothed[0]]
-    for seg in smoothed[1:]:
-        if seg["end"] - seg["start"] < MIN_SEGMENT_DURATION:
-            final[-1]["end"] = seg["end"]
-        else:
-            final.append(seg)
-    return final
-def build_final_url(video_id: str, start_time: float, end_time: float, segments: List[dict]) -> str:
     """
-    Build the final Cloudinary URL with timed overlay layers for each layout segment.
-    Base: Full 9:16 video with g_auto:face (handles SINGLE_TRACK natively)
-    Overlay layers are added for other modes:
-    - LETTERBOX:     Full-cover layer with c_pad,b_blurred:400:15
-    - SPLIT_SCREEN:  Two layers (g_west top, g_east bottom)
-    - DUAL_TRACK:    Full-cover layer with g_auto:faces
-    - GROUP_SHOT:    Full-cover layer with g_auto:faces
     Important:
-    - Layers shorter than 1 second are skipped
     - eo_X in fl_layer_apply makes layers DISAPPEAR completely (not freeze)
-    - SINGLE_TRACK segments need no layers (handled by base)
     """
-    # Base transformation: 9:16 vertical with single-face tracking
     base = f"so_{start_time},eo_{end_time}/w_1080,h_1920,c_fill,g_auto:face"
-    # Build layers for each non-SINGLE_TRACK segment
     layers = []
-    for segment in segments:
         seg_start = segment["start"]
         seg_end = segment["end"]
         seg_duration = seg_end - seg_start
-        mode = segment["mode"]
-        # SINGLE_TRACK is handled by the base transformation - no layer needed
-        if mode == LAYOUT_SINGLE_TRACK:
-            continue
         # Skip segments shorter than 1 second
         if seg_duration < 1:
             continue
         # Calculate offsets in OUTPUT video timeline
-        layer_start_offset = seg_start - start_time
-        layer_end_offset = seg_end - start_time
-        if mode == LAYOUT_LETTERBOX:
-            # Full-cover letterbox: pad to 9:16 with blurred background
-            letterbox_layer = (
-                f"l_video:{video_id},"
-                f"so_{seg_start},eo_{seg_end},du_{seg_duration},"
-                f"w_1080,h_1920,c_pad,b_blurred:400:15,ac_none/"
-                f"fl_layer_apply,g_center,so_{layer_start_offset},eo_{layer_end_offset}"
-            )
-            layers.append(letterbox_layer)
-        elif mode == LAYOUT_SPLIT_SCREEN:
-            # Top layer - left side of original (g_west)
-            top_layer = (
-                f"l_video:{video_id},"
-                f"so_{seg_start},eo_{seg_end},du_{seg_duration},"
-                f"w_1080,h_960,c_fill,g_west,ac_none/"
-                f"fl_layer_apply,g_north,so_{layer_start_offset},eo_{layer_end_offset}"
-            )
-            # Bottom layer - right side of original (g_east)
-            bottom_layer = (
-                f"l_video:{video_id},"
-                f"so_{seg_start},eo_{seg_end},du_{seg_duration},"
-                f"w_1080,h_960,c_fill,g_east,ac_none/"
-                f"fl_layer_apply,g_south,so_{layer_start_offset},eo_{layer_end_offset}"
-            )
-            layers.append(top_layer)
-            layers.append(bottom_layer)
-        elif mode in (LAYOUT_DUAL_TRACK, LAYOUT_GROUP_SHOT):
-            # Full-cover layer with multi-face tracking (g_auto:faces)
-            faces_layer = (
-                f"l_video:{video_id},"
-                f"so_{seg_start},eo_{seg_end},du_{seg_duration},"
-                f"w_1080,h_1920,c_fill,g_auto:faces,ac_none/"
-                f"fl_layer_apply,g_center,so_{layer_start_offset},eo_{layer_end_offset}"
-            )
-            layers.append(faces_layer)
     # Combine all parts
     if layers:
@@ -359,9 +209,8 @@ async def process_video_async(job_id: str, video_url: str):
     Main video processing logic:
     1. Parse URL to get video_id and time range
     2. Fetch face data for each frame (500ms intervals)
-    3. Classify frames into layout modes
-    4. Build layout segments with smoothing
-    5. Build final URL with timed layers
     """
     print(f"[{job_id}] Starting job: {video_url}")
     JOBS[job_id]["status"] = "processing"
@@ -404,22 +253,14 @@ async def process_video_async(job_id: str, video_url: str):
                 progress_pct = min(100, int((i + batch_size) / total_frames * 100))
                 JOBS[job_id]["progress"] = f"Analyzing frames... {progress_pct}%"
-        # 3. Classify frames & build layout segments with smoothing
-        JOBS[job_id]["progress"] = "Classifying layout segments..."
-        layout_segments = build_layout_segments(frame_data)
-        # Log segment breakdown
-        mode_counts = {}
-        for seg in layout_segments:
-            mode = seg["mode"]
-            mode_counts[mode] = mode_counts.get(mode, 0) + 1
-        print(f"[{job_id}] Layout segments: {mode_counts}")
-        for seg in layout_segments:
-            print(f"  {seg['mode']}: {seg['start']}s - {seg['end']}s ({seg['end'] - seg['start']:.1f}s)")
         # 4. Build final URL
         JOBS[job_id]["progress"] = "Building final video URL..."
-        final_url = build_final_url(video_id, start_time, end_time, layout_segments)
         # 5. Complete
         JOBS[job_id]["status"] = "completed"
@@ -429,7 +270,7 @@ async def process_video_async(job_id: str, video_url: str):
             "video_id": video_id,
             "start_time": start_time,
             "end_time": end_time,
-            "layout_segments": layout_segments,
             "total_frames_analyzed": total_frames
         }
         print(f"[{job_id}] Completed: {final_url}")
@@ -670,7 +511,7 @@ def serve_client():
             <strong>How it works:</strong><br>
             1. Paste your Cloudinary video URL with <code>so_X,du_Y</code> (start time, duration)<br>
             2. We analyze each frame for faces (every 500ms)<br>
-            3. Smart layout: 👤 Face Track · 🎭 Split Screen · 👥 Dual Track · 👨‍👩‍👧 Group · 📺 Letterbox<br>
             4. Get your final 9:16 video URL!
         </div>
@@ -781,28 +622,18 @@ def serve_client():
         function showResults(result) {
             const box = document.getElementById('resultBox');
-            const segments = result.layout_segments || [];
-            const modeIcons = {
-                'SINGLE_TRACK': '👤 Face Track',
-                'SPLIT_SCREEN': '🎭 Split Screen',
-                'DUAL_TRACK': '👥 Dual Track',
-                'GROUP_SHOT': '👨‍👩‍👧 Group Shot',
-                'LETTERBOX': '📺 Letterbox'
-            };
             let segmentsHtml = '';
             if (segments.length > 0) {
                 segmentsHtml = `
                     <div class="segments-info">
-                        <strong>🎬 Layout segments:</strong><br>
-                        ${segments.map((s, i) => {
-                            const icon = modeIcons[s.mode] || s.mode;
-                            const dur = (s.end - s.start).toFixed(1);
-                            return `${icon}: ${s.start}s - ${s.end}s (${dur}s)`;
-                        }).join('<br>')}
                     </div>
                 `;
             }
             box.innerHTML = `

 def get_face_info_url(video_id: str, time_sec: float) -> str:
     """
     Build URL to fetch face data for a specific frame.
+    Returns JSON with landmarks when fetched.
+
+    Args:
+        video_id: Identifier interpolated as the last path component of the
+            URL, followed by a ``.jpg`` suffix.
+        time_sec: Timestamp in seconds, interpolated directly after ``so_``.
+
+    Returns:
+        The URL string, rooted at the module-level ``CLOUDINARY_BASE``.
     """
+    # NOTE(review): this requests g_face (singular). The segment detection
+    # that consumes these results only triggers on a face count of 2 or more,
+    # so confirm that g_face still reports every detected face in the
+    # "landmarks" payload rather than just one.
+    return f"{CLOUDINARY_BASE}/so_{time_sec},f_jpg/c_thumb,g_face,w_450/fl_getinfo/{video_id}.jpg"
 async def fetch_face_data(client: httpx.AsyncClient, video_id: str, time_sec: float) -> dict:
     """
     Fetch face detection data for a specific timestamp.
+    Returns the number of faces and their positions.
+
+    Args:
+        client: Async HTTP client used to issue the GET request.
+        video_id: Video identifier passed through to get_face_info_url().
+        time_sec: Timestamp in seconds of the frame to inspect.
+
+    Returns:
+        A dict with keys "time", "face_count" and "landmarks". When the
+        response status is not 200, or any exception is raised while
+        fetching or parsing, "face_count" is 0 and "landmarks" is [].
     """
     url = get_face_info_url(video_id, time_sec)
     try:
         response = await client.get(url, timeout=10.0)
         if response.status_code == 200:
             data = response.json()
+            # A missing "landmarks" key defaults to [[]]: one empty face list.
             landmarks = data.get("landmarks", [[]])
+            # landmarks[0] is array of face objects
+            face_count = len(landmarks[0]) if landmarks and landmarks[0] else 0
             return {
                 "time": time_sec,
                 "face_count": face_count,
+                "landmarks": landmarks[0] if landmarks else []
             }
+    # Best-effort: any failure is logged and then reported as "no faces"
+    # instead of propagating to the caller.
     except Exception as e:
         print(f"Error fetching face data at {time_sec}s: {e}")
+    # Reached on a non-200 status or after a logged exception.
+    return {"time": time_sec, "face_count": 0, "landmarks": []}
+def find_multi_face_segments(frame_data: List[dict]) -> List[dict]:
     """
+    Analyze frame data to find segments where 2+ faces are detected.
+    Returns list of segments with start/end times.
+
+    Args:
+        frame_data: Per-frame dicts, each read for its "face_count" and
+            "time" keys. Frames are scanned in the order given
+            (presumably chronological -- confirm against the caller).
+
+    Returns:
+        A list of {"start": ..., "end": ...} dicts, one per consecutive run
+        of frames with face_count >= 2. "start" is the time of the first
+        frame in the run. "end" is the time of the first following frame
+        with fewer than 2 faces, or the last frame's time when the run is
+        still open at the end of the data.
     """
+    segments = []
+    in_multi_face = False
+    segment_start = None
     for frame in frame_data:
+        if frame["face_count"] >= 2:
+            if not in_multi_face:
+                # Start new segment
+                in_multi_face = True
+                segment_start = frame["time"]
         else:
+            if in_multi_face:
+                # End segment
+                in_multi_face = False
+                segments.append({
+                    "start": segment_start,
+                    "end": frame["time"]
+                })
+    # Close any open segment
+    # A run that consists only of the final frame yields start == end here.
+    if in_multi_face and segment_start is not None:
+        segments.append({
+            "start": segment_start,
+            "end": frame_data[-1]["time"] if frame_data else segment_start
+        })
+    return segments
+def build_final_url(video_id: str, start_time: float, end_time: float, multi_face_segments: List[dict]) -> str:
     """
+    Build the final Cloudinary URL with layers for multi-face segments.
+    Base: Full 9:16 video with g_auto:face
+    Layers: Split-screen overlays during multi-face segments
     Important:
+    - Layers shorter than 1 second are ignored
     - eo_X in fl_layer_apply makes layers DISAPPEAR completely (not freeze)
     """
+    duration = end_time - start_time
+    # Base transformation: 9:16 vertical with face tracking
     base = f"so_{start_time},eo_{end_time}/w_1080,h_1920,c_fill,g_auto:face"
+    # Build layers for each multi-face segment
     layers = []
+    for segment in multi_face_segments:
         seg_start = segment["start"]
         seg_end = segment["end"]
         seg_duration = seg_end - seg_start
         # Skip segments shorter than 1 second
         if seg_duration < 1:
             continue
         # Calculate offsets in OUTPUT video timeline
+        layer_start_offset = seg_start - start_time  # When layer appears (so_X in fl_layer_apply)
+        layer_end_offset = seg_end - start_time      # When layer disappears (eo_X in fl_layer_apply)
+        # Top layer - left side of original (g_west)
+        # du_X sets the layer video duration, eo_X in fl_layer_apply makes it VANISH at that time
+        top_layer = (
+            f"l_video:{video_id},"
+            f"so_{seg_start},eo_{seg_end},du_{seg_duration},"
+            f"w_1080,h_960,c_fill,g_west,ac_none/"
+            f"fl_layer_apply,g_north,so_{layer_start_offset},eo_{layer_end_offset}"
+        )
+        # Bottom layer - right side of original (g_east)
+        bottom_layer = (
+            f"l_video:{video_id},"
+            f"so_{seg_start},eo_{seg_end},du_{seg_duration},"
+            f"w_1080,h_960,c_fill,g_east,ac_none/"
+            f"fl_layer_apply,g_south,so_{layer_start_offset},eo_{layer_end_offset}"
+        )
+        layers.append(top_layer)
+        layers.append(bottom_layer)
     # Combine all parts
     if layers:
     Main video processing logic:
     1. Parse URL to get video_id and time range
     2. Fetch face data for each frame (500ms intervals)
+    3. Find multi-face segments
+    4. Build final URL with layers
     """
     print(f"[{job_id}] Starting job: {video_url}")
     JOBS[job_id]["status"] = "processing"
                 progress_pct = min(100, int((i + batch_size) / total_frames * 100))
                 JOBS[job_id]["progress"] = f"Analyzing frames... {progress_pct}%"
+        # 3. Find multi-face segments
+        JOBS[job_id]["progress"] = "Detecting multi-face segments..."
+        multi_face_segments = find_multi_face_segments(frame_data)
+        print(f"[{job_id}] Found {len(multi_face_segments)} multi-face segments")
         # 4. Build final URL
         JOBS[job_id]["progress"] = "Building final video URL..."
+        final_url = build_final_url(video_id, start_time, end_time, multi_face_segments)
         # 5. Complete
         JOBS[job_id]["status"] = "completed"
             "video_id": video_id,
             "start_time": start_time,
             "end_time": end_time,
+            "multi_face_segments": multi_face_segments,
             "total_frames_analyzed": total_frames
         }
         print(f"[{job_id}] Completed: {final_url}")
             <strong>How it works:</strong><br>
             1. Paste your Cloudinary video URL with <code>so_X,du_Y</code> (start time, duration)<br>
             2. We analyze each frame for faces (every 500ms)<br>
+            3. When 2+ faces detected → split-screen layout<br>
             4. Get your final 9:16 video URL!
         </div>
         function showResults(result) {
             const box = document.getElementById('resultBox');
+            const segments = result.multi_face_segments || [];
             let segmentsHtml = '';
             if (segments.length > 0) {
                 segmentsHtml = `
                     <div class="segments-info">
+                        <strong>🎭 Multi-face segments found:</strong><br>
+                        ${segments.map((s, i) => `Segment ${i+1}: ${s.start}s - ${s.end}s`).join('<br>')}
                     </div>
                 `;
+            } else {
+                segmentsHtml = `<div class="segments-info">No multi-face segments detected (single speaker throughout)</div>`;
             }
             box.innerHTML = `

client.html CHANGED Viewed

@@ -237,7 +237,7 @@
             <strong>How it works:</strong><br>
             1. Paste your Cloudinary video URL with <code>so_X,du_Y</code> (start time, duration)<br>
             2. We analyze each frame for faces (every 500ms)<br>
-            3. Smart layout: 👤 Face Track · 🎭 Split Screen · 👥 Dual Track · 👨‍👩‍👧 Group · 📺 Letterbox<br>
             4. Get your final 9:16 video URL!
         </div>
@@ -359,28 +359,18 @@
         function showResults(result) {
             const box = document.getElementById('resultBox');
-            const segments = result.layout_segments || [];
-            const modeIcons = {
-                'SINGLE_TRACK': '👤 Face Track',
-                'SPLIT_SCREEN': '🎭 Split Screen',
-                'DUAL_TRACK': '👥 Dual Track',
-                'GROUP_SHOT': '👨‍👩‍👧 Group Shot',
-                'LETTERBOX': '📺 Letterbox'
-            };
             let segmentsHtml = '';
             if (segments.length > 0) {
                 segmentsHtml = `
                     <div class="segments-info">
-                        <strong>🎬 Layout segments:</strong><br>
-                        ${segments.map((s, i) => {
-                    const icon = modeIcons[s.mode] || s.mode;
-                    const dur = (s.end - s.start).toFixed(1);
-                    return `${icon}: ${s.start}s - ${s.end}s (${dur}s)`;
-                }).join('<br>')}
                     </div>
                 `;
             }
             box.innerHTML = `

             <strong>How it works:</strong><br>
             1. Paste your Cloudinary video URL with <code>so_X,du_Y</code> (start time, duration)<br>
             2. We analyze each frame for faces (every 500ms)<br>
+            3. When 2+ faces detected → split-screen layout<br>
             4. Get your final 9:16 video URL!
         </div>
         function showResults(result) {
             const box = document.getElementById('resultBox');
+            const segments = result.multi_face_segments || [];
             let segmentsHtml = '';
             if (segments.length > 0) {
                 segmentsHtml = `
                     <div class="segments-info">
+                        <strong>🎭 Multi-face segments found:</strong><br>
+                        ${segments.map((s, i) => `Segment ${i + 1}: ${s.start}s - ${s.end}s`).join('<br>')}
                     </div>
                 `;
+            } else {
+                segmentsHtml = `<div class="segments-info">No multi-face segments detected (single speaker throughout)</div>`;
             }
             box.innerHTML = `