Spaces:

lablab-ai-amd-developer-hackathon
/

ElevenClip-AI

Runtime error

App Files Community

JakgritB committed on May 9

Commit

102f4d2

1 Parent(s): 89e1dc4

Deploy safe hackathon demo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env.example +0 -18
.github/workflows/sync-to-hf.yml +0 -22
.gitignore +28 -16
Dockerfile +53 -4
LICENSE +1 -1
README.md +190 -433
backend/Dockerfile +0 -17
backend/app/__init__.py +0 -1
backend/app/core/__init__.py +0 -1
backend/app/core/config.py +0 -68
backend/app/core/timing.py +0 -20
backend/app/main.py +0 -240
backend/app/models/__init__.py +0 -1
backend/app/models/schemas.py +0 -127
backend/app/services/__init__.py +0 -1
backend/app/services/clips.py +0 -219
backend/app/services/highlight.py +0 -434
backend/app/services/multimodal.py +0 -200
backend/app/services/pipeline.py +0 -236
backend/app/services/subtitles.py +0 -151
backend/app/services/transcription.py +0 -366
backend/app/services/video_input.py +0 -80
backend/app/storage.py +0 -58
backend/app/utils/__init__.py +0 -1
backend/app/utils/rocm.py +0 -33
backend/app/workers/__init__.py +0 -1
backend/app/workers/celery_app.py +0 -15
backend/main.py +466 -0
backend/pyproject.toml +0 -44
backend/requirements.txt +37 -0
backend/src/__init__.py +0 -0
backend/src/analysis/__init__.py +0 -0
backend/src/analysis/highlight_scorer.py +166 -0
backend/src/analysis/scene_detector.py +111 -0
backend/src/analysis/vision.py +305 -0
backend/src/gpu/__init__.py +0 -0
backend/src/gpu/rocm_utils.py +92 -0
backend/src/gpu/vllm_manager.py +208 -0
backend/src/ingestion/__init__.py +0 -0
backend/src/ingestion/uploader.py +34 -0
backend/src/ingestion/youtube.py +147 -0
backend/src/processing/__init__.py +0 -0
backend/src/processing/clip_extractor.py +131 -0
backend/src/processing/emoji_overlay.py +36 -0
backend/src/processing/high_retention.py +491 -0
backend/src/processing/subtitle.py +291 -0
backend/src/transcription/__init__.py +0 -0
backend/src/transcription/whisper.py +234 -0
deploy/setup_droplet.sh +87 -0
deploy/start_fastapi.sh +10 -0

.env.example DELETED Viewed

@@ -1,18 +0,0 @@
-DEMO_MODE=true
-STORAGE_DIR=backend/data
-FRONTEND_ORIGIN=http://localhost:5173
-WHISPER_MODEL_ID=openai/whisper-large-v3
-QWEN_TEXT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
-QWEN_VL_MODEL_ID=Qwen/Qwen2-VL-7B-Instruct
-HF_TOKEN=
-TARGET_CLIP_COUNT=5
-MAX_CLIPS=10
-FFMPEG_BINARY=ffmpeg
-FFPROBE_BINARY=ffprobe
-FFMPEG_VIDEO_CODEC=h264_amf
-FFMPEG_CPU_CODEC=libx264
-REDIS_URL=redis://redis:6379/0
-CELERY_ENABLED=false

.github/workflows/sync-to-hf.yml DELETED Viewed

@@ -1,22 +0,0 @@
-name: Sync to Hugging Face Space
-on:
-  push:
-    branches:
-      - main
-jobs:
-  sync:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - name: Push to Hugging Face Space
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          git remote add space https://JakgritB:$HF_TOKEN@huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI
-          git push space main --force

.gitignore CHANGED Viewed

@@ -1,25 +1,37 @@
-.env
-.hf-home/
-.venv/
-.python_packages/
 __pycache__/
-*.pyc
 *.egg-info/
-.pytest_cache/
-.mypy_cache/
-.ruff_cache/
-node_modules/
 dist/
-.vite/
-data/
-backend/data/
-tmp/
-pip-tmp/
 *.log
-hf-space-live/
 .DS_Store
 Thumbs.db

+# Python
 __pycache__/
+*.py[cod]
 *.egg-info/
+.venv/
+venv/
 dist/
+build/
+# Node
+frontend/node_modules/
+frontend/.next/
+frontend/out/
+# Temp files
+graphify-out/
+/tmp/
+*.wav
+*.mp4
+*.ass
 *.log
+# Env
+.env
+.env.local
+.env.*.local
+# SSH keys (never commit)
+*.pem
+*_key
+*_key.pub
+id_rsa*
+id_ed25519*
+# OS
 .DS_Store
 Thumbs.db

Dockerfile CHANGED Viewed

@@ -1,9 +1,58 @@
-FROM python:3.11-slim
 WORKDIR /app
-RUN pip install --no-cache-dir fastapi uvicorn
-COPY landing.py ./landing.py
 EXPOSE 7860
-CMD ["uvicorn", "landing:app", "--host", "0.0.0.0", "--port", "7860"]

+# ElevenClip AI — HuggingFace Spaces (AMD ROCm)
+FROM rocm/pytorch:rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
 WORKDIR /app
+# System dependencies
+RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    nginx \
+    curl \
+    git \
+    nodejs \
+    npm \
+    && rm -rf /var/lib/apt/lists/*
+# ─── Backend Python dependencies ───────────────────────────────────────────
+COPY backend/requirements.txt /app/backend/requirements.txt
+RUN pip install --no-cache-dir -r /app/backend/requirements.txt
+# vLLM with ROCm support
+RUN pip install --no-cache-dir \
+    "vllm>=0.6.0" \
+    --extra-index-url https://download.pytorch.org/whl/rocm6.2
+COPY backend/ /app/backend/
+# ─── Frontend (Next.js standalone build) ──────────────────────────────────
+COPY frontend/package*.json /app/frontend/
+RUN cd /app/frontend && npm ci --production=false
+COPY frontend/ /app/frontend/
+# Relative API URL — nginx proxies /api/* and /ws/* to FastAPI :8080
+ENV NEXT_PUBLIC_API_URL=""
+ENV NEXT_PUBLIC_DEMO_ENABLED="true"
+ENV NEXT_PUBLIC_DEMO_ONLY="true"
+RUN cd /app/frontend && npm run build
+# ─── nginx config ──────────────────────────────────────────────────────────
+COPY nginx.conf /app/nginx.conf
+# ─── Runtime directories ───────────────────────────────────────────────────
+RUN mkdir -p /tmp/elevnclip /root/.cache/huggingface /root/ElevenClip-AI/demo_videos
+# ─── Startup ──────────────────────────────────────────────────────────────
+COPY start.sh /app/start.sh
+RUN chmod +x /app/start.sh
 EXPOSE 7860
+# vLLM managed on-demand by vllm_manager.py (not started at container startup)
+ENV VLLM_ON_DEMAND="true"
+ENV VLLM_PORT="8000"
+ENV VLLM_IDLE_TIMEOUT="300"
+ENV VLLM_DOCKER_CONTAINER=""
+CMD ["/app/start.sh"]

LICENSE CHANGED Viewed

@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2026 ElevenClip.AI contributors
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

 MIT License
+Copyright (c) 2026 ElevenClip AI
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

README.md CHANGED Viewed

@@ -1,498 +1,255 @@
----
-title: ElevenClip AI
-emoji: 🎬
-colorFrom: purple
-colorTo: red
-sdk: docker
-pinned: false
----
-# ElevenClip.AI
-ElevenClip.AI is an AI-powered clip studio for turning long-form videos into personalized short-form content for TikTok, YouTube Shorts, and Instagram Reels.
-This project is built for the **AMD Developer Hackathon** on lablab.ai, targeting **Track 3: Vision & Multimodal AI**. The system is designed to run on **AMD Developer Cloud** with **ROCm** and **AMD Instinct MI300X** acceleration, while using **Hugging Face** as the model hub/deployment layer and **Qwen** models for profile-aware highlight reasoning.
-## One-Sentence Pitch
-ElevenClip.AI helps creators convert long videos into ready-to-edit short clips by combining Whisper transcription, Qwen highlight detection, optional Qwen-VL visual understanding, ffmpeg rendering, and a human-in-the-loop clip editor.
-## Problem
-Long-form creators, podcasters, educators, streamers, and marketing teams often publish hours of video but still need short clips for modern discovery platforms.
-The manual workflow is painful:
-- Watch the full video.
-- Find high-retention moments.
-- Trim each clip.
-- Rewrite subtitles.
-- Reframe to vertical 9:16.
-- Export platform-ready MP4 files.
-For a two-hour video, this can take several hours of editing time. The bottleneck is not just cutting video; it is understanding which moments match the creator's audience, channel style, language, and target platform.
-## Solution
-ElevenClip.AI automates the first pass of short-form production:
-1. The creator sets up a reusable channel profile.
-2. The creator provides a YouTube URL or uploads a video file.
-3. Whisper Large V3 transcribes the video, including Thai and multilingual speech.
-4. Qwen2.5 analyzes the transcript and scores candidate highlights based on engagement potential and the creator profile.
-5. Optional Qwen2-VL analysis can enrich the scores with visual signals such as reactions, scene changes, and on-screen text.
-6. ffmpeg renders vertical clips with subtitle files and burn-in support.
-7. The React editor lets the human approve, delete, trim, regenerate, and edit subtitles before download.
-The product is intentionally human-AI collaborative: AI finds and prepares the clips quickly, while the creator keeps editorial control.
-## Hackathon Alignment
-### Track
-**Track 3: Vision & Multimodal AI**
-ElevenClip.AI processes multiple media types:
-- Audio: speech transcription with Whisper Large V3.
-- Text: transcript reasoning and highlight ranking with Qwen2.5.
-- Video: frame-aware multimodal analysis with Qwen2-VL as the next pipeline stage.
-- Rendered media: ffmpeg exports platform-ready video clips.
-### AMD Technology
-The production target is AMD Developer Cloud:
-- **AMD Instinct MI300X** for high-throughput model inference.
-- **ROCm 6.x** as the GPU software stack.
-- **PyTorch with ROCm support** for Whisper inference.
-- **vLLM ROCm backend** for fast Qwen2.5 inference.
-- **Optimum-AMD** as an optimization path for Hugging Face models on AMD hardware.
-- **ffmpeg hardware acceleration hooks** for faster video encoding where available.
-The app has a local `DEMO_MODE=true` path so judges and teammates can inspect the UI/API without downloading large models. On AMD Developer Cloud, set `DEMO_MODE=false` to activate the real model stack.
-### Hugging Face Integration
-Hugging Face is used as the model hub and deployment layer:
-- `openai/whisper-large-v3` for transcription.
-- `Qwen/Qwen2.5-7B-Instruct` for highlight analysis.
-- `Qwen/Qwen2-VL-7B-Instruct` for multimodal video understanding.
-- Public Hugging Face Space for the hackathon demo page:
-  `https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI`
-### Qwen Integration
-Qwen is not used as a generic chatbot. It is part of the core product logic:
-- Reads timestamped transcript segments.
-- Considers creator profile settings.
-- Scores engagement potential.
-- Explains why a segment should become a clip.
-- Returns structured JSON with timestamps, titles, scores, reasons, and subtitle text.
-## Current MVP Features
-- Channel profile onboarding:
-  - niche
-  - preferred clip style
-  - preferred clip length
-  - primary language
-  - target platform
-- YouTube URL ingestion through `yt-dlp`.
-- Direct video upload endpoint.
-- Whisper transcription service boundary.
-- Qwen highlight detection service boundary.
-- Optional Qwen2-VL multimodal analysis service boundary.
-- ffmpeg clip generation with subtitle file creation.
-- Vertical 9:16 export path for TikTok, Shorts, and Reels.
-- Human-AI review UI:
-  - trim start/end
-  - edit subtitles inline
-  - approve clips
-  - delete clips
-  - regenerate a clip
-  - download MP4 output
-- Timing logs for benchmark demos.
-- Docker and AMD Cloud deployment notes.
-## Architecture
-```mermaid
-flowchart LR
-  A["Creator Profile"] --> D["Qwen2.5 Highlight Scoring"]
-  B["YouTube URL"] --> C["yt-dlp / Video Input"]
-  B2["Uploaded Video"] --> C
-  C --> W["Whisper Large V3 Transcription"]
-  W --> D
-  C --> V["Qwen2-VL Visual Analysis (Optional)"]
-  D --> R["Clip Plan JSON"]
-  V --> R
-  R --> F["ffmpeg Clip Rendering + Subtitles"]
-  F --> E["React Human-AI Editor"]
-  E --> O["Approved Short-Form Clips"]
 ```
-## Repository Structure
-```text
-.
-├── backend/
-│   ├── app/
-│   │   ├── core/          # configuration and timing instrumentation
-│   │   ├── models/        # Pydantic request/response schemas
-│   │   ├── services/      # ingest, transcription, Qwen scoring, subtitles, rendering
-│   │   ├── utils/         # ROCm / accelerator detection
-│   │   ├── workers/       # optional Celery wiring
-│   │   ├── main.py        # FastAPI application
-│   │   └── storage.py     # file-backed job storage for MVP
-│   ├── Dockerfile
-│   └── pyproject.toml
-├── frontend/
-│   ├── src/
-│   │   ├── App.jsx        # creator workflow and clip editor
-│   │   ├── main.jsx
-│   │   └── styles.css
-│   ├── Dockerfile
-│   └── package.json
-├── infra/
-│   └── amd-cloud.md       # AMD Developer Cloud deployment guide
-├── scripts/
-│   └── benchmark.py       # end-to-end API benchmark helper
-├── docker-compose.yml
-└── README.md
 ```
-## Processing Pipeline
-### 1. Video Input
-The backend accepts:
-- YouTube URL through `POST /api/jobs/youtube`
-- Uploaded video file through `POST /api/jobs/upload`
-In production, YouTube videos are downloaded with `yt-dlp`. In demo mode, the app can generate a synthetic ffmpeg test video so the workflow can be tested without external downloads.
-### 2. Transcription
-The transcription service is implemented in `backend/app/services/transcription.py`.
-Production target:
-- Model: `openai/whisper-large-v3`
-- Runtime: Hugging Face Transformers
-- Accelerator: PyTorch ROCm on AMD MI300X
-- Language goal: Thai and multilingual support
-### 3. Highlight Detection
-The highlight detector is implemented in `backend/app/services/highlight.py`.
-Production target:
-- Model: `Qwen/Qwen2.5-7B-Instruct`
-- Runtime: vLLM with ROCm backend
-- Output: strict structured JSON
-Highlight scoring considers:
-- questions
-- punchlines
-- emotional peaks
-- key information
-- channel niche
-- preferred clip style
-- target platform
-- target clip length
-### 4. Multimodal Analysis
-The multimodal service boundary is implemented in `backend/app/services/multimodal.py`.
-Planned production target:
-- Model: `Qwen/Qwen2-VL-7B-Instruct`
-- Inputs: sampled video frames, transcript context, and clip candidates
-- Visual signals:
-  - creator or guest reactions
-  - scene changes
-  - on-screen text
-  - high-motion segments
-This is isolated as a replaceable pipeline step so it can be enabled when AMD Cloud resources are available.
-### 5. Clip Generation
-Clip rendering is implemented in `backend/app/services/clips.py`.
-The ffmpeg stage:
-- cuts video by selected timestamps
-- exports MP4
-- creates `.srt` subtitle files
-- supports subtitle burn-in
-- reformats to 9:16 vertical output for short-form platforms
-- includes AMD hardware encoder configuration hooks
-### 6. Human-AI Collaborative Editing
-The frontend editor lets creators review AI-generated clips and make final decisions:
-- adjust start and end timestamps
-- edit subtitle text
-- delete weak clips
-- approve good clips
-- regenerate a specific clip
-- download the result
-## API Overview
-| Method | Endpoint | Description |
-| --- | --- | --- |
-| `GET` | `/health` | Returns service health and accelerator detection. |
-| `POST` | `/api/jobs/youtube` | Creates a processing job from a YouTube URL. |
-| `POST` | `/api/jobs/upload` | Creates a processing job from an uploaded video. |
-| `GET` | `/api/jobs/{job_id}` | Returns status, transcript, clips, timings, and errors. |
-| `PATCH` | `/api/jobs/{job_id}/clips/{clip_id}` | Updates trim times, subtitles, approval, or deletion state. |
-| `POST` | `/api/jobs/{job_id}/clips/{clip_id}/regenerate` | Re-renders one clip with updated parameters. |
-| `GET` | `/api/jobs/{job_id}/clips/{clip_id}/download` | Downloads an exported clip. |
 ## Local Development
-### Requirements
-- Python 3.11+
-- Node.js 20+
-- ffmpeg
-### Backend
-```bash
-cd backend
-python -m venv .venv
-. .venv/bin/activate
-pip install -e .
-uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
-```
-On Windows PowerShell:
-```powershell
-cd backend
-python -m venv .venv
-.\.venv\Scripts\Activate.ps1
-pip install -e .
-uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
 ```
-### Frontend
 ```bash
 cd frontend
 npm install
-npm run dev
 ```
-Open:
-```text
-http://localhost:5173
-```
-### Demo Mode
-By default, the project runs in demo mode:
-```env
-DEMO_MODE=true
-```
-Demo mode avoids downloading multi-GB AI models and returns deterministic mock transcript/highlight data while still exercising the API, UI, job state, timing logs, subtitle generation, and ffmpeg rendering path.
-## AMD Developer Cloud Deployment
-See [infra/amd-cloud.md](infra/amd-cloud.md) for a focused deployment guide.
-High-level steps:
-```bash
-git clone https://github.com/JakgritB/ElevenClip.AI.git
-cd ElevenClip.AI
-cp .env.example .env
-```
-Edit `.env`:
 ```env
-DEMO_MODE=false
-HF_TOKEN=your_huggingface_token
-WHISPER_MODEL_ID=openai/whisper-large-v3
-QWEN_TEXT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
-QWEN_VL_MODEL_ID=Qwen/Qwen2-VL-7B-Instruct
-```
-Install the AI/ROCm stack on the AMD instance:
-```bash
-cd backend
-pip install -e ".[ai,rocm-inference]"
-```
-Start the API:
-```bash
-uvicorn app.main:app --host 0.0.0.0 --port 8000
-```
-Validate accelerator detection:
-```bash
-curl http://localhost:8000/health
-```
-Expected on AMD Cloud:
-- `torch_available: true`
-- `cuda_api_available: true`
-- `rocm_hip_version` populated
-- MI300X visible as the active device
-## Docker
-```bash
-docker compose up --build
 ```
-For AMD Developer Cloud with ROCm extras:
 ```bash
-docker compose build --build-arg INSTALL_EXTRAS=.[ai,rocm-inference] backend
-docker compose up
 ```
-The compose file mounts AMD GPU devices (`/dev/kfd`, `/dev/dri`) and uses host IPC for large-model inference.
-## Benchmark Plan
-The hackathon judges care about technology application and real-world performance. ElevenClip.AI includes step-level timing logs so the demo can show why AMD acceleration matters.
-Run a benchmark against a running API:
-```bash
-python scripts/benchmark.py \
-  --api http://localhost:8000 \
-  --youtube-url "https://youtube.com/watch?v=..."
-```
-Recommended benchmark comparison:
-| Scenario | Hardware | Expected Purpose |
-| --- | --- | --- |
-| CPU baseline | CPU-only runtime | Show the pain of long-form video processing without acceleration. |
-| AMD GPU run | AMD Instinct MI300X + ROCm | Show high-throughput transcription and Qwen inference. |
-Metrics captured:
-- input/download time
-- transcription time
-- highlight detection time
-- multimodal analysis time
-- clip generation time
-- total wall-clock time
-- number of clips generated
-Demo target:
-- input: two-hour creator video
-- output: 10 subtitle-ready clips
-- goal: under 10 minutes on MI300X
-## Submission Assets Checklist
-The lablab.ai submission asks for:
-- Project title: `ElevenClip.AI`
-- Short description
-- Long description
-- Technology and category tags
-- Cover image
-- Video presentation
-- Slide presentation
-- Public GitHub repository
-- Demo application platform
-- Application URL
-Prepared submission docs:
-- `docs/SUBMISSION.md` - copy-ready project text for lablab.ai.
-- `docs/DEMO_SCRIPT.md` - draft and final recording script.
-- `docs/PITCH_DECK.md` - slide outline for the presentation deck.
-- `docs/BUILD_IN_PUBLIC.md` - social post drafts and AMD feedback notes.
-- `docs/AMD_CREDIT_RUNBOOK.md` - checklist for the first MI300X run.
-Recommended tags:
-```text
-AMD, ROCm, MI300X, AMD Developer Cloud, Vision AI, Multimodal AI, Video AI, Whisper, Qwen, Qwen-VL, Hugging Face, FastAPI, React
-```
-## Suggested Short Description
-```text
-ElevenClip.AI turns long-form videos into personalized short-form clips using Whisper, Qwen, Hugging Face, and AMD ROCm on MI300X.
-```
-## Suggested Long Description
-```text
-ElevenClip.AI is a human-AI collaborative clip studio for creators. It takes a YouTube URL or uploaded long-form video, transcribes it with Whisper Large V3, uses Qwen2.5 to identify high-engagement highlight moments based on a reusable channel profile, optionally enriches candidates with Qwen2-VL visual analysis, and renders short-form MP4 clips with subtitles using ffmpeg. The React editor lets creators trim, edit subtitles, approve, delete, regenerate, and download final clips. The project is designed for AMD Developer Cloud with ROCm and AMD Instinct MI300X acceleration, demonstrating how high-throughput multimodal AI can reduce hours of manual editing into a fast creator workflow.
-```
-## Judging Criteria Mapping
-### Application of Technology
-ElevenClip.AI integrates Whisper, Qwen2.5, Qwen2-VL, Hugging Face, ROCm, vLLM, and AMD Developer Cloud into an end-to-end video processing product.
-### Presentation
-The demo is designed to be visual and easy to understand: input a long video, watch AI create candidates, edit clips, and download platform-ready MP4 files.
-### Business Value
-The product targets a real creator economy workflow. Creators, agencies, podcasters, educators, and streamers all need short-form repurposing, and manual editing is expensive.
-### Originality
-The system goes beyond generic clipping by personalizing highlight selection to a creator's niche, style, language, clip length, and platform. It also preserves human editorial control instead of fully automating final publishing.
-## Build-in-Public Plan
-The hackathon includes a build-in-public challenge. Suggested updates:
-1. Share the architecture and first local demo.
-2. Share AMD Cloud/ROCm setup notes and benchmark results.
-3. Publish meaningful feedback about ROCm, AMD Developer Cloud, or inference setup.
-Suggested hashtags/topics:
-```text
-#AMDDeveloperHackathon #ROCm #MI300X #HuggingFace #Qwen #VideoAI #MultimodalAI
-```
-## Roadmap
-- Real Whisper Large V3 run on AMD Developer Cloud.
-- Real Qwen2.5 vLLM ROCm inference path.
-- Qwen2-VL frame sampling and visual scoring.
-- Batch export for 10+ clips.
-- Subtitle styling presets per platform.
-- Creator profile memory and reusable brand presets.
-- Hugging Face Space screenshot and richer project media.
-- CPU vs MI300X benchmark report after AMD credits arrive.
 ## License
-MIT. See [LICENSE](LICENSE).

+# ElevenClip AI ✂️
+> **AMD Developer Hackathon 2026 — Track 3: Vision & Multimodal AI**
+Turn livestream recordings or uploaded videos into TikTok-ready highlight clips using **true multimodal AI** — vision, audio, and text analyzed simultaneously on AMD Instinct MI300X.
+[![HuggingFace Space](https://img.shields.io/badge/🤗-HuggingFace%20Space-yellow)](https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI)
+[![AMD ROCm](https://img.shields.io/badge/AMD-ROCm%206.3-red)](https://rocm.docs.amd.com/)
+[![Qwen2.5-VL](https://img.shields.io/badge/Qwen2.5--VL-7B%20Instruct-blue)](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)
+[![License: MIT](https://img.shields.io/badge/License-MIT-green)](LICENSE)
+---
+## Demo
+> Try it live: [HuggingFace Space](https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI)
+---
+## What It Does
+ElevenClip AI ingests an uploaded video and automatically finds the best moments to clip for TikTok using three AI modalities working together. The backend also keeps optional yt-dlp/YouTube support, but the public demo focuses on uploads because public video platforms can trigger anti-bot restrictions.
+| Modality | Model | What it detects |
+|---|---|---|
+| **Vision** | Qwen2.5-VL-7B on ROCm | Excitement, faces, action type, humor, TikTok potential |
+| **Audio** | insanely-fast-whisper (ROCm) | Word-level transcript + language detection |
+| **Audio Signal** | librosa | RMS energy → loud/quiet moments |
+| **Vision+Text** | Qwen2.5-VL (multimodal) | Frame + transcript context fused together |
+| **Text** | Python keyword scorer + Qwen2.5-VL text prompt | Style keyword matching, emoji selection |
+### Highlight Scoring Formula
 ```
+final_score = 0.40 × vision_score + 0.35 × audio_energy + 0.25 × text_keywords
+where:
+  vision_score = 0.5 × excitement + 0.3 × tiktok_potential + 0.2 × humor_level
 ```
+---
+## AI Pipeline
+```
+┌─ Input ──────────────────────────────────────────────────────────┐
+│  Uploaded video file (YouTube backend support is optional)       │
+└──────────────────────────────────────────────────────────────────┘
+           │
+           ▼
+┌─ Audio Extraction (ffmpeg) ──────────────────────────────────────┐
+│  16kHz mono WAV for Whisper                                      │
+└──────────────────────────────────────────────────────────────────┘
+           │
+    ┌──────┴──────┐
+    │             │  ← PARALLEL on AMD GPU ─────────────────────────
+    ▼             ▼
+┌─ Scene     ┌─ Whisper ROCm ────────────────────────────────────┐
+│  Detection │  insanely-fast-whisper (SDPA attention, 4.45×)    │
+│  PyScene   │  → transcript + word-level timestamps             │
+│  Detect    │  → auto language detection                        │
+└─────┬──────┴───────────────────────────────────────────────────┘
+      │                    │
+      ▼                    ▼
+┌─ Frame Sampling ──────────────────────────────────────────────────┐
+│  3 frames per scene (20%, 50%, 80% of scene)                     │
+└──────────────────────────────────────────────────────────────────┘
+           │
+           ▼  ← CONCURRENT requests to vLLM ──────────────────────
+┌─ Qwen2.5-VL Multimodal Analysis ───────────────────────────────────┐
+│  Input per scene: [frame1] [frame2] [frame3] + transcript text   │
+│  Output: excitement_score, tiktok_potential, face_bbox,          │
+│          emotion, action_type, humor_level, highlight_reason     │
+│  All scenes sent concurrently — vLLM batches on AMD MI300X       │
+└──────────────────────────────────────────────────────────────────┘
+           │
+           ▼
+┌─ Multi-Signal Scoring ────────────────────────────────────────────┐
+│  score = 0.40×vision + 0.35×audio_energy + 0.25×text_keywords   │
+│  Select top-N non-overlapping clips (min 30s gap)                │
+└──────────────────────────────────────────────────────────────────┘
+           │
+           ▼
+┌─ Branch ──────────────────────────────────────────────────────────┐
+│                                                                   │
+│  Normal Mode              HRE (High-Retention Editing)           │
+│  ─────────────            ──────────────────────────────         │
+│  • pysubs2 ASS            • Silence removal (ffmpeg)             │
+│  • User style config      • Auto-zoom to face (zoompan)          │
+│  • Font/color/animation   • Jump cuts at boundaries              │
+│  • Karaoke/pop/fade       • Qwen2.5-VL emoji selection          │
+│  • AMD AMF encode         • Impact bold captions                 │
+└──────────────────────────────────────────────────────────────────┘
+           │
+           ▼
+┌─ Editor (/editor) ────────────────────────────────────────────────┐
+│  • Per-clip subtitle timeline editing                            │
+│  • Global style override (live preview)                          │
+│  • Re-render + download MP4                                      │
+└──────────────────────────────────────────────────────────────────┘
+```
+---
+## AMD GPU Optimizations
+- **ROCm 6.3** — all model inference on AMD Instinct MI300X
+- **vLLM** — serves Qwen2.5-VL with continuous batching and PagedAttention
+- **SDPA attention** — PyTorch 2.0 Scaled Dot-Product Attention for Whisper (4.45× faster on ROCm)
+- **float16 inference** — 7B model fits in ~14 GB VRAM, leaves 50+ GB for large videos
+- **h264_amf** — AMD VCE hardware encoder for clip extraction (falls back to libx264)
+- **Parallel pipeline** — scene detection (CPU) + Whisper (GPU) run simultaneously
+- **Concurrent vLLM requests** — all scenes sent to Qwen2.5-VL in parallel; server batches them
+---
+## Two Output Modes
+### Normal Subtitles
+Full creative control over:
+- Font family (Noto Sans Thai, Noto Sans SC, Montserrat, Impact, ...)
+- Font size, bold/italic/underline
+- 4-layer ASS colors: primary, secondary, outline, shadow
+- Display mode: word-by-word or sentence
+- Animation: Fade / Karaoke / Pop / Typewriter / Bounce
+- Alignment (3×3 grid) + margin sliders
+- Per-subtitle-line style overrides in the editor
+### High-Retention Editing (HRE)
+AI chooses everything:
+- Silence removal (`ffmpeg silenceremove`)
+- Auto-zoom to face region (`ffmpeg zoompan` using Qwen2.5-VL face_bbox)
+- Jump cuts at scene boundaries
+- Qwen2.5-VL selects contextually-appropriate emoji overlay
+- Impact 64px bold white captions, word-by-word, pop animation
+---
+## Multilingual Support
+| Layer | Coverage |
+|---|---|
+| UI language | ไทย · English · 中文 |
+| Video input language | Auto-detect + 15+ languages (Whisper) |
+| Subtitle output language | Thai (Noto Sans Thai) · Chinese (Noto Sans SC) · Japanese (Noto Sans JP) · Korean (Noto Sans KR) · English + more |
+| Cross-lingual | Whisper translates to English when English subtitles are requested; multilingual transcription and subtitle timing use Whisper's language support |
+| Character-level splitting | Thai and Chinese use character-level subtitle timing (no word spaces) |
+---
+## Tech Stack
+| Layer | Technology |
+|---|---|
+| Vision AI | **Qwen2.5-VL-7B-Instruct** (Apache 2.0) via vLLM |
+| Speech-to-Text | **insanely-fast-whisper** with PyTorch SDPA on ROCm |
+| Audio Analysis | **librosa** — RMS energy per scene |
+| Scene Detection | **PySceneDetect** — ContentDetector |
+| Video Download | **yt-dlp** |
+| Video Processing | **ffmpeg** (AMD AMF hardware encode) |
+| Subtitle Engine | **pysubs2** — full ASS format with karaoke tags |
+| GPU | **AMD Instinct MI300X** via ROCm 6.3 |
+| Frontend | **Next.js 16.2.4** App Router + Tailwind CSS |
+| Backend | **FastAPI** + WebSocket (real-time progress) |
+| Deployment | HuggingFace Spaces public demo + AMD GPU Cloud backend |
+---
+## Judge Demo
+Public visitors can open the HuggingFace Space and click **Try Demo** to see a simulated flow without using AMD GPU credits. Full AMD MI300X generation is protected by an access code shared only in the lablab.ai submission notes for judges.
+Recommended judging flow:
+1. Open the HuggingFace Space.
+2. Click **Try Demo** for the instant public demo.
+3. Enter the judge access code from the lablab.ai submission notes to run real generation on AMD GPU Cloud.
+4. Upload a short MP4 sample for the real run.
+---
 ## Local Development
+For the real development/demo path, run the frontend locally and point it at the AMD GPU Cloud backend:
+```env
+# frontend/.env.local
+NEXT_PUBLIC_API_URL=http://129.212.178.101:8080
+NEXT_PUBLIC_DEMO_ENABLED=true
+NEXT_PUBLIC_DEMO_ONLY=false
 ```
 ```bash
 cd frontend
 npm install
+npm run dev  # http://localhost:3000
 ```
+The AMD GPU Cloud backend runs FastAPI on `:8080` and vLLM/Qwen2.5-VL on `:8000`. For development without a GPU, the backend can still run with fallback stubs (stubbed Whisper, fallback vision scores).
+---
+## Safe Public Demo Setup
+ElevenClip AI supports three deployment modes:
+| Mode | Frontend runs on | Backend/vLLM runs on | Use when |
+|---|---|---|---|
+| Local dev | Your laptop (`localhost:3000`) | AMD GPU Cloud (`129.212.178.101:8080`) | Iterating quickly while using MI300X remotely |
+| HF public shell | HuggingFace Space CPU | AMD GPU Cloud | Public hackathon page, real generation gated by access code |
+| HF self-contained GPU | HuggingFace Space | HuggingFace Space GPU | Only if the Space has suitable ROCm/AMD GPU hardware |
+For the current CPU Basic HuggingFace Space, use it as the public UI and keep real generation on AMD GPU Cloud:
 ```env
+# frontend/.env.local for local development
+NEXT_PUBLIC_API_URL=http://129.212.178.101:8080
+NEXT_PUBLIC_DEMO_ENABLED=true
+NEXT_PUBLIC_DEMO_ONLY=false
 ```
+On the AMD GPU Cloud backend, protect expensive GPU endpoints before exposing the demo:
 ```bash
+export DEMO_ACCESS_CODE="share-this-only-with-judges"
+export MAX_CONCURRENT_JOBS=1
+export MAX_UPLOAD_MB=300
+export VLLM_IDLE_TIMEOUT=300
 ```
+When `DEMO_ACCESS_CODE` is set, `/api/process`, `/api/video-info`, and vLLM start/stop endpoints require the `X-Demo-Key` header. The frontend shows a Demo Access Code field and sends that header automatically. Leave `DEMO_ACCESS_CODE` unset only for private/local testing.
+For a self-contained HuggingFace GPU Space, leave `NEXT_PUBLIC_API_URL=""` so nginx routes `/api`, `/ws`, and `/downloads` to FastAPI inside the same Space. Only use this mode if the Space hardware is actually GPU-capable.
+For the public HuggingFace Space, set `NEXT_PUBLIC_DEMO_ONLY=true`. Visitors can open the UI and run the simulated demo without touching AMD GPU credits. Judges can enter the access code to run real generation against the protected AMD GPU Cloud backend.
+---
+## Hackathon Compliance
+| Requirement | Status |
+|---|---|
+| Track 3: Vision & Multimodal AI | ✅ Qwen2.5-VL processes video frames + transcript text together; Whisper and librosa supply the audio signals |
+| AMD Developer Cloud | ✅ All inference on AMD Instinct MI300X via ROCm 6.3 |
+| ROCm acceleration | ✅ vLLM + SDPA Whisper + h264_amf encoder |
+| Qwen partner integration | ✅ Qwen2.5-VL as primary multimodal model and text/emoji prompt model |
+| HuggingFace Space | ✅ `lablab-ai-amd-developer-hackathon/ElevenClip-AI` |
+| Public GitHub repo | ✅ `JakgritB/ElevenClip-AI` |
+| Ship It challenge | ✅ Social posts tagging @AIatAMD + @lablab |
+| MIT license | ✅ |
+---
 ## License
+MIT — see [LICENSE](LICENSE)

backend/Dockerfile DELETED Viewed

@@ -1,17 +0,0 @@
-# Backend image. The base image defaults to rocm/pytorch:latest and can be
-# overridden at build time through the ROCM_PYTORCH_IMAGE build argument.
-ARG ROCM_PYTORCH_IMAGE=rocm/pytorch:latest
-FROM ${ROCM_PYTORCH_IMAGE}
-WORKDIR /app
-# Install ffmpeg, git, and curl; the apt package lists are removed in the
-# same layer so they do not remain in the image.
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends ffmpeg git curl \
-    && rm -rf /var/lib/apt/lists/*
-COPY pyproject.toml ./
-# INSTALL_EXTRAS is the pip install target and defaults to the project root.
-# NOTE(review): presumably overridden to select optional extras — confirm.
-ARG INSTALL_EXTRAS=.
-RUN pip install --upgrade pip && pip install -e "${INSTALL_EXTRAS}"
-# The application package is copied after the dependency install step.
-COPY app ./app
-EXPOSE 8000
-# Serve the FastAPI app with uvicorn on all interfaces, port 8000.
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

backend/app/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """AI Clip Studio backend."""

backend/app/core/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """Core configuration and instrumentation."""

backend/app/core/config.py DELETED Viewed

@@ -1,68 +0,0 @@
-from functools import lru_cache
-import os
-from pathlib import Path
-from pydantic import Field
-from pydantic import BaseModel
-class Settings(BaseModel):  # Backend configuration model; every field has a built-in default.
-    app_name: str = "ElevenClip.AI"
-    demo_mode: bool = True  # when true, highlight detection uses the heuristic path instead of Qwen
-    storage_dir: Path = Path("data")  # directory served by the API under /media
-    frontend_origin: str = "http://localhost:5173"  # origin added to the CORS allow-list
-    whisper_model_id: str = "openai/whisper-large-v3"
-    qwen_text_model_id: str = "Qwen/Qwen2.5-7B-Instruct"  # model loaded through vLLM for highlight detection
-    qwen_vl_model_id: str = "Qwen/Qwen2-VL-7B-Instruct"
-    hf_token: str | None = None
-    preferred_torch_dtype: str = "bfloat16"  # passed to vLLM as the model dtype
-    target_clip_count: int = Field(default=5, ge=1, le=20)
-    max_clips: int = Field(default=10, ge=1, le=50)  # upper bound applied to the requested clip count
-    ffmpeg_binary: str = "ffmpeg"
-    ffprobe_binary: str = "ffprobe"
-    ffmpeg_video_codec: str = "h264_amf"  # encoder tried first when rendering clips
-    ffmpeg_cpu_codec: str = "libx264"  # encoder used for the retry when the first ffmpeg run fails
-    redis_url: str = "redis://redis:6379/0"
-    celery_enabled: bool = False
-@lru_cache  # cache the result so the environment is read only on the first call
-def get_settings() -> Settings:  # Build Settings from environment variables, using the defaults below when a variable is unset.
-    settings = Settings(
-        demo_mode=_bool_env("DEMO_MODE", True),
-        storage_dir=Path(os.getenv("STORAGE_DIR", "data")),
-        frontend_origin=os.getenv("FRONTEND_ORIGIN", "http://localhost:5173"),
-        whisper_model_id=os.getenv("WHISPER_MODEL_ID", "openai/whisper-large-v3"),
-        qwen_text_model_id=os.getenv("QWEN_TEXT_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct"),
-        qwen_vl_model_id=os.getenv("QWEN_VL_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct"),
-        hf_token=os.getenv("HF_TOKEN") or None,  # an empty HF_TOKEN is treated as unset
-        preferred_torch_dtype=os.getenv("TORCH_DTYPE", "bfloat16"),
-        target_clip_count=_int_env("TARGET_CLIP_COUNT", 5),
-        max_clips=_int_env("MAX_CLIPS", 10),
-        ffmpeg_binary=os.getenv("FFMPEG_BINARY", "ffmpeg"),
-        ffprobe_binary=os.getenv("FFPROBE_BINARY", "ffprobe"),
-        ffmpeg_video_codec=os.getenv("FFMPEG_VIDEO_CODEC", "h264_amf"),
-        ffmpeg_cpu_codec=os.getenv("FFMPEG_CPU_CODEC", "libx264"),
-        redis_url=os.getenv("REDIS_URL", "redis://redis:6379/0"),
-        celery_enabled=_bool_env("CELERY_ENABLED", False),
-    )
-    settings.storage_dir.mkdir(parents=True, exist_ok=True)  # create the storage directory if it does not exist yet
-    return settings
-def _bool_env(name: str, default: bool) -> bool:  # Read env var `name` as a boolean; return `default` when it is unset.
-    value = os.getenv(name)
-    if value is None:
-        return default
-    return value.strip().lower() in {"1", "true", "yes", "on"}  # any other value, including an empty string, is False
-def _int_env(name: str, default: int) -> int:  # Read env var `name` as an int; return `default` when it is unset.
-    value = os.getenv(name)
-    if value is None:
-        return default
-    return int(value)  # raises ValueError if the value is not a valid integer

backend/app/core/timing.py DELETED Viewed

@@ -1,20 +0,0 @@
-from collections.abc import Iterator
-from contextlib import contextmanager
-from time import perf_counter
-class TimingLog:  # Collects the elapsed time of named steps.
-    def __init__(self) -> None:
-        self._steps: dict[str, float] = {}  # step name -> elapsed seconds, rounded to 3 decimals
-    @contextmanager
-    def measure(self, name: str) -> Iterator[None]:  # context manager that times the enclosed block under `name`
-        started = perf_counter()
-        try:
-            yield
-        finally:
-            self._steps[name] = round(perf_counter() - started, 3)  # recorded even if the block raises; reusing a name overwrites the earlier value
-    def to_dict(self) -> dict[str, float]:  # all recorded steps plus a "total" entry that sums them
-        total = round(sum(self._steps.values()), 3)
-        return {**self._steps, "total": total}

backend/app/main.py DELETED Viewed

@@ -1,240 +0,0 @@
-from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse
-from fastapi.staticfiles import StaticFiles
-from app.core.config import get_settings
-from app.models.schemas import (
-    ChannelProfile,
-    ClipCandidate,
-    ClipPatch,
-    HealthResponse,
-    JobSnapshot,
-    PolishSubtitlesRequest,
-    RegenerateClipRequest,
-    SubtitleCue,
-    TranslateSubtitlesRequest,
-    YoutubeJobRequest,
-)
-from app.services.highlight import QwenHighlightDetector
-from app.services.pipeline import VideoPipeline
-from app.services.transcription import WhisperTranscriber
-from app.services.video_input import save_upload
-from app.storage import JobStore
-from app.utils.rocm import detect_accelerator
-settings = get_settings()  # module-level objects shared by the request handlers in this file
-store = JobStore(settings)
-pipeline = VideoPipeline(settings, store)
-highlight_detector = QwenHighlightDetector(settings)
-transcriber = WhisperTranscriber(settings)
-app = FastAPI(title=settings.app_name, version="0.1.0")
-app.add_middleware(  # CORS: allow the configured frontend origin plus two local origins on port 5173
-    CORSMiddleware,
-    allow_origins=[settings.frontend_origin, "http://localhost:5173", "http://127.0.0.1:5173"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-app.mount("/media", StaticFiles(directory=settings.storage_dir), name="media")  # serve the storage directory as static files under /media
-@app.get("/health", response_model=HealthResponse)
-async def health() -> HealthResponse:  # Health probe: reports the app name, the demo-mode flag, and the detected accelerator.
-    return HealthResponse(
-        ok=True,
-        app=settings.app_name,
-        demo_mode=settings.demo_mode,
-        accelerator=detect_accelerator(),
-    )
-@app.post("/api/jobs/youtube", response_model=JobSnapshot)
-async def create_youtube_job(  # Create a job for a YouTube URL and schedule its processing as a background task.
-    request: YoutubeJobRequest, background_tasks: BackgroundTasks
-) -> JobSnapshot:
-    snapshot = store.create_job(
-        request.profile, {"kind": "youtube", "url": str(request.youtube_url)}
-    )
-    background_tasks.add_task(  # processing is queued, so the caller receives the initial snapshot
-        pipeline.process_source, snapshot.id, "youtube", str(request.youtube_url), request.profile
-    )
-    return snapshot
-@app.post("/api/jobs/upload", response_model=JobSnapshot)
-async def create_upload_job(  # Create a job from an uploaded file plus a JSON-encoded ChannelProfile form field.
-    background_tasks: BackgroundTasks,
-    profile_json: str = Form(...),
-    file: UploadFile = File(...),
-) -> JobSnapshot:
-    try:
-        profile = ChannelProfile.model_validate_json(profile_json)
-    except Exception as exc:
-        raise HTTPException(status_code=422, detail=f"Invalid profile JSON: {exc}") from exc  # any parse or validation failure is reported as 422
-    snapshot = store.create_job(profile, {"kind": "upload", "filename": file.filename})  # the job record is created before the upload is saved
-    source_path = await save_upload(file, store.job_dir(snapshot.id))  # save_upload receives the job directory as its destination
-    background_tasks.add_task(pipeline.process_source, snapshot.id, "upload", str(source_path), profile)
-    return snapshot
-@app.get("/api/jobs/{job_id}", response_model=JobSnapshot)
-async def get_job(job_id: str) -> JobSnapshot:  # Return the stored snapshot for `job_id`; 404 when the store raises FileNotFoundError.
-    try:
-        return store.get_job(job_id)
-    except FileNotFoundError as exc:
-        raise HTTPException(status_code=404, detail="Job not found") from exc
-@app.patch("/api/jobs/{job_id}/clips/{clip_id}", response_model=ClipCandidate)
-async def update_clip(job_id: str, clip_id: str, patch: ClipPatch) -> ClipCandidate:  # Apply a clip edit through the pipeline and return the resulting clip.
-    try:
-        return pipeline.patch_clip(job_id, clip_id, patch.model_dump())  # NOTE(review): model_dump() includes unset fields as None — presumably patch_clip ignores them; confirm
-    except FileNotFoundError as exc:
-        raise HTTPException(status_code=404, detail="Job not found") from exc
-    except KeyError as exc:
-        raise HTTPException(status_code=404, detail="Clip not found") from exc
-@app.post("/api/jobs/{job_id}/clips/{clip_id}/regenerate", response_model=ClipCandidate)
-async def regenerate_clip(  # Regenerate one clip via the pipeline with optional overrides for style, length, and subtitle text.
-    job_id: str, clip_id: str, request: RegenerateClipRequest
-) -> ClipCandidate:
-    try:
-        return pipeline.regenerate_clip(
-            job_id,
-            clip_id,
-            clip_style=request.clip_style,
-            clip_length_seconds=request.clip_length_seconds,
-            subtitle_text=request.subtitle_text,
-        )
-    except FileNotFoundError as exc:
-        raise HTTPException(status_code=404, detail="Source video not found") from exc  # every FileNotFoundError is reported as a missing source video here
-    except KeyError as exc:
-        raise HTTPException(status_code=404, detail="Clip not found") from exc
-@app.get("/api/jobs/{job_id}/clips/{clip_id}/download")
-async def download_clip(job_id: str, clip_id: str) -> FileResponse:
-    snapshot = store.get_job(job_id)
-    clip = next((item for item in snapshot.clips if item.id == clip_id), None)
-    if clip is None or clip.download_url is None:
-        raise HTTPException(status_code=404, detail="Clip not found")
-    filename = clip.download_url.rsplit("/", 1)[-1]
-    path = store.job_dir(job_id) / filename
-    if not path.exists():
-        raise HTTPException(status_code=404, detail="Clip file not found")
-    return FileResponse(path, media_type="video/mp4", filename=filename)
-# ─────────────────────────────────────────────────────────────────
-# AI subtitle endpoints — work in demo mode immediately, switch to
-# real Qwen / Whisper output once DEMO_MODE=false on AMD GPU cloud.
-# ─────────────────────────────────────────────────────────────────
-def _resolve_clip_cues(snapshot: JobSnapshot, clip: ClipCandidate) -> list[SubtitleCue]:  # NOTE(review): `snapshot` is not used in this body
-    """Return the cue list to operate on. Prefer explicit subtitle_cues; fall
-    back to splitting subtitle_text into evenly-spaced cues."""
-    if clip.subtitle_cues:
-        return [SubtitleCue(**cue.model_dump()) for cue in clip.subtitle_cues]  # builds new SubtitleCue objects instead of returning the stored ones
-    duration = max(0.5, clip.end_seconds - clip.start_seconds)  # clamped to at least 0.5 s, which also covers zero or negative spans
-    text = clip.subtitle_text.strip()
-    if not text:
-        return [SubtitleCue(start_seconds=0.0, end_seconds=duration, text="")]  # a single empty cue spanning the clip duration
-    # Reuse Whisper aligner's deterministic chunking for fallback
-    return transcriber._demo_align_words(text, 0.0, duration)  # NOTE(review): calls a private method of the transcriber
-@app.post(
-    "/api/jobs/{job_id}/clips/{clip_id}/subtitle/polish",
-    response_model=ClipCandidate,
-)
-async def polish_clip_subtitles(  # Pass the clip's cues to the highlight detector's polish_subtitles and store the result on the clip.
-    job_id: str, clip_id: str, request: PolishSubtitlesRequest
-) -> ClipCandidate:
-    try:
-        snapshot = store.get_job(job_id)
-    except FileNotFoundError as exc:
-        raise HTTPException(status_code=404, detail="Job not found") from exc
-    clip = next((c for c in snapshot.clips if c.id == clip_id), None)
-    if clip is None:
-        raise HTTPException(status_code=404, detail="Clip not found")
-    cues_in = _resolve_clip_cues(snapshot, clip)
-    polished = highlight_detector.polish_subtitles(cues_in, style=request.style)
-    return pipeline.patch_clip(
-        job_id,
-        clip_id,
-        {
-            "subtitle_cues": [cue.model_dump() for cue in polished],
-            "subtitle_text": " ".join(cue.text for cue in polished if cue.text),  # flat text is rebuilt from the non-empty cues
-        },
-    )
-@app.post(
-    "/api/jobs/{job_id}/clips/{clip_id}/subtitle/translate",
-    response_model=ClipCandidate,
-)
-async def translate_clip_subtitles(  # Pass the clip's cues to the highlight detector's translate_subtitles and store the result on the clip.
-    job_id: str, clip_id: str, request: TranslateSubtitlesRequest
-) -> ClipCandidate:
-    try:
-        snapshot = store.get_job(job_id)
-    except FileNotFoundError as exc:
-        raise HTTPException(status_code=404, detail="Job not found") from exc
-    clip = next((c for c in snapshot.clips if c.id == clip_id), None)
-    if clip is None:
-        raise HTTPException(status_code=404, detail="Clip not found")
-    cues_in = _resolve_clip_cues(snapshot, clip)
-    translated = highlight_detector.translate_subtitles(cues_in, request.target_language)
-    return pipeline.patch_clip(
-        job_id,
-        clip_id,
-        {
-            "subtitle_cues": [cue.model_dump() for cue in translated],
-            "subtitle_text": " ".join(cue.text for cue in translated if cue.text),  # flat text is rebuilt from the non-empty cues
-        },
-    )
-@app.post(
-    "/api/jobs/{job_id}/clips/{clip_id}/subtitle/auto-time",
-    response_model=ClipCandidate,
-)
-async def auto_time_clip_subtitles(job_id: str, clip_id: str) -> ClipCandidate:  # Re-time the clip's subtitle text into cues via the transcriber's align_words and store them.
-    try:
-        snapshot = store.get_job(job_id)
-    except FileNotFoundError as exc:
-        raise HTTPException(status_code=404, detail="Job not found") from exc
-    clip = next((c for c in snapshot.clips if c.id == clip_id), None)
-    if clip is None:
-        raise HTTPException(status_code=404, detail="Clip not found")
-    text = clip.subtitle_text or " ".join(  # fall back to the joined cue texts when subtitle_text is empty
-        (cue.text for cue in (clip.subtitle_cues or []) if cue.text)
-    )
-    # Best-effort: production mode uses the actual source video on disk; demo
-    # mode uses synthetic chunking that doesn't require the file at all.
-    source_path = ""
-    try:
-        for entry in store.job_dir(job_id).iterdir():
-            if entry.suffix.lower() in {".mp4", ".mkv", ".mov", ".webm"}:
-                source_path = str(entry)  # NOTE(review): first match wins; rendered clip_*.mp4 files are written to the same directory and can match too
-                break
-    except Exception:
-        source_path = ""  # any failure while scanning leaves the path empty
-    timed = transcriber.align_words(source_path, text, clip.start_seconds, clip.end_seconds)
-    return pipeline.patch_clip(
-        job_id,
-        clip_id,
-        {
-            "subtitle_cues": [cue.model_dump() for cue in timed],
-            "subtitle_text": " ".join(cue.text for cue in timed if cue.text),
-        },
-    )

backend/app/models/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """Pydantic models."""

backend/app/models/schemas.py DELETED Viewed

@@ -1,127 +0,0 @@
-from datetime import datetime, timezone
-from enum import Enum
-from typing import Any, Literal
-from pydantic import BaseModel, Field, HttpUrl, field_validator
-def utc_now() -> datetime:  # Current time as a timezone-aware UTC datetime.
-    return datetime.now(timezone.utc)
-class TargetPlatform(str, Enum):  # Publishing platforms a profile can target; members are also plain strings.
-    tiktok = "tiktok"
-    youtube_shorts = "youtube_shorts"
-    instagram_reels = "instagram_reels"
-class ChannelProfile(BaseModel):  # Creator preferences supplied with a job; used for clip selection and rendering.
-    niche: str = Field(default="education", min_length=2, max_length=80)
-    niche_custom: str = Field(default="", max_length=80)
-    channel_description: str = Field(default="", max_length=700)
-    clip_style: str = Field(default="informative", min_length=2, max_length=80)
-    clip_length_seconds: int = Field(default=60, ge=15, le=180)  # allowed range is 15 to 180 seconds
-    clip_count: int = Field(default=5, ge=1, le=20)  # allowed range is 1 to 20 clips
-    primary_language: str = Field(default="Thai", min_length=2, max_length=40)
-    target_platform: TargetPlatform = TargetPlatform.tiktok
-    @field_validator("niche", "niche_custom", "channel_description", "clip_style", "primary_language")
-    @classmethod
-    def clean_text(cls, value: str) -> str:  # strip surrounding whitespace from the listed text fields
-        return value.strip()
-class YoutubeJobRequest(BaseModel):  # Request body for POST /api/jobs/youtube.
-    youtube_url: HttpUrl
-    profile: ChannelProfile
-class TranscriptSegment(BaseModel):  # One timed piece of transcript text.
-    id: str
-    start_seconds: float = Field(ge=0)
-    end_seconds: float = Field(ge=0)  # only checked to be non-negative; not compared with start_seconds
-    text: str
-    language: str | None = None
-class SubtitleCue(BaseModel):
-    """A single subtitle line with explicit timing relative to clip start."""
-    start_seconds: float = Field(ge=0)
-    end_seconds: float = Field(ge=0)  # only checked to be non-negative; not compared with start_seconds
-    text: str = ""
-class SkipRange(BaseModel):
-    """A range to splice out of the middle of a clip (relative to clip start)."""
-    start_seconds: float = Field(ge=0)
-    end_seconds: float = Field(ge=0)  # only checked to be non-negative; not compared with start_seconds
-class ClipCandidate(BaseModel):  # One proposed or rendered clip, including its subtitles, URLs, and review flags.
-    id: str
-    start_seconds: float = Field(ge=0)
-    end_seconds: float = Field(ge=0)
-    title: str
-    reason: str
-    score: float = Field(ge=0, le=100)  # bounded to the range 0 to 100
-    subtitle_text: str = ""
-    subtitle_cues: list[SubtitleCue] | None = None
-    skip_ranges: list[SkipRange] | None = None
-    video_url: str | None = None
-    download_url: str | None = None
-    approved: bool = False
-    deleted: bool = False
-    metadata: dict[str, Any] = Field(default_factory=dict)  # default_factory gives each instance its own dict
-class ClipPatch(BaseModel):  # Request body for PATCH on a clip; every field is optional and defaults to None.
-    start_seconds: float | None = Field(default=None, ge=0)
-    end_seconds: float | None = Field(default=None, ge=0)
-    subtitle_text: str | None = None
-    subtitle_cues: list[SubtitleCue] | None = None
-    skip_ranges: list[SkipRange] | None = None
-    approved: bool | None = None
-    deleted: bool | None = None
-class RegenerateClipRequest(BaseModel):  # Request body for the clip regenerate endpoint; all overrides are optional.
-    clip_style: str | None = None
-    clip_length_seconds: int | None = Field(default=None, ge=15, le=180)  # same 15 to 180 second bounds as ChannelProfile
-    subtitle_text: str | None = None
-class TranslateSubtitlesRequest(BaseModel):  # Request body for the subtitle translate endpoint.
-    target_language: str = Field(min_length=2, max_length=40)
-class PolishSubtitlesRequest(BaseModel):  # Request body for the subtitle polish endpoint.
-    style: str | None = None
-class JobSnapshot(BaseModel):  # State of one processing job as returned by the job endpoints.
-    id: str
-    status: Literal["queued", "running", "completed", "failed"]
-    progress: float = Field(ge=0, le=1)  # fraction between 0 and 1
-    message: str
-    current_step: str = ""
-    step_index: int = Field(default=0, ge=0)
-    step_total: int = Field(default=6, ge=1)
-    active_clip_index: int = Field(default=0, ge=0)
-    active_clip_total: int = Field(default=0, ge=0)
-    source: dict[str, Any]  # the job endpoints store a "kind" key plus "url" or "filename"
-    profile: ChannelProfile
-    transcript: list[TranscriptSegment] = Field(default_factory=list)
-    clips: list[ClipCandidate] = Field(default_factory=list)
-    timings: dict[str, float] = Field(default_factory=dict)
-    error: str | None = None
-    created_at: datetime = Field(default_factory=utc_now)
-    updated_at: datetime = Field(default_factory=utc_now)
-class HealthResponse(BaseModel):  # Response body of GET /health.
-    ok: bool
-    app: str
-    demo_mode: bool
-    accelerator: dict[str, Any]

backend/app/services/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """Pipeline services."""

backend/app/services/clips.py DELETED Viewed

@@ -1,219 +0,0 @@
-import shutil
-import subprocess
-from pathlib import Path
-from typing import Callable
-from app.core.config import Settings
-from app.models.schemas import ChannelProfile, ClipCandidate, TranscriptSegment
-from app.services.subtitles import write_single_caption_srt, write_srt, write_srt_from_cues
-from app.storage import JobStore
-class ClipGenerator:  # Renders clip candidates to MP4 files with burned-in subtitles using ffmpeg.
-    def __init__(self, settings: Settings, store: JobStore) -> None:
-        self.settings = settings
-        self.store = store
-    def generate(  # Render every clip in order and return the updated clip objects.
-        self,
-        job_id: str,
-        video_path: Path,
-        clips: list[ClipCandidate],
-        transcript: list[TranscriptSegment],
-        profile: ChannelProfile,
-        progress_callback: Callable[[int, int], None] | None = None,
-    ) -> list[ClipCandidate]:
-        rendered: list[ClipCandidate] = []
-        total = len(clips)
-        for index, clip in enumerate(clips, start=1):
-            if progress_callback:
-                progress_callback(index, total)  # called before each render with the 1-based index and the total
-            rendered.append(self.render_one(job_id, video_path, clip, transcript, profile, index))
-        return rendered
-    def render_one(  # Write the subtitle file, render the video, and record the resulting URLs on the clip.
-        self,
-        job_id: str,
-        video_path: Path,
-        clip: ClipCandidate,
-        transcript: list[TranscriptSegment],
-        profile: ChannelProfile,
-        index: int = 1,
-    ) -> ClipCandidate:
-        job_dir = self.store.job_dir(job_id)
-        output_name = f"clip_{index:02}_{clip.id[:8]}.mp4"  # file names combine the index with the first 8 characters of the clip id
-        subtitle_name = f"clip_{index:02}_{clip.id[:8]}.srt"
-        output_path = job_dir / output_name
-        subtitle_path = job_dir / subtitle_name
-        duration = max(1.0, clip.end_seconds - clip.start_seconds)
-        if clip.subtitle_cues:  # subtitle source priority: explicit cues, then subtitle_text, then the transcript
-            subtitle_cues = write_srt_from_cues(subtitle_path, clip.subtitle_cues)
-        elif clip.subtitle_text.strip():
-            subtitle_cues = write_single_caption_srt(subtitle_path, duration, clip.subtitle_text)
-        else:
-            subtitle_cues = write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript)
-        self._run_ffmpeg(video_path, output_path, subtitle_path, clip, profile)
-        clip.video_url = self.store.media_url(job_id, output_name)
-        clip.download_url = clip.video_url
-        clip.metadata["subtitle_file"] = self.store.media_url(job_id, subtitle_name)
-        clip.metadata["subtitle_cues"] = subtitle_cues
-        return clip  # the same clip object is updated in place and returned
-    def _run_ffmpeg(  # Build and run the ffmpeg command; an empty output file is written when rendering is not possible.
-        self,
-        video_path: Path,
-        output_path: Path,
-        subtitle_path: Path,
-        clip: ClipCandidate,
-        profile: ChannelProfile,
-    ) -> None:
-        ffmpeg = shutil.which(self.settings.ffmpeg_binary)
-        if not ffmpeg or not video_path.exists() or video_path.stat().st_size == 0:
-            output_path.write_bytes(b"")  # no ffmpeg or no usable source: leave an empty placeholder file
-            return
-        keep_ranges = self._compute_keep_ranges(clip)
-        post_filters = [self._platform_filter(profile), self._subtitle_filter(subtitle_path)]
-        post_chain = ",".join(post_filters)
-        if len(keep_ranges) <= 1:  # single contiguous range: seek and cut directly
-            start, end = keep_ranges[0]
-            command = [
-                ffmpeg,
-                "-y",
-                "-ss",
-                f"{start:.3f}",
-                "-i",
-                str(video_path),
-                "-t",
-                f"{max(0.5, end - start):.3f}",
-                "-vf",
-                post_chain,
-                "-c:v",
-                self.settings.ffmpeg_video_codec,
-                "-c:a",
-                "aac",
-                "-b:a",
-                "160k",
-                "-movflags",
-                "+faststart",
-                str(output_path),
-            ]
-        else:
-            # Build concat filter that keeps multiple segments and skips middle ranges
-            parts = []
-            labels_v = []
-            labels_a = []
-            for i, (start, end) in enumerate(keep_ranges):
-                parts.append(
-                    f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}]"
-                )
-                parts.append(
-                    f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}]"
-                )
-                labels_v.append(f"[v{i}]")
-                labels_a.append(f"[a{i}]")
-            concat_inputs = "".join(  # video and audio labels are interleaved per segment
-                f"{labels_v[i]}{labels_a[i]}" for i in range(len(keep_ranges))
-            )
-            parts.append(
-                f"{concat_inputs}concat=n={len(keep_ranges)}:v=1:a=1[vc][ac]"
-            )
-            parts.append(f"[vc]{post_chain}[vout]")  # scaling and subtitles are applied after the segments are joined
-            filter_complex = ";".join(parts)
-            command = [
-                ffmpeg,
-                "-y",
-                "-i",
-                str(video_path),
-                "-filter_complex",
-                filter_complex,
-                "-map",
-                "[vout]",
-                "-map",
-                "[ac]",
-                "-c:v",
-                self.settings.ffmpeg_video_codec,
-                "-c:a",
-                "aac",
-                "-b:a",
-                "160k",
-                "-movflags",
-                "+faststart",
-                str(output_path),
-            ]
-        try:
-            subprocess.run(command, check=True, capture_output=True, text=True, timeout=180)
-            return
-        except Exception:  # any failure, including the 180 s timeout, triggers one retry with the CPU codec
-            fallback = command.copy()
-            try:
-                fallback[fallback.index(self.settings.ffmpeg_video_codec)] = (
-                    self.settings.ffmpeg_cpu_codec
-                )
-            except ValueError:
-                pass
-            try:
-                subprocess.run(fallback, check=True, capture_output=True, text=True, timeout=180)
-                return
-            except Exception:
-                output_path.write_bytes(b"")  # both attempts failed: leave an empty file instead of raising
-    def _compute_keep_ranges(self, clip: ClipCandidate) -> list[tuple[float, float]]:
-        """Return absolute video time ranges to keep, after subtracting skip_ranges."""
-        clip_start = float(clip.start_seconds)
-        clip_end = float(clip.end_seconds)
-        if not clip.skip_ranges:
-            return [(clip_start, clip_end)]
-        # Skip ranges are relative to clip start. Convert to absolute and sort.
-        skips: list[tuple[float, float]] = []
-        for skip in clip.skip_ranges:
-            s = clip_start + max(0.0, float(skip.start_seconds))
-            e = clip_start + max(0.0, float(skip.end_seconds))
-            if e > s:  # empty or inverted ranges are dropped
-                skips.append((min(s, clip_end), min(e, clip_end)))  # clamp both ends to the clip end
-        skips.sort()
-        # Merge overlapping
-        merged: list[tuple[float, float]] = []
-        for s, e in skips:
-            if merged and s <= merged[-1][1]:
-                merged[-1] = (merged[-1][0], max(merged[-1][1], e))
-            else:
-                merged.append((s, e))
-        # Compute keep segments
-        keeps: list[tuple[float, float]] = []
-        cursor = clip_start
-        for s, e in merged:
-            if s > cursor:
-                keeps.append((cursor, s))
-            cursor = max(cursor, e)
-        if cursor < clip_end:
-            keeps.append((cursor, clip_end))
-        return keeps if keeps else [(clip_start, clip_end)]  # if the skips cover the whole clip, the full clip is kept
-    def _platform_filter(self, profile: ChannelProfile) -> str:  # ffmpeg scale/crop filter string for the target platform
-        if profile.target_platform.value in {"tiktok", "youtube_shorts", "instagram_reels"}:
-            return "scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920"  # fill a 1080x1920 frame and crop the overflow
-        return "scale=1280:720:force_original_aspect_ratio=decrease,pad=1280:720:(ow-iw)/2:(oh-ih)/2"  # NOTE(review): all current TargetPlatform values match the set above, so this 1280x720 branch is not reached today
-    def _subtitle_filter(self, subtitle_path: Path) -> str:  # ffmpeg subtitles filter string with a fixed caption style
-        escaped = str(subtitle_path.resolve()).replace("\\", "/").replace(":", "\\:")  # forward slashes and escaped colons; NOTE(review): a single quote in the path is not escaped
-        style = (
-            "Fontname=Arial,"
-            "Fontsize=22,"
-            "PrimaryColour=&H00FFFFFF,"
-            "OutlineColour=&H00000000,"
-            "BorderStyle=1,"
-            "Outline=2,"
-            "Shadow=1,"
-            "Alignment=2,"
-            "MarginV=210"
-        )
-        return f"subtitles='{escaped}':force_style='{style}'"

backend/app/services/highlight.py DELETED Viewed

@@ -1,434 +0,0 @@
-import json
-import re
-from uuid import uuid4
-from app.core.config import Settings
-from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment
-class QwenHighlightDetector:
-    def __init__(self, settings: Settings) -> None:
-        """Store the settings; the vLLM engine is created lazily on first use."""
-        # No engine yet: `_ensure_llm` / `_qwen_detect` populate this on demand.
-        self._llm = None
-        self.settings = settings
-    def detect(
-        self, transcript: list[TranscriptSegment], profile: ChannelProfile
-    ) -> list[ClipCandidate]:
-        """Pick clip candidates from a transcript.
-        Uses the Qwen path unless ``settings.demo_mode`` is set; any exception from
-        the Qwen path falls back to the keyword heuristic.
-        """
-        if not self.settings.demo_mode:
-            try:
-                return self._qwen_detect(transcript, profile)
-            except Exception:
-                # Best-effort: fall through to the heuristic below.
-                pass
-        return self._heuristic_detect(transcript, profile)
-    def _qwen_detect(
-        self, transcript: list[TranscriptSegment], profile: ChannelProfile
-    ) -> list[ClipCandidate]:
-        """Ask the Qwen text model (via vLLM) to choose clips from the transcript.
-        The transcript is rendered as ``[start-end] text`` lines and sent together
-        with the creator profile. The reply must contain a JSON array of clip
-        objects. Items that are not objects, lack usable timestamps, or describe an
-        empty/negative range are skipped individually so that one bad entry does not
-        throw away the rest of the model's answer.
-        Returns:
-            Up to ``min(profile.clip_count, settings.max_clips)`` candidates, or the
-            heuristic result when the model produced no usable clip.
-        Raises:
-            RuntimeError: vLLM cannot be imported.
-            ValueError: the reply contains no JSON array.
-        """
-        try:
-            from vllm import SamplingParams
-        except Exception as exc:
-            raise RuntimeError("vLLM with ROCm backend is required for Qwen inference") from exc
-        # Share the lazily-built engine with the polish/translate paths.
-        llm = self._ensure_llm()
-        transcript_text = "\n".join(
-            f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
-            for segment in transcript
-        )
-        niche = _effective_niche(profile)
-        channel_description = profile.channel_description or "No extra channel description provided."
-        clip_count = min(profile.clip_count, self.settings.max_clips)
-        prompt = f"""
-You are selecting short-form clips for a creator.
-Profile:
-- niche: {niche}
-- creator description: {channel_description}
-- style: {profile.clip_style}
-- target length seconds: {profile.clip_length_seconds}
-- target number of clips: {clip_count}
-- language: {profile.primary_language}
-- platform: {profile.target_platform.value}
-Return strict JSON only. Shape:
-[
-  {{
-    "start_seconds": 12.0,
-    "end_seconds": 72.0,
-    "title": "short title",
-    "reason": "why this will engage viewers",
-    "score": 91,
-    "subtitle_text": "clean subtitle text"
-  }}
-]
-Transcript:
-{transcript_text}
-""".strip()
-        sampling = SamplingParams(temperature=0.2, max_tokens=1200)
-        outputs = llm.generate([prompt], sampling)
-        text = outputs[0].outputs[0].text
-        payload = self._parse_json_array(text)
-        clips: list[ClipCandidate] = []
-        for item in payload:
-            if len(clips) >= clip_count:
-                break
-            if not isinstance(item, dict):
-                continue
-            try:
-                start = float(item["start_seconds"])
-                end = float(item["end_seconds"])
-            except (KeyError, TypeError, ValueError):
-                continue
-            # Written so that NaN timestamps are rejected as well.
-            if not (start >= 0.0 and end > start):
-                continue
-            try:
-                score = float(item.get("score") or 75)
-            except (TypeError, ValueError):
-                score = 75.0
-            clips.append(
-                ClipCandidate(
-                    id=uuid4().hex,
-                    start_seconds=start,
-                    end_seconds=end,
-                    title=str(item.get("title") or "Highlight"),
-                    reason=str(item.get("reason") or "High engagement potential"),
-                    score=score,
-                    subtitle_text=str(item.get("subtitle_text") or ""),
-                    metadata={"model": self.settings.qwen_text_model_id},
-                )
-            )
-        return clips or self._heuristic_detect(transcript, profile)
-    def _parse_json_array(self, text: str) -> list[dict]:
-        """Extract the first JSON array embedded in a model reply.
-        Each ``[`` is tried as the start of a JSON value, so bracketed prose before
-        or after the real array (``[note]``, ``[end]``) no longer breaks parsing the
-        way a greedy first-``[``-to-last-``]`` match does. An array made only of
-        objects is preferred; otherwise the first decodable array is returned.
-        Raises:
-            ValueError: no JSON array could be decoded from ``text``.
-        """
-        decoder = json.JSONDecoder()
-        fallback = None
-        position = text.find("[")
-        while position != -1:
-            try:
-                candidate, end = decoder.raw_decode(text, position)
-            except json.JSONDecodeError:
-                # Not JSON at this bracket; try the next one.
-                position = text.find("[", position + 1)
-                continue
-            if isinstance(candidate, list):
-                if candidate and all(isinstance(item, dict) for item in candidate):
-                    return candidate
-                if fallback is None:
-                    fallback = candidate
-            # Resume after the decoded value so nested arrays are not re-read.
-            position = text.find("[", end)
-        if fallback is not None:
-            return fallback
-        raise ValueError("No JSON array in Qwen response")
-    # ──────────────────────────────────────────────────────────────
-    # AI subtitle actions (Polish, Translate)
-    # ──────────────────────────────────────────────────────────────
-    def polish_subtitles(
-        self, cues: list[SubtitleCue], style: str | None = None
-    ) -> list[SubtitleCue]:
-        """Return cues whose text has been rewritten for short-form readability.
-        With ``settings.demo_mode`` set, only the deterministic heuristic runs.
-        Otherwise the Qwen rewrite is attempted and any exception it raises falls
-        back to the same heuristic.
-        """
-        if not self.settings.demo_mode:
-            try:
-                return self._qwen_polish(cues, style)
-            except Exception:
-                # Best-effort: fall through to the heuristic below.
-                pass
-        return self._heuristic_polish(cues, style)
-    def translate_subtitles(
-        self, cues: list[SubtitleCue], target_language: str
-    ) -> list[SubtitleCue]:
-        """Return cues translated into ``target_language`` with timing untouched.
-        Demo mode uses the marker-based stand-in; otherwise Qwen is tried first and
-        any exception falls back to that stand-in.
-        """
-        if not self.settings.demo_mode:
-            try:
-                return self._qwen_translate(cues, target_language)
-            except Exception:
-                # Best-effort: fall through to the stand-in below.
-                pass
-        return self._heuristic_translate(cues, target_language)
-    # ──────────────────────────────────────────────────────────────
-    # Demo / fallback implementations
-    # ──────────────────────────────────────────────────────────────
-    def _heuristic_polish(
-        self, cues: list[SubtitleCue], style: str | None
-    ) -> list[SubtitleCue]:
-        """Deterministic stand-in for the AI polish step.
-        For each non-empty cue: collapse whitespace, drop one leading filler word
-        (so/well/like/um/uh/you know/i mean), trim trailing ``" ,.;:"`` and, for the
-        "dramatic" style, append ``!`` when missing. Empty cues are copied as-is.
-        """
-        result: list[SubtitleCue] = []
-        for cue in cues:
-            cleaned = (cue.text or "").strip()
-            if cleaned:
-                cleaned = re.sub(r"\s+", " ", cleaned)
-                cleaned = re.sub(
-                    r"^(so|well|like|um|uh|you know|i mean)[,\s]+",
-                    "",
-                    cleaned,
-                    flags=re.IGNORECASE,
-                )
-                cleaned = cleaned.rstrip(" ,.;:")
-                wants_emphasis = style and style.lower() == "dramatic"
-                if wants_emphasis and not cleaned.endswith("!"):
-                    cleaned += "!"
-                result.append(
-                    SubtitleCue(
-                        start_seconds=cue.start_seconds,
-                        end_seconds=cue.end_seconds,
-                        text=cleaned,
-                    )
-                )
-            else:
-                result.append(cue.model_copy())
-        return result
-    def _heuristic_translate(
-        self, cues: list[SubtitleCue], target_language: str
-    ) -> list[SubtitleCue]:
-        """Fake translation for demo mode: prefix each non-empty cue with a tag.
-        The tag is the first two characters of ``target_language``, upper-cased and
-        wrapped in brackets. Empty cues stay empty; timing is copied unchanged.
-        """
-        tag = f"[{target_language[:2].upper()}]"
-        def tagged(cue: SubtitleCue) -> SubtitleCue:
-            body = (cue.text or "").strip()
-            return SubtitleCue(
-                start_seconds=cue.start_seconds,
-                end_seconds=cue.end_seconds,
-                text=f"{tag} {body}" if body else "",
-            )
-        return [tagged(cue) for cue in cues]
-    # ──────────────────────────────────────────────────────────────
-    # Production Qwen calls (used when DEMO_MODE=false on AMD GPU)
-    # ──────────────────────────────────────────────────────────────
-    def _ensure_llm(self):
-        """Return the cached vLLM engine, building it on the first call.
-        Raises:
-            RuntimeError: vLLM cannot be imported.
-        """
-        try:
-            from vllm import LLM
-        except Exception as exc:
-            raise RuntimeError("vLLM with ROCm backend is required for Qwen") from exc
-        if self._llm is not None:
-            return self._llm
-        engine_options = {
-            "model": self.settings.qwen_text_model_id,
-            "dtype": self.settings.preferred_torch_dtype,
-            "trust_remote_code": True,
-        }
-        self._llm = LLM(**engine_options)
-        return self._llm
-    def _qwen_polish(
-        self, cues: list[SubtitleCue], style: str | None
-    ) -> list[SubtitleCue]:
-        """Rewrite cue text with Qwen, keeping each cue's timing.
-        Cues are sent as a numbered list; the reply is read back line by line in
-        order. A cue with no corresponding reply line keeps its original text.
-        """
-        from vllm import SamplingParams
-        llm = self._ensure_llm()
-        numbered = [f"{position + 1}. {cue.text}" for position, cue in enumerate(cues)]
-        joined = "\n".join(numbered)
-        prompt = f"""
-Rewrite each subtitle line to be punchier and easier to read on short-form vertical video.
-Keep the same number of lines and the same approximate length per line.
-Style preference: {style or 'natural'}.
-Return one rewritten line per row, prefixed with the original index. No commentary.
-Input:
-{joined}
-""".strip()
-        sampling = SamplingParams(temperature=0.3, max_tokens=800)
-        outputs = llm.generate([prompt], sampling)
-        raw = outputs[0].outputs[0].text
-        rewritten = self._parse_indexed_lines(raw, expected=len(cues))
-        polished: list[SubtitleCue] = []
-        for position, cue in enumerate(cues):
-            new_text = rewritten[position] if position < len(rewritten) else cue.text
-            polished.append(
-                SubtitleCue(
-                    start_seconds=cue.start_seconds,
-                    end_seconds=cue.end_seconds,
-                    text=new_text,
-                )
-            )
-        return polished
-    def _qwen_translate(
-        self, cues: list[SubtitleCue], target_language: str
-    ) -> list[SubtitleCue]:
-        """Translate cue text with Qwen, keeping each cue's timing.
-        Cues are sent as a numbered list; the reply is read back line by line in
-        order. A cue with no corresponding reply line keeps its original text.
-        """
-        from vllm import SamplingParams
-        llm = self._ensure_llm()
-        numbered = [f"{position + 1}. {cue.text}" for position, cue in enumerate(cues)]
-        joined = "\n".join(numbered)
-        prompt = f"""
-Translate each subtitle line into {target_language}. Preserve line count and order.
-Return one translated line per row, prefixed with the original index. No commentary.
-Input:
-{joined}
-""".strip()
-        sampling = SamplingParams(temperature=0.2, max_tokens=1000)
-        outputs = llm.generate([prompt], sampling)
-        raw = outputs[0].outputs[0].text
-        translated = self._parse_indexed_lines(raw, expected=len(cues))
-        result: list[SubtitleCue] = []
-        for position, cue in enumerate(cues):
-            new_text = translated[position] if position < len(translated) else cue.text
-            result.append(
-                SubtitleCue(
-                    start_seconds=cue.start_seconds,
-                    end_seconds=cue.end_seconds,
-                    text=new_text,
-                )
-            )
-        return result
-    def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]:
-        """Split a model reply into text lines, dropping any leading ``N.`` index.
-        Blank lines are skipped. A line without a numeric prefix is kept whole.
-        Collection stops as soon as at least ``expected`` lines have been gathered
-        (the check runs after each append).
-        """
-        index_prefix = re.compile(r"^\s*\d+[.)\s-]+\s*(.*)$")
-        collected: list[str] = []
-        for candidate in (row.strip() for row in raw.splitlines()):
-            if candidate:
-                found = index_prefix.match(candidate)
-                collected.append(found.group(1).strip() if found else candidate)
-                if len(collected) >= expected:
-                    break
-        return collected
-    def _heuristic_detect(
-        self, transcript: list[TranscriptSegment], profile: ChannelProfile
-    ) -> list[ClipCandidate]:
-        """Keyword-based fallback that scores each transcript segment.
-        Every segment starts at 45 points and gains bonuses for a question mark,
-        style-specific terms, generic hook terms, profile terms (first 30 words of
-        niche + channel description, longer than two characters) and text length;
-        the total is capped at 100. The best ``min(profile.clip_count,
-        settings.max_clips)`` segments become clips that start 5 s before the
-        segment (never below 0) and last ``profile.clip_length_seconds``.
-        Returns:
-            Clip candidates ordered by start time.
-        """
-        style_terms = {
-            "funny": ["react", "punchy", "mistake", "surprising"],
-            "informative": ["important", "practical", "takeaway", "explanation"],
-            "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
-            "educational": ["question", "answer", "context", "takeaway"],
-        }
-        preferred_terms = style_terms.get(profile.clip_style.lower(), [])
-        niche = _effective_niche(profile)
-        # A missing description must not be formatted as the literal word "None",
-        # which would become a profile term and match unrelated text.
-        description = profile.channel_description or ""
-        profile_terms = [
-            term
-            for term in f"{niche} {description}".lower().split()[:30]
-            if len(term) > 2
-        ]
-        scored: list[tuple[float, TranscriptSegment]] = []
-        for segment in transcript:
-            text = segment.text.lower()
-            score = 45.0
-            score += 12 if "?" in segment.text else 0
-            score += 8 if any(term in text for term in preferred_terms) else 0
-            score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
-            score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
-            score += 5 if any(term in text for term in profile_terms) else 0
-            score += min(len(segment.text) / 12, 10)
-            scored.append((min(score, 100), segment))
-        # Stable sort: equally scored segments keep transcript order.
-        scored.sort(key=lambda item: item[0], reverse=True)
-        clips: list[ClipCandidate] = []
-        clip_count = min(profile.clip_count, self.settings.max_clips)
-        for score, segment in scored[:clip_count]:
-            start = max(0.0, segment.start_seconds - 5.0)
-            end = start + float(profile.clip_length_seconds)
-            clips.append(
-                ClipCandidate(
-                    id=uuid4().hex,
-                    start_seconds=start,
-                    end_seconds=end,
-                    title=self._title_for(segment.text),
-                    reason=self._reason_for(profile, niche),
-                    score=round(score, 1),
-                    subtitle_text=segment.text,
-                    metadata={"model": "heuristic-fallback"},
-                )
-            )
-        return sorted(clips, key=lambda clip: clip.start_seconds)
-    def _title_for(self, text: str) -> str:
-        """Derive a short clip title from segment text.
-        Whitespace is collapsed and surrounding punctuation/brackets/quotes are
-        trimmed. Multi-word text keeps its first seven words; a single word keeps
-        its first 48 characters. The result is capped at 72 characters and falls
-        back to "Highlight" when nothing is left.
-        """
-        normalized = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'")
-        tokens = normalized.split()
-        candidate = " ".join(tokens[:7]) if len(tokens) > 1 else normalized[:48]
-        shortened = candidate[:72].rstrip()
-        return shortened if shortened else "Highlight"
-    def _reason_for(self, profile: ChannelProfile, niche: str) -> str:
-        """Return a one-sentence rationale in the profile's primary language.
-        Thai, Japanese, Chinese and Korean get a localized sentence built from the
-        localized style and niche words; any other language gets the English
-        sentence with the raw style and niche.
-        """
-        language = profile.primary_language.lower()
-        style = _localized_profile_word(profile.clip_style, language, "style")
-        niche_label = _localized_profile_word(niche, language, "niche")
-        # Checked in this order; the first language marker found in `language` wins.
-        localized_reasons = (
-            ("thai", f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}"),
-            ("japanese", f"{niche_label} の視聴者に合う {style} スタイルの候補です。"),
-            ("chinese", f"符合 {niche_label} 受众期待的 {style} 风格。"),
-            ("korean", f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다."),
-        )
-        for marker, sentence in localized_reasons:
-            if marker in language:
-                return sentence
-        return f"Matches the {profile.clip_style} style for a {niche} audience."
-def _effective_niche(profile: ChannelProfile) -> str:
-    """Return the custom niche when the niche is "other" and a custom value is set."""
-    uses_custom = profile.niche.lower() == "other" and profile.niche_custom
-    return profile.niche_custom if uses_custom else profile.niche
-def _localized_profile_word(value: str, language: str, group: str) -> str:
-    """Translate a niche or style keyword into the display language.
-    ``value`` is lower-cased with spaces turned into underscores to form the lookup
-    key. The first language whose name occurs inside ``language`` is used; when the
-    language, the group or the key is unknown, ``value`` is returned unchanged.
-    """
-    lookup_key = value.lower().replace(" ", "_")
-    thai = {
-        "niche": {
-            "education": "การศึกษา",
-            "gaming": "เกม",
-            "podcast": "พอดแคสต์",
-            "commentary": "เล่า/วิเคราะห์",
-            "cars": "รถยนต์",
-            "beauty": "บิวตี้",
-            "fitness": "ฟิตเนส",
-            "finance": "การเงิน",
-            "tech": "เทคโนโลยี",
-            "lifestyle": "ไลฟ์สไตล์",
-            "music": "ดนตรี",
-        },
-        "style": {
-            "informative": "ให้ข้อมูล",
-            "funny": "ตลก",
-            "dramatic": "ดราม่า",
-            "educational": "สอนเข้าใจง่าย",
-            "commentary": "วิเคราะห์",
-        },
-    }
-    japanese = {
-        "niche": {
-            "education": "教育",
-            "gaming": "ゲーム",
-            "podcast": "ポッドキャスト",
-            "commentary": "解説",
-            "cars": "車",
-            "beauty": "美容",
-            "fitness": "フィットネス",
-            "finance": "金融",
-            "tech": "テック",
-            "lifestyle": "ライフスタイル",
-            "music": "音楽",
-        },
-        "style": {
-            "informative": "情報性の高い",
-            "funny": "ユーモアのある",
-            "dramatic": "ドラマチックな",
-            "educational": "学びやすい",
-            "commentary": "解説型の",
-        },
-    }
-    chinese = {
-        "niche": {
-            "education": "教育",
-            "gaming": "游戏",
-            "podcast": "播客",
-            "commentary": "解说",
-            "cars": "汽车",
-            "beauty": "美妆",
-            "fitness": "健身",
-            "finance": "金融",
-            "tech": "科技",
-            "lifestyle": "生活方式",
-            "music": "音乐",
-        },
-        "style": {
-            "informative": "信息量高",
-            "funny": "有趣",
-            "dramatic": "戏剧化",
-            "educational": "教学型",
-            "commentary": "评论型",
-        },
-    }
-    korean = {
-        "niche": {
-            "education": "교육",
-            "gaming": "게임",
-            "podcast": "팟캐스트",
-            "commentary": "해설",
-            "cars": "자동차",
-            "beauty": "뷰티",
-            "fitness": "피트니스",
-            "finance": "금융",
-            "tech": "테크",
-            "lifestyle": "라이프스타일",
-            "music": "음악",
-        },
-        "style": {
-            "informative": "정보형",
-            "funny": "재미있는",
-            "dramatic": "극적인",
-            "educational": "교육형",
-            "commentary": "해설형",
-        },
-    }
-    # Order matters: the first language name found inside `language` wins.
-    tables = (
-        ("thai", thai),
-        ("japanese", japanese),
-        ("chinese", chinese),
-        ("korean", korean),
-    )
-    matched = next((table for name, table in tables if name in language), None)
-    if matched is None:
-        return value
-    return matched.get(group, {}).get(lookup_key, value)

backend/app/services/multimodal.py DELETED Viewed

@@ -1,200 +0,0 @@
-import os
-import subprocess
-import tempfile
-from app.core.config import Settings
-from app.models.schemas import ClipCandidate
-# Canned (visual note, visual score) pairs used by demo mode; `_demo_enrich`
-# assigns them to clips in rotation by clip index.
-_DEMO_VISUALS = [
-    ("High-energy scene with strong visual contrast and clear subject focus.", 88.0),
-    ("Close-up with expressive reactions — excellent engagement framing.", 92.0),
-    ("Dynamic motion sequence; subject well-lit with clean background.", 84.0),
-    ("Text-overlay-friendly composition with natural colour grading.", 79.0),
-    ("Wide establishing shot; strong emotional beat in middle frames.", 81.0),
-]
-class QwenVisualAnalyzer:
-    def __init__(self, settings: Settings) -> None:
-        """Store the settings; the model and processor are loaded lazily."""
-        # Both stay None until `_load_model` runs on the first production call.
-        self._processor = None
-        self._model = None
-        self.settings = settings
-    def enrich(self, video_path: str, clips: list[ClipCandidate]) -> list[ClipCandidate]:
-        """Attach visual metadata to each clip.
-        Demo mode uses canned notes. Otherwise the Qwen2-VL path runs, and if it
-        raises, the clips are returned unchanged.
-        """
-        if self.settings.demo_mode:
-            return self._demo_enrich(clips)
-        try:
-            enriched = self._qwen_enrich(video_path, clips)
-        except Exception:
-            # Visual analysis is optional; keep the text-only candidates.
-            enriched = clips
-        return enriched
-    # ------------------------------------------------------------------
-    # Demo mode
-    # ------------------------------------------------------------------
-    def _demo_enrich(self, clips: list[ClipCandidate]) -> list[ClipCandidate]:
-        """Return copies of ``clips`` tagged with canned visual notes and scores.
-        Entries of ``_DEMO_VISUALS`` are assigned in rotation by clip position and
-        merged over the clip's existing metadata.
-        """
-        preset_count = len(_DEMO_VISUALS)
-        result: list[ClipCandidate] = []
-        for position, clip in enumerate(clips):
-            note, vscore = _DEMO_VISUALS[position % preset_count]
-            demo_metadata = dict(clip.metadata)
-            demo_metadata.update(
-                visual_model="demo",
-                visual_note=note,
-                visual_score=vscore,
-            )
-            result.append(clip.model_copy(update={"metadata": demo_metadata}))
-        return result
-    # ------------------------------------------------------------------
-    # Production mode — Qwen2-VL on ROCm
-    # ------------------------------------------------------------------
-    def _load_model(self) -> None:
-        """Load the Qwen2-VL model and its processor onto the instance.
-        Both objects are loaded into locals first and only then stored, so a
-        failure while loading the processor cannot leave ``_model`` set with
-        ``_processor`` still ``None``. ``_qwen_enrich`` only checks ``_model``, so
-        such a half-loaded state would never be retried.
-        Raises:
-            RuntimeError: ``torch`` or ``transformers`` cannot be imported.
-        """
-        try:
-            import torch
-            from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-        except ImportError as exc:
-            raise RuntimeError("transformers + ROCm PyTorch are required for Qwen2-VL") from exc
-        # Unknown dtype names fall back to bfloat16.
-        dtype = getattr(torch, self.settings.preferred_torch_dtype, torch.bfloat16)
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            self.settings.qwen_vl_model_id,
-            torch_dtype=dtype,
-            device_map="auto",
-            trust_remote_code=True,
-            token=self.settings.hf_token or None,
-        )
-        processor = AutoProcessor.from_pretrained(
-            self.settings.qwen_vl_model_id,
-            trust_remote_code=True,
-            token=self.settings.hf_token or None,
-        )
-        # `_model` is the "loaded" sentinel, so it is assigned last.
-        self._processor = processor
-        self._model = model
-    def _qwen_enrich(self, video_path: str, clips: list[ClipCandidate]) -> list[ClipCandidate]:
-        """Score each clip's sampled frames with Qwen2-VL.
-        A clip with no extractable frames is passed through untouched. A clip
-        whose analysis raises is returned with ``visual_status`` set to
-        "analysis_failed" instead of aborting the whole batch.
-        """
-        if self._model is None:
-            self._load_model()
-        enriched = []
-        for candidate in clips:
-            try:
-                stills = _sample_frames(
-                    video_path,
-                    candidate.start_seconds,
-                    candidate.end_seconds,
-                    self.settings.ffmpeg_binary,
-                )
-                if stills:
-                    note, vscore = self._analyze(stills, candidate.title)
-                    visual_fields = {
-                        "visual_model": self.settings.qwen_vl_model_id,
-                        "visual_note": note,
-                        "visual_score": vscore,
-                    }
-                    outcome = candidate.model_copy(
-                        update={"metadata": {**candidate.metadata, **visual_fields}}
-                    )
-                else:
-                    outcome = candidate
-            except Exception:
-                failure_fields = {
-                    "visual_model": self.settings.qwen_vl_model_id,
-                    "visual_status": "analysis_failed",
-                }
-                outcome = candidate.model_copy(
-                    update={"metadata": {**candidate.metadata, **failure_fields}}
-                )
-            enriched.append(outcome)
-        return enriched
-    def _analyze(self, frames: list, title: str) -> tuple[str, float]:
-        """Ask Qwen2-VL to describe the frames and rate their engagement potential.
-        The model is asked for a short description followed by ``SCORE: <integer>``.
-        The last ``score: <number>`` occurrence (any letter case, anywhere in a
-        line) supplies the score, so replies such as ``Score: 85/100`` or an inline
-        ``... SCORE: 85`` are understood, and non-numeric values such as ``NaN``
-        can never reach the result. Without a usable number the score is 75.
-        Returns:
-            ``(note, score)`` where ``note`` is the reply text before the score
-            marker (or the whole reply if that is empty) and ``score`` is clamped
-            to the range 0-100.
-        """
-        import re
-        import torch
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    *[{"type": "image", "image": f} for f in frames],
-                    {
-                        "type": "text",
-                        "text": (
-                            f'These frames are from a clip titled "{title}". '
-                            "Describe the visual quality and short-form engagement potential in 1-2 sentences. "
-                            "Then output exactly: SCORE: <integer 0-100>"
-                        ),
-                    },
-                ],
-            }
-        ]
-        text = self._processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = self._processor(text=[text], images=frames, return_tensors="pt").to(self._model.device)
-        with torch.no_grad():
-            ids = self._model.generate(**inputs, max_new_tokens=140)
-        # Decode only the newly generated tokens, not the echoed prompt.
-        reply = self._processor.batch_decode(
-            ids[:, inputs["input_ids"].shape[1]:],
-            skip_special_tokens=True,
-        )[0].strip()
-        vscore = 75.0
-        score_marks = list(
-            re.finditer(r"\bscore\s*:\s*(-?\d+(?:\.\d+)?)", reply, flags=re.IGNORECASE)
-        )
-        if score_marks:
-            last_mark = score_marks[-1]
-            vscore = float(last_mark.group(1))
-            note = reply[: last_mark.start()].strip() or reply
-        else:
-            # No numeric score: still drop a trailing "SCORE:" marker if present.
-            note = reply.split("SCORE:")[0].strip() or reply
-        return note, min(max(vscore, 0.0), 100.0)
-# ------------------------------------------------------------------
-# Frame extraction helper
-# ------------------------------------------------------------------
-def _sample_frames(video_path: str, start: float, end: float, ffmpeg: str, n: int = 4) -> list:
-    """Extract up to ``n`` evenly spaced RGB frames from ``[start, end]`` with ffmpeg.
-    Each frame is written to a temporary JPEG, loaded with Pillow and the
-    temporary file is removed afterwards. A frame whose extraction times out,
-    fails, or cannot be decoded is skipped without discarding the others.
-    Args:
-        video_path: Source video passed to ffmpeg's ``-i``.
-        start: First timestamp in seconds.
-        end: Last timestamp in seconds; the sampled span is at least one second.
-        ffmpeg: ffmpeg executable to run.
-        n: Number of timestamps to sample.
-    Returns:
-        The decoded frames; an empty list when Pillow is unavailable or nothing
-        could be extracted.
-    """
-    try:
-        from PIL import Image
-    except ImportError:
-        return []
-    duration = max(end - start, 1.0)
-    timestamps = [start + duration * i / max(n - 1, 1) for i in range(n)]
-    frames = []
-    tmp_files = []
-    try:
-        for ts in timestamps:
-            fd, tmp = tempfile.mkstemp(suffix=".jpg")
-            os.close(fd)
-            tmp_files.append(tmp)
-            try:
-                result = subprocess.run(
-                    [
-                        ffmpeg,
-                        "-ss", f"{ts:.3f}",
-                        "-i", video_path,
-                        "-vframes", "1",
-                        "-q:v", "2",
-                        "-y", tmp,
-                    ],
-                    capture_output=True,
-                    timeout=15,
-                )
-            except subprocess.TimeoutExpired:
-                # One slow seek should not throw away frames already extracted.
-                continue
-            if result.returncode != 0:
-                continue
-            try:
-                # Close the file handle before cleanup; convert() returns an
-                # independent in-memory image, so the frame outlives the file.
-                with Image.open(tmp) as image:
-                    frames.append(image.convert("RGB"))
-            except Exception:
-                continue
-    finally:
-        for tmp in tmp_files:
-            try:
-                os.unlink(tmp)
-            except OSError:
-                pass
-    return frames

backend/app/services/pipeline.py DELETED Viewed

@@ -1,236 +0,0 @@
-import asyncio
-from pathlib import Path
-from app.core.config import Settings
-from app.core.timing import TimingLog
-from app.models.schemas import ChannelProfile, ClipCandidate
-from app.services.clips import ClipGenerator
-from app.services.highlight import QwenHighlightDetector
-from app.services.multimodal import QwenVisualAnalyzer
-from app.services.transcription import WhisperTranscriber
-from app.services.video_input import resolve_youtube_url
-from app.storage import JobStore
-class VideoPipeline:
-    def __init__(self, settings: Settings, store: JobStore) -> None:
-        """Wire up the pipeline stages around shared settings and a job store."""
-        self.settings = settings
-        self.store = store
-        # One collaborator per stage; all of them receive the same settings.
-        self.transcriber = WhisperTranscriber(settings)
-        self.highlight_detector = QwenHighlightDetector(settings)
-        self.visual_analyzer = QwenVisualAnalyzer(settings)
-        # The clip generator is the only stage that is also handed the store.
-        self.clip_generator = ClipGenerator(settings, store)
-    async def process_source(
-        self,
-        job_id: str,
-        source_kind: str,
-        source_value: str,
-        profile: ChannelProfile,
-    ) -> None:
-        """Run the full six-step pipeline for one job and record progress in the store.
-        Steps, in order: input, transcription, highlight_detection,
-        multimodal_analysis, clip_generation, finalizing. Each step updates the job
-        record (progress, message, step counters) before it starts.
-        Args:
-            job_id: Job whose record and directory are updated.
-            source_kind: "youtube" resolves ``source_value`` through
-                ``resolve_youtube_url``; anything else treats it as a local path.
-            source_value: URL or file path, depending on ``source_kind``.
-            profile: Channel profile passed to transcription, detection and rendering.
-        Any exception is caught and stored on the job as status "failed" with
-        ``error=str(exc)``; nothing is re-raised to the caller.
-        """
-        timings = TimingLog()
-        try:
-            self.store.update_job(
-                job_id,
-                status="running",
-                progress=0.04,
-                message="Preparing video input",
-                current_step="input",
-                step_index=1,
-                step_total=6,
-            )
-            with timings.measure("input"):
-                if source_kind == "youtube":
-                    video_path = await resolve_youtube_url(
-                        source_value, self.store.job_dir(job_id), self.settings
-                    )
-                else:
-                    video_path = Path(source_value)
-            self.store.update_job(
-                job_id,
-                progress=0.18,
-                message="Transcribing with Whisper Large V3",
-                current_step="transcription",
-                step_index=2,
-                step_total=6,
-            )
-            # Blocking model work runs in a worker thread via asyncio.to_thread.
-            with timings.measure("transcription"):
-                transcript = await asyncio.to_thread(
-                    self.transcriber.transcribe, str(video_path), profile
-                )
-            self.store.write_json(
-                job_id,
-                "transcript.json",
-                [segment.model_dump(mode="json") for segment in transcript],
-            )
-            # Publish the transcript on the job record before the later steps finish.
-            self.store.update_job(
-                job_id,
-                progress=0.42,
-                message="Transcript ready",
-                transcript=transcript,
-                timings=timings.to_dict(),
-            )
-            self.store.update_job(
-                job_id,
-                progress=0.48,
-                message="Scoring highlights with Qwen",
-                current_step="highlight_detection",
-                step_index=3,
-                step_total=6,
-            )
-            with timings.measure("highlight_detection"):
-                clips = await asyncio.to_thread(self.highlight_detector.detect, transcript, profile)
-            self.store.update_job(
-                job_id,
-                progress=0.62,
-                message="Checking visual highlights",
-                current_step="multimodal_analysis",
-                step_index=4,
-                step_total=6,
-            )
-            with timings.measure("multimodal_analysis"):
-                clips = await asyncio.to_thread(self.visual_analyzer.enrich, str(video_path), clips)
-            clip_total = len(clips)
-            self.store.update_job(
-                job_id,
-                progress=0.72,
-                message=f"Preparing to render {clip_total} clips",
-                current_step="clip_generation",
-                step_index=5,
-                step_total=6,
-                active_clip_index=0,
-                active_clip_total=clip_total,
-            )
-            # Progress callback handed to the clip generator.
-            def update_render_progress(index: int, total: int) -> None:
-                # Rendering maps onto the 0.72-0.94 progress band; `index - 1`
-                # counts the clips finished before the one being reported.
-                progress = 0.72 + (0.22 * ((index - 1) / max(total, 1)))
-                self.store.update_job(
-                    job_id,
-                    progress=min(progress, 0.94),
-                    message=f"Rendering clip {index}/{total}",
-                    current_step="clip_generation",
-                    step_index=5,
-                    step_total=6,
-                    active_clip_index=index,
-                    active_clip_total=total,
-                    timings=timings.to_dict(),
-                )
-            with timings.measure("clip_generation"):
-                rendered = await asyncio.to_thread(
-                    self.clip_generator.generate,
-                    job_id,
-                    video_path,
-                    clips,
-                    transcript,
-                    profile,
-                    update_render_progress,
-                )
-            self.store.update_job(
-                job_id,
-                progress=0.97,
-                message="Finalizing clips",
-                current_step="finalizing",
-                step_index=6,
-                step_total=6,
-                active_clip_index=clip_total,
-                active_clip_total=clip_total,
-                timings=timings.to_dict(),
-            )
-            self.store.write_json(
-                job_id, "clips.json", [clip.model_dump(mode="json") for clip in rendered]
-            )
-            self.store.update_job(
-                job_id,
-                status="completed",
-                progress=1,
-                message="Clips ready",
-                current_step="completed",
-                step_index=6,
-                step_total=6,
-                active_clip_index=clip_total,
-                active_clip_total=clip_total,
-                transcript=transcript,
-                clips=rendered,
-                timings=timings.to_dict(),
-            )
-        except Exception as exc:
-            # Record the failure on the job instead of propagating it.
-            self.store.update_job(
-                job_id,
-                status="failed",
-                progress=1,
-                message="Processing failed",
-                current_step="failed",
-                error=str(exc),
-                timings=timings.to_dict(),
-            )
-    def patch_clip(self, job_id: str, clip_id: str, updates: dict) -> ClipCandidate:
-        """Apply non-``None`` field updates to one clip of a job and store the result.
-        If the update leaves ``end_seconds`` at or before ``start_seconds``, the end
-        is moved to one second after the start.
-        Raises:
-            KeyError: no clip of the job has id ``clip_id``.
-        """
-        job = self.store.get_job(job_id)
-        result: ClipCandidate | None = None
-        updated_clips: list[ClipCandidate] = []
-        for existing in job.clips:
-            if existing.id != clip_id:
-                updated_clips.append(existing)
-                continue
-            changes = {
-                field: new_value
-                for field, new_value in updates.items()
-                if new_value is not None
-            }
-            candidate = existing.model_copy(update=changes)
-            if candidate.end_seconds <= candidate.start_seconds:
-                candidate = candidate.model_copy(
-                    update={"end_seconds": candidate.start_seconds + 1}
-                )
-            result = candidate
-            updated_clips.append(candidate)
-        if result is None:
-            raise KeyError(clip_id)
-        self.store.update_job(job_id, clips=updated_clips)
-        return result
-    def regenerate_clip(
-        self,
-        job_id: str,
-        clip_id: str,
-        clip_style: str | None = None,
-        clip_length_seconds: int | None = None,
-        subtitle_text: str | None = None,
-    ) -> ClipCandidate:
-        """Re-render one clip of a job with optional style, length and subtitle overrides.
-        Overrides that are ``None`` leave the stored profile or clip value in
-        place. The re-rendered clip is flagged with ``metadata["regenerated"]`` and
-        the job's clip list is saved with the new clip in the old one's position.
-        Raises:
-            FileNotFoundError: the job directory holds no ``source.*`` file.
-            KeyError: no clip of the job has id ``clip_id``.
-        """
-        snapshot = self.store.get_job(job_id)
-        source_path = self._source_path(job_id)
-        clips: list[ClipCandidate] = []
-        regenerated: ClipCandidate | None = None
-        # `index` is the clip's 1-based position, forwarded to `render_one`.
-        for index, clip in enumerate(snapshot.clips, start=1):
-            if clip.id == clip_id:
-                profile = snapshot.profile.model_copy(
-                    update={
-                        key: value
-                        for key, value in {
-                            "clip_style": clip_style,
-                            "clip_length_seconds": clip_length_seconds,
-                        }.items()
-                        if value is not None
-                    }
-                )
-                if clip_length_seconds is not None:
-                    clip = clip.model_copy(
-                        update={"end_seconds": clip.start_seconds + clip_length_seconds}
-                    )
-                if subtitle_text is not None:
-                    clip = clip.model_copy(update={"subtitle_text": subtitle_text})
-                clip = self.clip_generator.render_one(
-                    job_id, source_path, clip, snapshot.transcript, profile, index
-                )
-                # Build a new metadata dict rather than mutating in place: the
-                # copies above are shallow, so the dict may still be the one held
-                # by the stored snapshot.
-                clip = clip.model_copy(
-                    update={"metadata": {**clip.metadata, "regenerated": True}}
-                )
-                regenerated = clip
-            clips.append(clip)
-        if regenerated is None:
-            raise KeyError(clip_id)
-        self.store.update_job(job_id, clips=clips)
-        return regenerated
-    def _source_path(self, job_id: str) -> Path:
-        """Return the job's ``source.*`` file, the first in sorted order.
-        Raises:
-            FileNotFoundError: the job directory contains no ``source.*`` file.
-        """
-        candidates = sorted(self.store.job_dir(job_id).glob("source.*"))
-        if candidates:
-            return candidates[0]
-        raise FileNotFoundError("source video missing")

backend/app/services/subtitles.py DELETED Viewed

@@ -1,151 +0,0 @@
-import re
-from pathlib import Path
-from app.models.schemas import TranscriptSegment
-def seconds_to_srt_time(value: float) -> str:
-    millis = int(round(value * 1000))
-    hours, remainder = divmod(millis, 3_600_000)
-    minutes, remainder = divmod(remainder, 60_000)
-    seconds, millis = divmod(remainder, 1000)
-    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
-def write_srt(
-    path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]
-) -> list[dict]:
-    cues: list[dict] = []
-    rows: list[str] = []
-    index = 1
-    for segment in segments:
-        if segment.end_seconds < clip_start or segment.start_seconds > clip_end:
-            continue
-        start = max(0.0, segment.start_seconds - clip_start)
-        end = min(clip_end - clip_start, segment.end_seconds - clip_start)
-        for cue in split_timed_caption(segment.text, start, max(end, start + 1.2)):
-            rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"]))
-            cues.append(cue)
-            index += 1
-    if not rows:
-        cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}]
-        rows = _srt_row(1, 0.0, 3.0, "")
-    path.write_text("\n".join(rows), encoding="utf-8")
-    return cues
-def write_single_caption_srt(path: Path, duration: float, text: str) -> list[dict]:
-    safe_duration = max(duration, 1.0)
-    cues = split_timed_caption(text, 0.0, safe_duration)
-    rows: list[str] = []
-    for index, cue in enumerate(cues, start=1):
-        rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"]))
-    if not rows:
-        cues = [{"start_seconds": 0.0, "end_seconds": min(safe_duration, 3.0), "text": ""}]
-        rows = _srt_row(1, cues[0]["start_seconds"], cues[0]["end_seconds"], "")
-    path.write_text("\n".join(rows), encoding="utf-8")
-    return cues
-def write_srt_from_cues(path: Path, cues: list) -> list[dict]:
-    """Write SRT using user-supplied per-cue timing (preferred over auto-distribution).
-    Accepts list of objects with .start_seconds / .end_seconds / .text attributes
-    (Pydantic SubtitleCue) or dicts with the same keys.
-    """
-    rows: list[str] = []
-    out_cues: list[dict] = []
-    index = 1
-    for cue in cues:
-        start = float(getattr(cue, "start_seconds", None) or cue.get("start_seconds", 0))
-        end = float(getattr(cue, "end_seconds", None) or cue.get("end_seconds", 0))
-        text = str(getattr(cue, "text", None) or cue.get("text", ""))
-        if end <= start:
-            end = start + 1.0
-        clean_text = text.strip()
-        if not clean_text:
-            continue
-        rows.extend(_srt_row(index, start, end, clean_text))
-        out_cues.append({"start_seconds": round(start, 3), "end_seconds": round(end, 3), "text": clean_text})
-        index += 1
-    if not rows:
-        out_cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}]
-        rows = _srt_row(1, 0.0, 3.0, "")
-    path.write_text("\n".join(rows), encoding="utf-8")
-    return out_cues
-def split_timed_caption(text: str, start: float, end: float) -> list[dict]:
-    phrases = split_caption_text(text)
-    if not phrases:
-        return []
-    total_duration = max(end - start, 1.2)
-    max_cues = max(1, int(total_duration / 1.2))
-    if len(phrases) > max_cues:
-        phrases = _merge_phrases(phrases, max_cues)
-    cue_duration = min(4.0, max(1.2, total_duration / len(phrases)))
-    cues: list[dict] = []
-    cursor = start
-    for index, phrase in enumerate(phrases):
-        remaining = len(phrases) - index
-        max_end = end - ((remaining - 1) * 1.2)
-        cue_end = min(max_end, cursor + cue_duration)
-        cue_end = max(cue_end, cursor + 1.2)
-        if index == len(phrases) - 1:
-            cue_end = end
-        cues.append(
-            {
-                "start_seconds": round(cursor, 3),
-                "end_seconds": round(max(cue_end, cursor + 0.8), 3),
-                "text": phrase,
-            }
-        )
-        cursor = cue_end
-    return cues
-def split_caption_text(text: str, max_chars: int = 42, max_words: int = 7) -> list[str]:
-    clean = re.sub(r"\s+", " ", text.strip())
-    if not clean:
-        return []
-    words = clean.split()
-    if len(words) <= 1:
-        return [clean[index : index + max_chars] for index in range(0, len(clean), max_chars)]
-    phrases: list[str] = []
-    current: list[str] = []
-    for word in words:
-        candidate = " ".join([*current, word]).strip()
-        punctuation_break = bool(current and re.search(r"[,.!?;:]$", current[-1]))
-        if current and (len(candidate) > max_chars or len(current) >= max_words or punctuation_break):
-            phrases.append(" ".join(current))
-            current = [word]
-        else:
-            current.append(word)
-    if current:
-        phrases.append(" ".join(current))
-    return phrases
-def _merge_phrases(phrases: list[str], target_count: int) -> list[str]:
-    if target_count <= 1:
-        return [" ".join(phrases)]
-    merged: list[str] = []
-    bucket_size = len(phrases) / target_count
-    for index in range(target_count):
-        start = round(index * bucket_size)
-        end = round((index + 1) * bucket_size)
-        merged.append(" ".join(phrases[start:end]).strip())
-    return [phrase for phrase in merged if phrase]
-def _srt_row(index: int, start: float, end: float, text: str) -> list[str]:
-    return [
-        str(index),
-        f"{seconds_to_srt_time(start)} --> {seconds_to_srt_time(end)}",
-        text.strip(),
-        "",
-    ]

backend/app/services/transcription.py DELETED Viewed

@@ -1,366 +0,0 @@
-from pathlib import Path
-from uuid import uuid4
-from app.core.config import Settings
-from app.models.schemas import ChannelProfile, SubtitleCue, TranscriptSegment
-from app.utils.rocm import torch_device_index
-class WhisperTranscriber:
-    def __init__(self, settings: Settings) -> None:
-        self.settings = settings
-        self._pipeline = None
-    def transcribe(self, video_path: str, profile: ChannelProfile) -> list[TranscriptSegment]:
-        if self.settings.demo_mode:
-            return self._demo_transcript(profile)
-        try:
-            from transformers import pipeline
-        except Exception as exc:
-            raise RuntimeError("transformers is required for Whisper transcription") from exc
-        if self._pipeline is None:
-            self._pipeline = pipeline(
-                task="automatic-speech-recognition",
-                model=self.settings.whisper_model_id,
-                device=torch_device_index(),
-                token=self.settings.hf_token,
-                chunk_length_s=30,
-                return_timestamps=True,
-            )
-        generate_kwargs = {"task": "transcribe"}
-        if profile.primary_language and profile.primary_language.lower() != "auto":
-            generate_kwargs["language"] = profile.primary_language.lower()
-        result = self._pipeline(str(video_path), generate_kwargs=generate_kwargs)
-        chunks = result.get("chunks") or []
-        if not chunks:
-            text = result.get("text", "").strip()
-            return [
-                TranscriptSegment(
-                    id=uuid4().hex,
-                    start_seconds=0,
-                    end_seconds=max(profile.clip_length_seconds, 15),
-                    text=text,
-                    language=profile.primary_language,
-                )
-            ]
-        segments: list[TranscriptSegment] = []
-        for chunk in chunks:
-            timestamp = chunk.get("timestamp") or (0, 0)
-            start = float(timestamp[0] or 0)
-            end = float(timestamp[1] or start + 5)
-            text = (chunk.get("text") or "").strip()
-            if text:
-                segments.append(
-                    TranscriptSegment(
-                        id=uuid4().hex,
-                        start_seconds=start,
-                        end_seconds=max(end, start + 1),
-                        text=text,
-                        language=profile.primary_language,
-                    )
-                )
-        return segments
-    def align_words(
-        self,
-        video_path: str | Path,
-        text: str,
-        clip_start: float,
-        clip_end: float,
-    ) -> list[SubtitleCue]:
-        """Estimate per-word/per-phrase timing within [clip_start, clip_end].
-        Demo mode: split the text into chunks of ~3 words, distribute timings
-        across the clip duration. Production: run Whisper word-level timestamps.
-        Returns SubtitleCues with timing relative to clip_start.
-        """
-        if self.settings.demo_mode or not text.strip():
-            return self._demo_align_words(text, clip_start, clip_end)
-        try:
-            return self._whisper_align_words(video_path, text, clip_start, clip_end)
-        except Exception:
-            return self._demo_align_words(text, clip_start, clip_end)
-    def _demo_align_words(
-        self, text: str, clip_start: float, clip_end: float
-    ) -> list[SubtitleCue]:
-        clean = " ".join(text.split())
-        if not clean:
-            return [SubtitleCue(start_seconds=0.0, end_seconds=2.0, text="")]
-        words = clean.split()
-        # Group into ~3 word chunks (typical for short-form caption pacing)
-        chunk_size = max(2, min(4, max(1, len(words) // 6)))
-        chunks: list[str] = []
-        for i in range(0, len(words), chunk_size):
-            chunks.append(" ".join(words[i : i + chunk_size]))
-        duration = max(0.5, clip_end - clip_start)
-        per = duration / len(chunks)
-        cues: list[SubtitleCue] = []
-        for i, chunk in enumerate(chunks):
-            cue_start = round(i * per, 3)
-            cue_end = round((i + 1) * per, 3)
-            cues.append(
-                SubtitleCue(
-                    start_seconds=cue_start,
-                    end_seconds=max(cue_end, cue_start + 0.4),
-                    text=chunk,
-                )
-            )
-        return cues
-    def _whisper_align_words(
-        self, video_path: str | Path, text: str, clip_start: float, clip_end: float
-    ) -> list[SubtitleCue]:
-        try:
-            from transformers import pipeline
-        except Exception as exc:
-            raise RuntimeError("transformers is required for word-level timestamps") from exc
-        if self._pipeline is None:
-            self._pipeline = pipeline(
-                task="automatic-speech-recognition",
-                model=self.settings.whisper_model_id,
-                device=torch_device_index(),
-                token=self.settings.hf_token,
-                chunk_length_s=30,
-                return_timestamps="word",
-            )
-        result = self._pipeline(
-            str(video_path),
-            generate_kwargs={"task": "transcribe"},
-            return_timestamps="word",
-        )
-        chunks = result.get("chunks") or []
-        # Filter to chunks inside [clip_start, clip_end] and convert to relative time
-        cues: list[SubtitleCue] = []
-        buffer_words: list[tuple[str, float, float]] = []
-        for chunk in chunks:
-            ts = chunk.get("timestamp") or (0, 0)
-            start = float(ts[0] or 0)
-            end = float(ts[1] or start + 0.3)
-            word = (chunk.get("text") or "").strip()
-            if not word:
-                continue
-            if end < clip_start or start > clip_end:
-                continue
-            buffer_words.append(
-                (word, max(0.0, start - clip_start), min(clip_end - clip_start, end - clip_start))
-            )
-        # Group into ~3 word phrases
-        chunk_size = 3
-        for i in range(0, len(buffer_words), chunk_size):
-            group = buffer_words[i : i + chunk_size]
-            text_chunk = " ".join(w for w, _, _ in group)
-            cue_start = group[0][1]
-            cue_end = group[-1][2]
-            cues.append(
-                SubtitleCue(
-                    start_seconds=round(cue_start, 3),
-                    end_seconds=round(max(cue_end, cue_start + 0.4), 3),
-                    text=text_chunk,
-                )
-            )
-        return cues if cues else self._demo_align_words(text, clip_start, clip_end)
-    def _demo_transcript(self, profile: ChannelProfile) -> list[TranscriptSegment]:
-        style = profile.clip_style.lower()
-        language = profile.primary_language.lower()
-        niche_value = (
-            profile.niche_custom
-            if profile.niche.lower() == "other" and profile.niche_custom
-            else profile.niche
-        )
-        niche = niche_value.lower()
-        creator_context = (
-            profile.channel_description
-            or "The creator wants clips that feel useful and easy to share."
-        )
-        lines = _demo_lines(
-            language,
-            _localized_profile_word(niche, language, "niche"),
-            _localized_profile_word(style, language, "style"),
-            creator_context,
-        )
-        segments: list[TranscriptSegment] = []
-        cursor = 0.0
-        for line in lines:
-            end = cursor + 15.0
-            segments.append(
-                TranscriptSegment(
-                    id=uuid4().hex,
-                    start_seconds=cursor,
-                    end_seconds=end,
-                    text=line,
-                    language=profile.primary_language,
-                )
-            )
-            cursor = end
-        return segments
-def _demo_lines(language: str, niche: str, style: str, creator_context: str) -> list[str]:
-    if "thai" in language:
-        return [
-            "ช่วงเปิดนี้วางปัญหาหลักของครีเอเตอร์ เวลาวิดีโอยาวซ่อนช่วงที่ดีที่สุดไว้",
-            "นี่คือความผิดพลาดที่หลายทีมทำ คือเลือกคลิปจากยอดวิวอย่างเดียว",
-            "คำถามสำคัญคือ ช่วงไหนที่จะทำให้คนหยุดเลื่อนหน้าจอได้ทันที",
-            f"สำหรับช่องแนว {niche} คำตอบจะเปลี่ยน เพราะผู้ชมคาดหวังจังหวะที่ {style}",
-            f"บริบทของช่องคือ {creator_context}",
-            "ช่วงนี้อธิบายได้ชัดที่สุด และมีภาพเปรียบเทียบก่อนกับหลังที่แรง",
-            "จากนั้นแขกรับเชิญตอบสนองด้วยประโยคสั้นที่เหมาะมากสำหรับ hook",
-            "ตรงนี้มีข้อคิดที่เอาไปใช้ได้ทันที และยืนเป็นคลิปสั้นได้ด้วยตัวเอง",
-            "ช่วงท้ายสรุปไอเดียด้วยประโยคชัด ๆ ที่ทำซับได้ง่าย",
-        ]
-    if "japanese" in language:
-        return [
-            "この冒頭では、長い動画に最高の瞬間が埋もれてしまう問題を示しています。",
-            "多くのチームが再生数だけでクリップを選ぶという意外なミスをしています。",
-            "大事な問いは、この瞬間が今すぐスクロールを止めさせるかどうかです。",
-            f"{niche} チャンネルでは、視聴者が {style} なテンポを期待するため答えが変わります。",
-            f"チャンネルの文脈はこうです。{creator_context}",
-            "この部分は説明が最も明確で、ビフォーアフターの対比も強いです。",
-            "その後、ゲストが短いフックとして使いやすい一言で反応します。",
-            "ここには単独の短尺クリップとして成立する実用的な学びがあります。",
-            "最後の部分は字幕にしやすい明確な一言でアイデアをまとめます。",
-        ]
-    if "chinese" in language:
-        return [
-            "这个开头点出了创作者常遇到的问题：长视频里藏着最好的瞬间。",
-            "很多团队都会犯一个意外错误，只根据播放量来选择剪辑片段。",
-            "关键问题很简单：哪个瞬间能让观众立刻停下滑动？",
-            f"对于 {niche} 频道，答案会不同，因为观众期待 {style} 的节奏。",
-            f"频道背景是：{creator_context}",
-            "这一段解释最清楚，并且有很强的前后对比。",
-            "接着嘉宾给出一句有冲击力的回应，很适合作为短视频 hook。",
-            "这里有一个实用结论，足够独立成为一个短视频片段。",
-            "最后一段用一句清晰的话收束观点，也很适合做字幕。",
-        ]
-    if "korean" in language:
-        return [
-            "이 오프닝은 긴 영상 속 좋은 순간이 묻히는 문제를 보여줍니다.",
-            "많은 팀이 조회수만 보고 클립을 고르는 의외의 실수를 합니다.",
-            "핵심 질문은 간단합니다. 어떤 순간이 시청자의 스크롤을 멈추게 할까요?",
-            f"{niche} 채널에서는 시청자가 {style} 리듬을 기대하기 때문에 답이 달라집니다.",
-            f"채널 맥락은 다음과 같습니다. {creator_context}",
-            "이 부분은 설명이 가장 명확하고 전후 대비도 강합니다.",
-            "그다음 게스트가 짧은 훅으로 쓰기 좋은 강한 한마디를 합니다.",
-            "여기에는 단독 숏폼 클립으로도 충분한 실용적인 takeaway가 있습니다.",
-            "마지막 부분은 자막으로 만들기 쉬운 명확한 문장으로 아이디어를 정리합니다.",
-        ]
-    return [
-        "This opening sets up the main problem creators face when a long video hides the best moments.",
-        "Here is the surprising mistake most teams make when they choose clips only by view count.",
-        "The important question is simple: which moment would make someone stop scrolling right now?",
-        f"For a {niche} channel, the answer changes because the audience expects a {style} rhythm.",
-        f"The channel context is simple: {creator_context}",
-        "This section has the clearest explanation and a strong before-and-after contrast.",
-        "Then the guest reacts with a punchy line that works well as a short hook.",
-        "A practical takeaway lands here, with enough context to stand alone as a sixty second clip.",
-        "The final segment wraps the idea with a direct callout that is easy to subtitle.",
-    ]
-def _localized_profile_word(value: str, language: str, group: str) -> str:
-    key = value.lower().replace(" ", "_")
-    localized = {
-        "thai": {
-            "niche": {
-                "education": "การศึกษา",
-                "gaming": "เกม",
-                "podcast": "พอดแคสต์",
-                "commentary": "เล่า/วิเคราะห์",
-                "cars": "รถยนต์",
-                "beauty": "บิวตี้",
-                "fitness": "ฟิตเนส",
-                "finance": "การเงิน",
-                "tech": "เทคโนโลยี",
-                "lifestyle": "ไลฟ์สไตล์",
-                "music": "ดนตรี",
-            },
-            "style": {
-                "informative": "ให้ข้อมูล",
-                "funny": "ตลก",
-                "dramatic": "ดราม่า",
-                "educational": "สอนเข้าใจง่าย",
-                "commentary": "วิเคราะห์",
-            },
-        },
-        "japanese": {
-            "niche": {
-                "education": "教育",
-                "gaming": "ゲーム",
-                "podcast": "ポッドキャスト",
-                "commentary": "解説",
-                "cars": "車",
-                "beauty": "美容",
-                "fitness": "フィットネス",
-                "finance": "金融",
-                "tech": "テック",
-                "lifestyle": "ライフスタイル",
-                "music": "音楽",
-            },
-            "style": {
-                "informative": "情報性の高い",
-                "funny": "ユーモアのある",
-                "dramatic": "ドラマチックな",
-                "educational": "学びやすい",
-                "commentary": "解説型の",
-            },
-        },
-        "chinese": {
-            "niche": {
-                "education": "教育",
-                "gaming": "游戏",
-                "podcast": "播客",
-                "commentary": "解说",
-                "cars": "汽车",
-                "beauty": "美妆",
-                "fitness": "健身",
-                "finance": "金融",
-                "tech": "科技",
-                "lifestyle": "生活方式",
-                "music": "音乐",
-            },
-            "style": {
-                "informative": "信息量高",
-                "funny": "有趣",
-                "dramatic": "戏剧化",
-                "educational": "教学型",
-                "commentary": "评论型",
-            },
-        },
-        "korean": {
-            "niche": {
-                "education": "교육",
-                "gaming": "게임",
-                "podcast": "팟캐스트",
-                "commentary": "해설",
-                "cars": "자동차",
-                "beauty": "뷰티",
-                "fitness": "피트니스",
-                "finance": "금융",
-                "tech": "테크",
-                "lifestyle": "라이프스타일",
-                "music": "음악",
-            },
-            "style": {
-                "informative": "정보형",
-                "funny": "재미있는",
-                "dramatic": "극적인",
-                "educational": "교육형",
-                "commentary": "해설형",
-            },
-        },
-    }
-    for language_key, groups in localized.items():
-        if language_key in language:
-            return groups.get(group, {}).get(key, value)
-    return value

backend/app/services/video_input.py DELETED Viewed

@@ -1,80 +0,0 @@
-import asyncio
-import shutil
-import subprocess
-from pathlib import Path
-from fastapi import UploadFile
-from app.core.config import Settings
-async def save_upload(upload: UploadFile, job_dir: Path) -> Path:
-    suffix = Path(upload.filename or "upload.mp4").suffix or ".mp4"
-    destination = job_dir / f"source{suffix.lower()}"
-    with destination.open("wb") as handle:
-        while chunk := await upload.read(1024 * 1024):
-            handle.write(chunk)
-    return destination
-async def resolve_youtube_url(url: str, job_dir: Path, settings: Settings) -> Path:
-    if settings.demo_mode:
-        return await asyncio.to_thread(create_demo_video, job_dir, settings)
-    try:
-        import yt_dlp
-    except Exception as exc:
-        raise RuntimeError("yt-dlp is required for YouTube ingestion") from exc
-    output_template = str(job_dir / "source.%(ext)s")
-    ydl_opts = {
-        "outtmpl": output_template,
-        "format": "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/best",
-        "merge_output_format": "mp4",
-        "quiet": True,
-        "noprogress": True,
-    }
-    def download() -> Path:
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([url])
-        matches = sorted(job_dir.glob("source.*"))
-        if not matches:
-            raise RuntimeError("yt-dlp finished without producing a video")
-        return matches[0]
-    return await asyncio.to_thread(download)
-def create_demo_video(job_dir: Path, settings: Settings) -> Path:
-    destination = job_dir / "source.mp4"
-    ffmpeg = shutil.which(settings.ffmpeg_binary)
-    if not ffmpeg:
-        destination.write_bytes(b"")
-        return destination
-    command = [
-        ffmpeg,
-        "-y",
-        "-f",
-        "lavfi",
-        "-i",
-        "testsrc2=size=1280x720:rate=30:duration=120",
-        "-f",
-        "lavfi",
-        "-i",
-        "sine=frequency=660:sample_rate=48000:duration=120",
-        "-shortest",
-        "-c:v",
-        "libx264",
-        "-pix_fmt",
-        "yuv420p",
-        "-c:a",
-        "aac",
-        str(destination),
-    ]
-    try:
-        subprocess.run(command, check=True, capture_output=True, text=True, timeout=45)
-    except Exception:
-        destination.write_bytes(b"")
-    return destination

backend/app/storage.py DELETED Viewed

@@ -1,58 +0,0 @@
-import json
-from pathlib import Path
-from uuid import uuid4
-from app.core.config import Settings
-from app.models.schemas import ChannelProfile, JobSnapshot, utc_now
-class JobStore:
-    def __init__(self, settings: Settings) -> None:
-        self.settings = settings
-        self.root = settings.storage_dir
-        self.jobs_root = self.root / "jobs"
-        self.jobs_root.mkdir(parents=True, exist_ok=True)
-    def create_job(self, profile: ChannelProfile, source: dict) -> JobSnapshot:
-        job_id = uuid4().hex
-        job_dir = self.job_dir(job_id)
-        job_dir.mkdir(parents=True, exist_ok=True)
-        snapshot = JobSnapshot(
-            id=job_id,
-            status="queued",
-            progress=0,
-            message="Queued",
-            source=source,
-            profile=profile,
-        )
-        self.save_job(snapshot)
-        return snapshot
-    def job_dir(self, job_id: str) -> Path:
-        return self.jobs_root / job_id
-    def media_url(self, job_id: str, filename: str) -> str:
-        return f"/media/jobs/{job_id}/{filename}"
-    def save_job(self, snapshot: JobSnapshot) -> JobSnapshot:
-        snapshot.updated_at = utc_now()
-        path = self.job_dir(snapshot.id) / "job.json"
-        path.write_text(snapshot.model_dump_json(indent=2), encoding="utf-8")
-        return snapshot
-    def get_job(self, job_id: str) -> JobSnapshot:
-        path = self.job_dir(job_id) / "job.json"
-        if not path.exists():
-            raise FileNotFoundError(job_id)
-        data = json.loads(path.read_text(encoding="utf-8"))
-        return JobSnapshot.model_validate(data)
-    def update_job(self, job_id: str, **updates) -> JobSnapshot:
-        snapshot = self.get_job(job_id)
-        updated = snapshot.model_copy(update=updates)
-        return self.save_job(updated)
-    def write_json(self, job_id: str, filename: str, payload: object) -> Path:
-        path = self.job_dir(job_id) / filename
-        path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
-        return path

backend/app/utils/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """Runtime helpers."""

backend/app/utils/rocm.py DELETED Viewed

@@ -1,33 +0,0 @@
-from typing import Any
-def detect_accelerator() -> dict[str, Any]:
-    try:
-        import torch
-    except Exception as exc:
-        return {
-            "torch_available": False,
-            "cuda_api_available": False,
-            "rocm_hip_version": None,
-            "device_name": None,
-            "error": str(exc),
-        }
-    cuda_available = bool(torch.cuda.is_available())
-    device_name = torch.cuda.get_device_name(0) if cuda_available else None
-    return {
-        "torch_available": True,
-        "cuda_api_available": cuda_available,
-        "rocm_hip_version": getattr(torch.version, "hip", None),
-        "cuda_version": getattr(torch.version, "cuda", None),
-        "device_name": device_name,
-        "device_count": torch.cuda.device_count() if cuda_available else 0,
-    }
-def torch_device_index() -> int:
-    try:
-        import torch
-    except Exception:
-        return -1
-    return 0 if torch.cuda.is_available() else -1

backend/app/workers/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """Optional async workers."""

backend/app/workers/celery_app.py DELETED Viewed

@@ -1,15 +0,0 @@
-from celery import Celery
-from app.core.config import get_settings
-settings = get_settings()
-celery_app = Celery("ai_clip_studio", broker=settings.redis_url, backend=settings.redis_url)
-celery_app.conf.task_serializer = "json"
-celery_app.conf.result_serializer = "json"
-celery_app.conf.accept_content = ["json"]
-@celery_app.task(name="pipeline.process_job")
-def process_job(job_id: str) -> str:
-    return f"Queued job {job_id}. FastAPI background tasks are active by default."

backend/main.py ADDED Viewed

	@@ -0,0 +1,466 @@

+"""ElevenClip AI — FastAPI Backend.
+Endpoints:
+  POST /api/video-info          — get YouTube metadata (no download)
+  POST /api/process             — full pipeline (download/upload → clips)
+  WS   /ws/progress/{session}   — real-time pipeline progress
+  GET  /api/clips/{session}     — list generated clips
+  PATCH /api/clips/{session}/{index}/subtitles — update subtitle event
+  PATCH /api/clips/{session}/{index}/style     — apply global style override
+  POST /api/clips/{session}/{index}/render     — burn-in subtitles → download
+  GET  /downloads/{session}/{filename}         — serve output files
+"""
+import asyncio
+import json
+import os
+import uuid
+from pathlib import Path
+from typing import Optional
+from fastapi import FastAPI, UploadFile, File, Form, Header, WebSocket, WebSocketDisconnect, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+from loguru import logger
+from src.gpu.rocm_utils import get_device, log_gpu_status
+from src.gpu.vllm_manager import ensure_vllm_running, vllm_stop, vllm_status
+from src.ingestion.youtube import download_video_async, get_video_info
+from src.transcription.whisper import transcribe_async, extract_audio
+from src.analysis.scene_detector import detect_scenes, sample_frames
+from src.analysis.vision import analyze_scenes_batch_async
+from src.analysis.highlight_scorer import score_scenes, select_top_clips
+from src.processing.clip_extractor import extract_all_clips_async, burn_subtitles
+from src.processing.subtitle import generate_subtitles, update_subtitle_event, apply_global_style_override
+from src.processing.high_retention import apply_hre
+# Application object; title and version are passed straight to FastAPI.
+app = FastAPI(title="ElevenClip AI", version="1.0.0")
+# NOTE(review): wildcard origins/methods/headers combined with
+# allow_credentials=True is the most permissive CORS setup possible — confirm
+# this is intended for anything beyond a public demo.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Root directory for per-session working files; overridable via WORK_DIR.
+WORK_DIR = Path(os.getenv("WORK_DIR", "/tmp/elevnclip"))
+WORK_DIR.mkdir(parents=True, exist_ok=True)
+# Optional shared secret; an empty value disables the check in _require_access.
+DEMO_ACCESS_CODE = os.getenv("DEMO_ACCESS_CODE", "").strip()
+# Upper bound on simultaneously running pipelines (checked in POST /api/process).
+MAX_CONCURRENT_JOBS = int(os.getenv("MAX_CONCURRENT_JOBS", "1"))
+# Upload size cap in megabytes (checked in POST /api/process).
+MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "500"))
+# Everything under WORK_DIR is served statically below /downloads.
+app.mount("/downloads", StaticFiles(directory=str(WORK_DIR)), name="downloads")
+# In-memory session store + WebSocket registry
+sessions: dict[str, dict] = {}
+ws_connections: dict[str, WebSocket] = {}
+ws_queues: dict[str, list[dict]] = {}  # buffer progress messages until WS connects
+# Session ids of pipelines that currently hold a job slot.
+active_jobs: set[str] = set()
+def _require_access(x_demo_key: Optional[str]) -> None:
+    """Optional public-demo guard for expensive GPU endpoints."""
+    if DEMO_ACCESS_CODE and (x_demo_key or "").strip() != DEMO_ACCESS_CODE:
+        raise HTTPException(403, "Access code required for generation")
+# ─── Startup ──────────────────────────────────────────────────────────────
+@app.on_event("startup")
+async def startup():
+    log_gpu_status()
+    # Pre-populate demo session so /editor?session=demo always works
+    sessions["demo"] = {"status": "done", "clips": _build_demo_clips()}
+def _build_demo_clips() -> list[dict]:
+    return [
+        {
+            "index": 1, "start": 0.0, "end": 45.0, "duration": 45.0, "score": 0.91,
+            "clip_path": None, "final_path": None, "ass_path": None,
+            "download_url": None, "raw_url": None,
+            "highlight_reason": "High-energy moment with peak audience reaction",
+            "vision_analysis": {"excitement_score": 0.92, "tiktok_potential": 0.89, "emotion": "excited", "action_type": "gaming"},
+        },
+        {
+            "index": 2, "start": 90.0, "end": 150.0, "duration": 60.0, "score": 0.83,
+            "clip_path": None, "final_path": None, "ass_path": None,
+            "download_url": None, "raw_url": None,
+            "highlight_reason": "Funny reaction — peak humor level detected",
+            "vision_analysis": {"excitement_score": 0.78, "tiktok_potential": 0.85, "emotion": "funny", "action_type": "reaction"},
+        },
+        {
+            "index": 3, "start": 210.0, "end": 270.0, "duration": 60.0, "score": 0.76,
+            "clip_path": None, "final_path": None, "ass_path": None,
+            "download_url": None, "raw_url": None,
+            "highlight_reason": "Educational highlight with strong engagement signal",
+            "vision_analysis": {"excitement_score": 0.70, "tiktok_potential": 0.80, "emotion": "happy", "action_type": "tutorial"},
+        },
+    ]
+# ─── WebSocket Progress ────────────────────────────────────────────────────
+@app.websocket("/ws/progress/{session_id}")
+async def ws_progress(websocket: WebSocket, session_id: str):
+    await websocket.accept()
+    ws_connections[session_id] = websocket
+    # Flush messages that were sent before the WS connected
+    for msg in ws_queues.pop(session_id, []):
+        try:
+            await websocket.send_json(msg)
+        except Exception:
+            break
+    try:
+        while True:
+            await asyncio.sleep(30)  # keep-alive
+    except WebSocketDisconnect:
+        ws_connections.pop(session_id, None)
+async def send_progress(session_id: str, stage: str, pct: int, message: str = ""):
+    payload = {"stage": stage, "pct": pct, "message": message}
+    sessions.setdefault(session_id, {})["last_progress"] = payload
+    ws = ws_connections.get(session_id)
+    if ws:
+        try:
+            await ws.send_json(payload)
+            return
+        except Exception:
+            ws_connections.pop(session_id, None)
+    # WS not yet connected — buffer for flush on connect
+    ws_queues.setdefault(session_id, []).append(payload)
+# ─── Models ───────────────────────────────────────────────────────────────
+class VideoInfoRequest(BaseModel):
+    """Request body for POST /api/video-info."""
+    # URL that the endpoint hands to get_video_info.
+    url: str
+DEMO_VIDEO_DIR = Path("/root/ElevenClip-AI/demo_videos")
+_DEMO_CANDIDATES = ["demo1.mp4", "demo2.mp4", "demo.mp4"]
+def _get_demo_video() -> Path | None:
+    import random
+    available = [DEMO_VIDEO_DIR / f for f in _DEMO_CANDIDATES if (DEMO_VIDEO_DIR / f).exists()]
+    return random.choice(available) if available else None
+class ProcessSettings(BaseModel):
+    """Job settings parsed from the settings_json form field of POST /api/process."""
+    # Video source: the pipeline tries the demo video first (when requested and
+    # available), then youtube_url, then the uploaded file.
+    youtube_url: Optional[str] = None
+    use_demo_video: bool = False
+    # Passed to the vision analysis step together with clip_style.
+    channel_description: str = ""
+    clip_style: str = "entertaining"
+    # Passed to scene scoring and clip selection.
+    target_duration: int = 60
+    clip_count: int = 3
+    # Passed to the transcription step.
+    clip_language: str = "auto"
+    subtitle_language: str = "en"
+    mode: str = "normal"  # "normal" | "hre"
+    aspect_mode: str = "crop"  # "crop" | "letterbox"
+    # Passed to generate_subtitles when mode is not "hre".
+    style_config: dict = {}
+class SubtitlePatch(BaseModel):
+    """Request body for PATCH /api/clips/{session}/{index}/subtitles."""
+    # Index of the subtitle event to change; forwarded to update_subtitle_event.
+    event_index: int
+    updates: dict  # {text, start, end}
+class GlobalStylePatch(BaseModel):
+    """Request body for PATCH /api/clips/{session}/{index}/style."""
+    # Style overrides forwarded to apply_global_style_override.
+    style_config: dict
+# ─── Routes ───────────────────────────────────────────────────────────────
+@app.get("/health")
+async def health():
+    """Liveness probe: returns status "ok" plus the value reported by get_device()."""
+    return {"status": "ok", "device": get_device()}
+@app.post("/api/video-info")
+async def video_info(req: VideoInfoRequest, x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key")):
+    _require_access(x_demo_key)
+    try:
+        return get_video_info(req.url)
+    except Exception as e:
+        raise HTTPException(400, str(e))
+@app.post("/api/process")
+async def process(
+    settings_json: str = Form(...),
+    file: Optional[UploadFile] = File(None),
+    x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key"),
+):
+    """Main pipeline endpoint. Returns session_id immediately; progress via WebSocket."""
+    _require_access(x_demo_key)
+    if len(active_jobs) >= MAX_CONCURRENT_JOBS:
+        raise HTTPException(429, "GPU is busy. Please try again in a few minutes.")
+    settings = ProcessSettings(**json.loads(settings_json))
+    session_id = str(uuid.uuid4())
+    session_dir = WORK_DIR / session_id
+    session_dir.mkdir(parents=True, exist_ok=True)
+    sessions[session_id] = {"status": "starting", "clips": []}
+    # Read file bytes NOW — UploadFile becomes invalid once the response is sent
+    file_bytes: Optional[bytes] = None
+    file_name: Optional[str] = None
+    if file:
+        file_bytes = await file.read()
+        file_name = file.filename or "upload.mp4"
+        if len(file_bytes) > MAX_UPLOAD_MB * 1024 * 1024:
+            raise HTTPException(413, f"File too large. Max upload size is {MAX_UPLOAD_MB} MB.")
+    active_jobs.add(session_id)
+    asyncio.create_task(_run_pipeline(session_id, session_dir, settings, file_bytes, file_name))
+    return {"session_id": session_id}
+# ─── Pipeline ─────────────────────────────────────────────────────────────
+async def _run_pipeline(
+    session_id: str,
+    session_dir: Path,
+    settings: ProcessSettings,
+    file_bytes: Optional[bytes],
+    file_name: Optional[str],
+):
+    loop = asyncio.get_running_loop()
+    frames_dir = session_dir / "frames"
+    try:
+        # ── 1. Acquire video ──────────────────────────────────────────────
+        await send_progress(session_id, "download", 5, "Acquiring video...")
+        if settings.use_demo_video and (demo_vid := _get_demo_video()):
+            video_path = demo_vid
+            await send_progress(session_id, "download", 30, f"Using demo video: {demo_vid.name}")
+        elif settings.youtube_url:
+            def pct_cb(p):
+                asyncio.run_coroutine_threadsafe(
+                    send_progress(session_id, "download", max(5, int(p * 0.28)), f"Downloading {p:.0f}%"),
+                    loop,
+                )
+            video_path = await download_video_async(
+                settings.youtube_url, session_dir, session_id, pct_cb
+            )
+        elif file_bytes:
+            suffix = Path(file_name).suffix if file_name else ".mp4"
+            video_path = session_dir / f"{session_id}_input{suffix}"
+            await loop.run_in_executor(None, video_path.write_bytes, file_bytes)
+        else:
+            raise ValueError("No video source provided")
+        await send_progress(session_id, "download", 30, "Video ready")
+        # ── 2. Extract audio ─────────────────────────────────────────────
+        await send_progress(session_id, "audio", 32, "Extracting audio (16kHz mono)...")
+        audio_path = session_dir / f"{session_id}_audio.wav"
+        await loop.run_in_executor(None, lambda: extract_audio(video_path, audio_path))
+        # ── 3+4. Scene detection AND Whisper transcription IN PARALLEL ───
+        # Scene detection runs on CPU; Whisper runs on AMD GPU. True concurrency.
+        await send_progress(session_id, "scenes", 35, "Scene detection + Whisper transcription (parallel on AMD ROCm)...")
+        device = get_device()
+        scenes_future = loop.run_in_executor(None, lambda: detect_scenes(video_path))
+        transcript_task = transcribe_async(
+            audio_path,
+            clip_language=settings.clip_language,
+            subtitle_language=settings.subtitle_language,
+            device=device,
+        )
+        scenes, transcript = await asyncio.gather(scenes_future, transcript_task)
+        await send_progress(
+            session_id, "transcribe", 58,
+            f"Whisper: {len(transcript.get('segments', []))} segments | SceneDetect: {len(scenes)} scenes"
+        )
+        # Frame sampling (after scenes list is known)
+        scenes_with_frames = await loop.run_in_executor(
+            None, lambda: sample_frames(video_path, scenes, frames_dir)
+        )
+        # ── 5. Qwen2.5-VL multimodal analysis (concurrent requests to vLLM) ─
+        n_scenes = len(scenes_with_frames)
+        await send_progress(session_id, "vision", 58, "Ensuring AI model is running...")
+        await loop.run_in_executor(
+            None,
+            lambda: ensure_vllm_running(
+                progress_cb=lambda msg: asyncio.run_coroutine_threadsafe(
+                    send_progress(session_id, "vision", 59, msg), loop
+                )
+            ),
+        )
+        await send_progress(session_id, "vision", 60, f"Qwen2.5-VL analyzing {n_scenes} scenes (vision + audio + text fusion)...")
+        scenes_analyzed = await analyze_scenes_batch_async(
+            scenes_with_frames,
+            transcript.get("segments", []),
+            channel_description=settings.channel_description,
+            clip_style=settings.clip_style,
+        )
+        await send_progress(session_id, "vision", 76, f"Multimodal analysis complete: {n_scenes} scenes scored")
+        # ── 6. Multi-signal scoring ─────────────────────────────────────
+        await send_progress(session_id, "scoring", 77, "Scoring: 0.40×vision + 0.35×audio_energy + 0.25×text_keywords")
+        scored = score_scenes(scenes_analyzed, audio_path, settings.clip_style, settings.target_duration)
+        selected = select_top_clips(scored, settings.clip_count, settings.target_duration)
+        # ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
+        await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
+        clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=settings.aspect_mode)
+        # ── 8. Subtitles / HRE (all clips in parallel) ─────────────────
+        await send_progress(session_id, "subtitles", 86, "Generating subtitles (parallel)...")
+        subtitle_tasks = []
+        final_clips = []
+        for clip in clips:
+            if not clip.get("clip_path"):
+                continue
+            clip_path = Path(clip["clip_path"])
+            i = clip["clip_index"]
+            clip_transcript = {
+                **transcript,
+                "segments": [
+                    s for s in transcript.get("segments", [])
+                    if s["start"] < clip["end"] and s["end"] > clip["start"]
+                ],
+            }
+            ass_path = session_dir / f"{session_id}_clip_{i:02d}.ass"
+            final_path = session_dir / f"{session_id}_clip_{i:02d}_final.mp4"
+            if settings.mode == "hre":
+                subtitle_tasks.append(loop.run_in_executor(
+                    None,
+                    lambda cp=clip_path, cd=clip, tr=clip_transcript, fp=final_path:
+                        apply_hre(cp, cd, tr, fp)
+                ))
+            else:
+                def _gen_and_burn(cp=clip_path, ap=ass_path, tr=clip_transcript, cs=clip["start"], fp=final_path):
+                    generate_subtitles(tr, ap, settings.style_config, clip_start_offset=cs)
+                    burn_subtitles(cp, ap, fp)
+                subtitle_tasks.append(loop.run_in_executor(None, _gen_and_burn))
+            final_clips.append({
+                "index": i,
+                "start": clip["start"],
+                "end": clip["end"],
+                "duration": clip["end"] - clip["start"],
+                "score": clip.get("final_score", 0),
+                "clip_path": str(clip_path),
+                "final_path": str(final_path),
+                "ass_path": str(ass_path) if settings.mode == "normal" else None,
+                "download_url": f"/downloads/{session_id}/{final_path.name}",
+                "raw_url": f"/downloads/{session_id}/{clip_path.name}",
+                "vision_analysis": clip.get("vision_analysis", {}),
+                "highlight_reason": clip.get("vision_analysis", {}).get("highlight_reason", ""),
+            })
+        if subtitle_tasks:
+            await asyncio.gather(*subtitle_tasks)
+        sessions[session_id] = {"status": "done", "clips": final_clips}
+        await send_progress(session_id, "done", 100, f"Done! {len(final_clips)} clips ready for download.")
+    except Exception as e:
+        logger.exception(f"Pipeline failed [{session_id}]")
+        sessions[session_id] = {"status": "error", "error": str(e), "clips": []}
+        await send_progress(session_id, "error", 0, f"Pipeline error: {e}")
+    finally:
+        active_jobs.discard(session_id)
+# ─── Editor API ───────────────────────────────────────────────────────────
+@app.get("/api/clips/{session_id}")
+async def get_clips(session_id: str):
+    session = sessions.get(session_id)
+    if not session:
+        raise HTTPException(404, "Session not found")
+    return session
+@app.patch("/api/clips/{session_id}/{clip_index}/subtitles")
+async def patch_subtitle(session_id: str, clip_index: int, patch: SubtitlePatch):
+    clip = _get_clip_or_404(session_id, clip_index)
+    if not clip.get("ass_path"):
+        raise HTTPException(404, "No subtitle file for this clip")
+    update_subtitle_event(Path(clip["ass_path"]), patch.event_index, patch.updates)
+    return {"ok": True}
+@app.patch("/api/clips/{session_id}/{clip_index}/style")
+async def patch_global_style(session_id: str, clip_index: int, patch: GlobalStylePatch):
+    clip = _get_clip_or_404(session_id, clip_index)
+    if not clip.get("ass_path"):
+        raise HTTPException(404, "No subtitle file for this clip")
+    apply_global_style_override(Path(clip["ass_path"]), patch.style_config)
+    return {"ok": True}
+@app.post("/api/clips/{session_id}/{clip_index}/render")
+async def render_clip(session_id: str, clip_index: int):
+    clip = _get_clip_or_404(session_id, clip_index)
+    clip_path = Path(clip["clip_path"])
+    ass_path = Path(clip["ass_path"]) if clip.get("ass_path") else None
+    final_path = clip_path.parent / f"{clip_path.stem}_edited.mp4"
+    if ass_path and ass_path.exists():
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, lambda: burn_subtitles(clip_path, ass_path, final_path))
+    else:
+        final_path = Path(clip["final_path"])
+    download_url = f"/downloads/{session_id}/{final_path.name}"
+    clip["download_url"] = download_url
+    clip["final_path"] = str(final_path)
+    return {"download_url": download_url}
+def _get_clip_or_404(session_id: str, clip_index: int) -> dict:
+    session = sessions.get(session_id)
+    if not session:
+        raise HTTPException(404, "Session not found")
+    clip = next((c for c in session.get("clips", []) if c["index"] == clip_index), None)
+    if not clip:
+        raise HTTPException(404, f"Clip {clip_index} not found")
+    return clip
+# ─── vLLM management endpoints ────────────────────────────────────────────────
+@app.get("/api/vllm/status")
+async def get_vllm_status():
+    """Return whatever vllm_status() reports; no access code is required."""
+    return vllm_status()
+@app.post("/api/vllm/stop")
+async def stop_vllm(x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key")):
+    _require_access(x_demo_key)
+    loop = asyncio.get_running_loop()
+    await loop.run_in_executor(None, vllm_stop)
+    return {"ok": True, "message": "vLLM stopped — will restart automatically on next job"}
+@app.post("/api/vllm/start")
+async def start_vllm(x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key")):
+    _require_access(x_demo_key)
+    loop = asyncio.get_running_loop()
+    await loop.run_in_executor(None, ensure_vllm_running)
+    return {"ok": True, "status": vllm_status()}
+# Direct execution: log GPU status, then serve the app with uvicorn on all
+# interfaces, port 8000, with auto-reload disabled.
+if __name__ == "__main__":
+    import uvicorn
+    log_gpu_status()
+    uvicorn.run(app, host="0.0.0.0", port=8000, reload=False)

backend/pyproject.toml DELETED Viewed

@@ -1,44 +0,0 @@
-[project]
-name = "elevenclip-ai-backend"
-version = "0.1.0"
-description = "FastAPI backend for ElevenClip.AI on AMD ROCm"
-requires-python = ">=3.11"
-dependencies = [
-  "fastapi>=0.115.0",
-  "uvicorn[standard]>=0.30.0",
-  "pydantic>=2.8.0",
-  "python-multipart>=0.0.9",
-  "yt-dlp>=2025.1.15",
-  "celery[redis]>=5.4.0",
-  "redis>=5.0.0"
-]
-[project.optional-dependencies]
-ai = [
-  "transformers>=4.47.0",
-  "accelerate>=1.2.0",
-  "sentencepiece>=0.2.0",
-  "safetensors>=0.4.5",
-  "Pillow>=10.0.0",
-  "qwen-vl-utils>=0.0.8"
-]
-rocm-inference = [
-  "vllm>=0.6.6",
-  "optimum-amd>=0.1.0; platform_system == 'Linux'"
-]
-dev = [
-  "pytest>=8.3.0",
-  "httpx>=0.27.0",
-  "ruff>=0.6.0"
-]
-[build-system]
-requires = ["setuptools>=69.0"]
-build-backend = "setuptools.build_meta"
-[tool.setuptools.packages.find]
-include = ["app*"]
-[tool.ruff]
-line-length = 100
-target-version = "py311"

backend/requirements.txt ADDED Viewed

	@@ -0,0 +1,37 @@

+# FastAPI & server
+fastapi==0.115.5
+uvicorn[standard]==0.32.1
+python-multipart==0.0.20
+websockets==14.1
+aiofiles==24.1.0
+httpx==0.28.1
+# Video download
+yt-dlp==2025.4.30
+# Video processing (ffmpeg called via subprocess — no Python wrapper needed)
+scenedetect[opencv]==0.6.5.2
+librosa==0.10.2
+soundfile==0.12.1
+# AI — Whisper STT (ROCm-optimized)
+# PyTorch must be installed separately with ROCm wheels:
+#   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
+transformers==4.47.1
+accelerate==1.2.1
+# AI — Vision: Qwen2.5-VL via vLLM OpenAI-compatible API
+# vLLM installed separately:
+#   pip install vllm --extra-index-url https://download.pytorch.org/whl/rocm6.2
+openai==1.57.4
+# Subtitles
+pysubs2==1.7.3
+# Utils
+numpy==1.26.4
+pillow==11.0.0
+python-dotenv==1.0.1
+pydantic==2.10.4
+pydantic-settings==2.7.0
+loguru==0.7.3

backend/src/__init__.py ADDED Viewed

File without changes

backend/src/analysis/__init__.py ADDED Viewed

File without changes

backend/src/analysis/highlight_scorer.py ADDED Viewed

	@@ -0,0 +1,166 @@

+"""Multi-signal highlight scoring: Vision + Audio energy + Text keywords."""
+import math
+from pathlib import Path
+from loguru import logger
+# Style-specific keyword boosts
+STYLE_KEYWORDS = {
+    "funny": ["haha", "lol", "funny", "joke", "laugh", "omg", "what", "no way", "ตลก", "ฮา", "โอ้โห", "搞笑", "哈哈"],
+    "serious": ["important", "key", "must", "critical", "สำคัญ", "ต้อง", "หลัก", "重要", "关键"],
+    "educational": ["learn", "tip", "trick", "how", "why", "เรียน", "วิธี", "ทำไม", "学习", "方法", "技巧"],
+    "gaming": ["win", "lose", "boss", "kill", "score", "level", "ชนะ", "แพ้", "赢", "输"],
+    "entertainment": ["wow", "amazing", "incredible", "unbelievable", "เจ๋ง", "เยี่ยม", "厉害", "太棒了"],
+}
+def compute_audio_energy(audio_path: Path, scenes: list[dict]) -> list[float]:
+    """Compute RMS energy per scene using librosa."""
+    try:
+        import librosa
+        import numpy as np
+        y, sr = librosa.load(str(audio_path), sr=16000, mono=True)
+        energies = []
+        for scene in scenes:
+            start_sample = int(scene["start"] * sr)
+            end_sample = int(scene["end"] * sr)
+            segment = y[start_sample:end_sample]
+            if len(segment) == 0:
+                energies.append(0.0)
+                continue
+            rms = float(np.sqrt(np.mean(segment ** 2)))
+            energies.append(rms)
+        # Normalize to 0-1
+        if max(energies) > 0:
+            max_e = max(energies)
+            energies = [e / max_e for e in energies]
+        return energies
+    except ImportError:
+        logger.warning("librosa not installed, using uniform audio energy")
+        return [0.5] * len(scenes)
+    except Exception as e:
+        logger.error(f"Audio energy computation failed: {e}")
+        return [0.5] * len(scenes)
+def compute_text_score(transcript_text: str, clip_style: str) -> float:
+    """Score transcript text based on style keywords (0-1)."""
+    if not transcript_text:
+        return 0.3
+    text_lower = transcript_text.lower()
+    keywords = STYLE_KEYWORDS.get(clip_style.lower(), [])
+    if not keywords:
+        return 0.3
+    hits = sum(1 for kw in keywords if kw in text_lower)
+    score = min(1.0, hits / max(len(keywords) * 0.2, 1))
+    return max(0.1, score)
+def score_scenes(
+    scenes_analyzed: list[dict],
+    audio_path: Path,
+    clip_style: str = "entertaining",
+    target_duration: int = 60,
+) -> list[dict]:
+    """Compute final highlight scores for all scenes.
+    Final score = 0.40 × vision + 0.35 × audio_energy + 0.25 × text_keywords
+    """
+    # Audio energy per scene
+    audio_energies = compute_audio_energy(audio_path, scenes_analyzed)
+    scored = []
+    for i, scene in enumerate(scenes_analyzed):
+        analysis = scene.get("vision_analysis", {})
+        vision_score = (
+            analysis.get("excitement_score", 0.5) * 0.5 +
+            analysis.get("tiktok_potential", 0.5) * 0.3 +
+            analysis.get("humor_level", 0.3) * 0.2
+        )
+        audio_score = audio_energies[i]
+        # Text from transcript segments overlapping this scene
+        transcript_text = scene.get("transcript_text", "")
+        text_score = compute_text_score(transcript_text, clip_style)
+        final_score = (
+            0.40 * vision_score +
+            0.35 * audio_score +
+            0.25 * text_score
+        )
+        # Penalize very short or very long scenes relative to target
+        duration = scene["duration"]
+        duration_penalty = 1.0 - abs(duration - target_duration) / max(target_duration * 2, 1)
+        duration_penalty = max(0.5, duration_penalty)
+        scored.append({
+            **scene,
+            "vision_score": round(vision_score, 3),
+            "audio_score": round(audio_score, 3),
+            "text_score": round(text_score, 3),
+            "final_score": round(final_score * duration_penalty, 3),
+        })
+    scored.sort(key=lambda s: s["final_score"], reverse=True)
+    logger.info(f"Top scene: {scored[0]['start']:.1f}s score={scored[0]['final_score']:.3f}" if scored else "No scenes")
+    return scored
+def select_top_clips(
+    scored_scenes: list[dict],
+    count: int,
+    target_duration: int,
+    min_gap_sec: float = 30.0,
+) -> list[dict]:
+    """Select top-N non-overlapping clips.
+    Merges adjacent high-scoring scenes to reach target_duration.
+    Ensures clips don't overlap (min_gap_sec between selections).
+    """
+    selected = []
+    used_ranges = []
+    for scene in scored_scenes:
+        if len(selected) >= count:
+            break
+        # Check overlap with already selected clips
+        overlaps = any(
+            abs(scene["start"] - used_start) < min_gap_sec
+            for used_start in used_ranges
+        )
+        if overlaps:
+            continue
+        # Adjust clip boundaries to match target_duration
+        clip = _adjust_clip_duration(scene, target_duration)
+        selected.append(clip)
+        used_ranges.append(clip["start"])
+    logger.info(f"Selected {len(selected)}/{count} clips")
+    return sorted(selected, key=lambda c: c["start"])
+def _adjust_clip_duration(scene: dict, target_sec: int) -> dict:
+    """Expand or shrink a scene to approximately target_sec."""
+    current_dur = scene["end"] - scene["start"]
+    if abs(current_dur - target_sec) < 5:
+        return scene
+    # Center the target window on the scene midpoint
+    mid = (scene["start"] + scene["end"]) / 2
+    half = target_sec / 2
+    new_start = max(0, mid - half)
+    new_end = new_start + target_sec
+    return {**scene, "start": new_start, "end": new_end, "duration": target_sec}

backend/src/analysis/scene_detector.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""Scene detection using PySceneDetect."""
+from pathlib import Path
+from typing import Optional
+from loguru import logger
+def detect_scenes(
+    video_path: Path,
+    threshold: float = 27.0,
+    min_scene_len_sec: float = 2.0,
+) -> list[dict]:
+    """Detect scene cuts and return list of scenes with timestamps.
+    Returns:
+        [{"start": float, "end": float, "duration": float}, ...]
+    """
+    try:
+        from scenedetect import open_video, SceneManager
+        from scenedetect.detectors import ContentDetector
+        video = open_video(str(video_path))
+        scene_manager = SceneManager()
+        scene_manager.add_detector(ContentDetector(threshold=threshold))
+        logger.info(f"Running scene detection on: {video_path.name}")
+        scene_manager.detect_scenes(video, show_progress=False)
+        scene_list = scene_manager.get_scene_list()
+        scenes = []
+        for start_tc, end_tc in scene_list:
+            start = start_tc.get_seconds()
+            end = end_tc.get_seconds()
+            duration = end - start
+            if duration >= min_scene_len_sec:
+                scenes.append({"start": start, "end": end, "duration": duration})
+        logger.info(f"Detected {len(scenes)} scenes")
+        if not scenes:
+            logger.warning("0 scenes from ContentDetector — using fixed-interval fallback")
+            return _fixed_interval_scenes(video_path, interval_sec=8.0)
+        return scenes
+    except ImportError:
+        logger.warning("scenedetect not installed, using fixed-interval fallback")
+        return _fixed_interval_scenes(video_path, interval_sec=5.0)
+    except Exception as e:
+        logger.error(f"Scene detection failed: {e}")
+        return _fixed_interval_scenes(video_path, interval_sec=5.0)
+def _fixed_interval_scenes(video_path: Path, interval_sec: float = 5.0) -> list[dict]:
+    """Fallback: split video into fixed-interval scenes."""
+    import subprocess
+    result = subprocess.run(
+        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+         "-of", "default=noprint_wrappers=1:nokey=1", str(video_path)],
+        capture_output=True, text=True
+    )
+    try:
+        total = float(result.stdout.strip())
+    except ValueError:
+        total = 300.0
+    scenes = []
+    t = 0.0
+    while t < total:
+        end = min(t + interval_sec, total)
+        scenes.append({"start": t, "end": end, "duration": end - t})
+        t = end
+    return scenes
+def sample_frames(
+    video_path: Path,
+    scenes: list[dict],
+    output_dir: Path,
+    frames_per_scene: int = 3,
+) -> list[dict]:
+    """Extract representative frames from each scene for vision analysis.
+    Returns scenes with added 'frame_paths' key.
+    """
+    import subprocess
+    output_dir.mkdir(parents=True, exist_ok=True)
+    result_scenes = []
+    for i, scene in enumerate(scenes):
+        mid = scene["start"] + scene["duration"] / 2
+        frame_paths = []
+        # Sample frames at start, middle, end of scene
+        timestamps = [
+            scene["start"] + scene["duration"] * 0.2,
+            mid,
+            scene["start"] + scene["duration"] * 0.8,
+        ][:frames_per_scene]
+        for j, ts in enumerate(timestamps):
+            frame_path = output_dir / f"scene_{i:04d}_frame_{j}.jpg"
+            cmd = [
+                "ffmpeg", "-y", "-ss", str(ts), "-i", str(video_path),
+                "-vframes", "1", "-q:v", "2", "-vf", "scale=640:-1",
+                str(frame_path)
+            ]
+            subprocess.run(cmd, capture_output=True)
+            if frame_path.exists():
+                frame_paths.append(str(frame_path))
+        result_scenes.append({**scene, "index": i, "frame_paths": frame_paths})
+    return result_scenes

backend/src/analysis/vision.py ADDED Viewed

	@@ -0,0 +1,305 @@

+"""Qwen2.5-VL multimodal scene analysis via vLLM OpenAI-compatible API.
+Sends video frames + transcript text together (true multimodal fusion).
+Outputs: excitement_score, face_bbox, action_type, humor_level, emotion.
+All scenes analyzed concurrently — vLLM handles GPU batching internally.
+"""
+import asyncio
+import base64
+import json
+import os
+from pathlib import Path
+from typing import Optional
+from loguru import logger
+# Connection settings for the OpenAI-compatible vLLM endpoint, each overridable
+# through the environment variable of the same name.
+# NOTE(review): the default port 8000 is the same port backend/main.py binds
+# with uvicorn — confirm deployments set VLLM_BASE_URL or run vLLM elsewhere.
+VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "http://localhost:8000/v1")
+VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
+VLLM_API_KEY = os.getenv("VLLM_API_KEY", "EMPTY")
+# Prompt template asking the model for a single JSON object. The doubled braces
+# suggest it is filled in with str.format using channel_description and
+# clip_style — presumably at the call site, which is outside this view.
+ANALYSIS_PROMPT = """You are a TikTok content expert analyzing a livestream segment for highlight potential.
+Analyze the provided video frames and transcript text together as a unified multimodal signal.
+Respond ONLY with valid JSON matching this exact schema — no markdown, no explanation:
+{{
+  "excitement_score": <0.0-1.0>,
+  "humor_level": <0.0-1.0>,
+  "emotion": "<neutral|happy|surprised|angry|sad|excited|funny>",
+  "action_type": "<talking|gaming|reaction|tutorial|entertainment|sports|other>",
+  "has_face": <true|false>,
+  "face_bbox": [<x1_pct>, <y1_pct>, <x2_pct>, <y2_pct>] or null,
+  "highlight_reason": "<one sentence: why this IS or isn't a good TikTok highlight>",
+  "tiktok_potential": <0.0-1.0>
+}}
+Channel context: {channel_description}
+Requested clip style: {clip_style}
+"""
+def _encode_image(image_path: str) -> str:
+    with open(image_path, "rb") as f:
+        return base64.b64encode(f.read()).decode("utf-8")
+def analyze_scene(
+    scene: dict,
+    transcript_text: str = "",
+    channel_description: str = "",
+    clip_style: str = "entertaining",
+) -> dict:
+    """Analyze a single scene using Qwen2.5-VL (vision + text multimodal fusion).
+    Sends up to 3 representative frames + transcript context to vLLM.
+    Returns analysis dict with excitement_score, face_bbox, etc.
+    """
+    try:
+        from openai import OpenAI
+        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
+        frame_paths = scene.get("frame_paths", [])
+        if not frame_paths:
+            return _default_analysis()
+        content = []
+        # Add up to 3 frames as base64 images
+        for frame_path in frame_paths[:3]:
+            if Path(frame_path).exists():
+                b64 = _encode_image(frame_path)
+                content.append({
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
+                })
+        if not content:
+            return _default_analysis()
+        prompt = ANALYSIS_PROMPT.format(
+            channel_description=channel_description or "General content creator",
+            clip_style=clip_style,
+        )
+        if transcript_text.strip():
+            prompt += f"\n\nTranscript for this segment:\n\"{transcript_text.strip()}\""
+        content.append({"type": "text", "text": prompt})
+        response = client.chat.completions.create(
+            model=VLLM_MODEL,
+            messages=[{"role": "user", "content": content}],
+            max_tokens=300,
+            temperature=0.1,
+        )
+        raw = response.choices[0].message.content.strip()
+        # Strip markdown code fences if present
+        if raw.startswith("```"):
+            parts = raw.split("```")
+            raw = parts[1] if len(parts) > 1 else raw
+            if raw.startswith("json"):
+                raw = raw[4:]
+        analysis = json.loads(raw.strip())
+        logger.debug(
+            f"Scene [{scene['start']:.1f}s-{scene['end']:.1f}s]: "
+            f"excitement={analysis.get('excitement_score', 0):.2f} "
+            f"tiktok={analysis.get('tiktok_potential', 0):.2f} | "
+            f"{analysis.get('highlight_reason', '')[:60]}"
+        )
+        try:
+            from src.gpu.vllm_manager import vllm_touch
+            vllm_touch()
+        except Exception:
+            pass
+        return analysis
+    except Exception as e:
+        logger.warning(f"Vision analysis failed at {scene.get('start', 0):.1f}s: {e}")
+        return _default_analysis()
+async def analyze_scenes_batch_async(
+    scenes_with_frames: list[dict],
+    transcript_segments: list[dict],
+    channel_description: str = "",
+    clip_style: str = "entertaining",
+) -> list[dict]:
+    """Analyze all scenes concurrently.
+    Sends all vLLM requests in parallel — the server queues and batches them
+    internally, giving full GPU utilization on AMD MI300X.
+    Each result includes 'vision_analysis' and 'transcript_text' for scoring.
+    """
+    loop = asyncio.get_running_loop()
+    async def _analyze_one(scene: dict) -> dict:
+        scene_text = " ".join(
+            seg["text"] for seg in transcript_segments
+            if seg["start"] < scene["end"] and seg["end"] > scene["start"]
+        )
+        analysis = await loop.run_in_executor(
+            None,
+            lambda s=scene, t=scene_text: analyze_scene(s, t, channel_description, clip_style),
+        )
+        return {**scene, "vision_analysis": analysis, "transcript_text": scene_text}
+    results = await asyncio.gather(*[_analyze_one(s) for s in scenes_with_frames])
+    logger.info(f"Vision analysis complete: {len(results)} scenes")
+    return list(results)
+def _default_analysis() -> dict:
+    """Fallback analysis when vLLM is unavailable (keeps pipeline running)."""
+    return {
+        "excitement_score": 0.5,
+        "humor_level": 0.3,
+        "emotion": "neutral",
+        "action_type": "talking",
+        "has_face": False,
+        "face_bbox": None,
+        "highlight_reason": "Vision model unavailable — using audio+text signals only",
+        "tiktok_potential": 0.4,
+    }
+# Prompt template for analyze_frame_for_hre(). It is filled with str.format(),
+# so the literal JSON braces are doubled ({{ }}); the placeholders are
+# {seg_idx}, {n_total} and {context}.
+HRE_SEGMENT_PROMPT = """Analyze this video frame for high-retention TikTok editing decisions.
+Segment {seg_idx} of {n_total}. Transcript: "{context}"
+Respond ONLY with valid JSON — no markdown:
+{{
+  "zoom_direction": "<in|out|hold>",
+  "zoom_speed": "<fast|slow>",
+  "face_detected": <true|false>,
+  "face_cx": <0.0-1.0>,
+  "face_cy": <0.0-1.0>,
+  "subtitle_position": "<top|bottom>",
+  "subtitle_color": "<white|yellow|cyan|orange|green>",
+  "energy_level": "<high|medium|low>",
+  "moment_type": "<hook|punchline|context|reaction|transition>"
+}}
+Rules:
+- seg_idx==0: always zoom_direction=in, zoom_speed=fast (hook the viewer)
+- zoom IN fast: punchlines, reactions, peak energy
+- zoom IN slow: context, buildup, moderate energy
+- zoom OUT: reveals, breathing room after intensity
+- HOLD: stable content, text-heavy moments
+- subtitle TOP: face is in bottom half → put text at top
+- subtitle BOTTOM: face is in top half → text at bottom
+- face_cx/face_cy: face center as 0.0-1.0 fraction of frame
+"""
+def analyze_frame_for_hre(
+    frame_path: "Path",
+    context: str = "",
+    seg_idx: int = 0,
+    n_total: int = 1,
+) -> dict:
+    """Per-segment HRE: zoom direction, subtitle position+color for this moment."""
+    try:
+        from openai import OpenAI
+        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
+        if not Path(frame_path).exists():
+            return _default_hre_analysis(seg_idx, n_total)
+        b64 = _encode_image(str(frame_path))
+        prompt = HRE_SEGMENT_PROMPT.format(
+            seg_idx=seg_idx, n_total=n_total, context=context[:200]
+        )
+        response = client.chat.completions.create(
+            model=VLLM_MODEL,
+            messages=[{
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
+                    {"type": "text", "text": prompt},
+                ],
+            }],
+            max_tokens=200,
+            temperature=0.1,
+        )
+        raw = response.choices[0].message.content.strip()
+        if raw.startswith("```"):
+            parts = raw.split("```")
+            raw = parts[1] if len(parts) > 1 else raw
+            if raw.startswith("json"):
+                raw = raw[4:]
+        analysis = json.loads(raw.strip())
+        logger.debug(
+            f"HRE seg {seg_idx}/{n_total}: "
+            f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
+            f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_color')} "
+            f"type={analysis.get('moment_type')}"
+        )
+        try:
+            from src.gpu.vllm_manager import vllm_touch
+            vllm_touch()
+        except Exception:
+            pass
+        return analysis
+    except Exception as e:
+        logger.warning(f"HRE frame analysis failed (seg {seg_idx}): {e}")
+        return _default_hre_analysis(seg_idx, n_total)
+def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
+    """Fallback with varied decisions based on position in clip."""
+    if seg_idx == 0:
+        zoom_dir, zoom_speed, moment = "in", "fast", "hook"
+    elif seg_idx == n_total - 1:
+        zoom_dir, zoom_speed, moment = "out", "slow", "transition"
+    elif seg_idx % 3 == 1:
+        zoom_dir, zoom_speed, moment = "hold", "slow", "context"
+    else:
+        zoom_dir, zoom_speed, moment = "in", "slow", "reaction"
+    _colors    = ["yellow", "white",  "cyan",   "orange", "white",  "yellow"]
+    _positions = ["bottom", "top",    "bottom", "top",    "bottom", "top"]
+    return {
+        "zoom_direction":    zoom_dir,
+        "zoom_speed":        zoom_speed,
+        "face_detected":     False,
+        "face_cx":           0.5,
+        "face_cy":           0.38,
+        "subtitle_position": _positions[seg_idx % len(_positions)],
+        "subtitle_color":    _colors[seg_idx % len(_colors)],
+        "energy_level":      "medium",
+        "moment_type":       moment,
+    }
+def get_emoji_for_scene(scene_text: str, emotion: str, action_type: str) -> str:
+    """Use the configured Qwen2.5-VL model as a text prompt to select an emoji."""
+    try:
+        from openai import OpenAI
+        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
+        response = client.chat.completions.create(
+            model=VLLM_MODEL,
+            messages=[{"role": "user", "content": (
+                f"Select ONE emoji for this TikTok moment.\n"
+                f"Emotion: {emotion}\nAction: {action_type}\n"
+                f"Text: \"{scene_text[:200]}\"\n"
+                f"Reply with ONLY the emoji character, nothing else."
+            )}],
+            max_tokens=5,
+            temperature=0.3,
+        )
+        emoji = response.choices[0].message.content.strip()
+        if len(emoji) <= 4:
+            return emoji
+    except Exception:
+        pass
+    emoji_map = {
+        "happy": "😄", "excited": "🔥", "funny": "😂",
+        "surprised": "😲", "angry": "😤", "sad": "😢",
+        "neutral": "💡", "gaming": "🎮", "tutorial": "📚",
+        "entertainment": "✨", "reaction": "😱",
+    }
+    return emoji_map.get(emotion) or emoji_map.get(action_type, "⚡")

backend/src/gpu/__init__.py ADDED Viewed

File without changes

backend/src/gpu/rocm_utils.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""AMD ROCm device management and monitoring."""
+import os
+import subprocess
+from loguru import logger
+def get_device() -> str:
+    """Return 'cuda' (ROCm uses cuda device name in PyTorch) or 'cpu'."""
+    try:
+        import torch
+        if torch.cuda.is_available():
+            device_name = torch.cuda.get_device_name(0)
+            logger.info(f"GPU detected: {device_name}")
+            return "cuda"
+    except ImportError:
+        pass
+    logger.warning("No GPU available, falling back to CPU")
+    return "cpu"
+def get_vram_gb() -> float:
+    """Return available VRAM in GB."""
+    try:
+        import torch
+        if torch.cuda.is_available():
+            total = torch.cuda.get_device_properties(0).total_memory
+            return round(total / 1024**3, 1)
+    except Exception:
+        pass
+    return 0.0
+def get_gpu_utilization() -> dict:
+    """Return GPU utilization stats via rocm-smi."""
+    try:
+        result = subprocess.run(
+            ["rocm-smi", "--showuse", "--showmemuse", "--csv"],
+            capture_output=True, text=True, timeout=5
+        )
+        if result.returncode == 0:
+            lines = result.stdout.strip().split("\n")
+            if len(lines) >= 2:
+                headers = lines[0].split(",")
+                values = lines[1].split(",")
+                return dict(zip(headers, values))
+    except (FileNotFoundError, subprocess.TimeoutExpired):
+        pass
+    # Fallback: PyTorch memory stats
+    try:
+        import torch
+        if torch.cuda.is_available():
+            allocated = torch.cuda.memory_allocated(0) / 1024**3
+            reserved = torch.cuda.memory_reserved(0) / 1024**3
+            total = torch.cuda.get_device_properties(0).total_memory / 1024**3
+            return {
+                "vram_used_gb": round(allocated, 2),
+                "vram_reserved_gb": round(reserved, 2),
+                "vram_total_gb": round(total, 2),
+                "vram_pct": round(allocated / total * 100, 1) if total > 0 else 0,
+            }
+    except Exception:
+        pass
+    return {}
+def get_optimal_batch_size(model_type: str = "whisper") -> int:
+    """Return optimal batch size based on available VRAM."""
+    vram = get_vram_gb()
+    if model_type == "whisper":
+        if vram >= 48:
+            return 32
+        elif vram >= 24:
+            return 16
+        elif vram >= 16:
+            return 8
+        return 4
+    elif model_type == "vision":
+        if vram >= 80:
+            return 8
+        elif vram >= 48:
+            return 4
+        return 1
+    return 1
+def log_gpu_status():
+    stats = get_gpu_utilization()
+    if stats:
+        logger.info(f"GPU stats: {stats}")
+    else:
+        logger.info(f"GPU: {get_device()} | VRAM: {get_vram_gb()} GB")

backend/src/gpu/vllm_manager.py ADDED Viewed

	@@ -0,0 +1,208 @@

+"""On-demand vLLM process manager.
+Starts vLLM when first needed, shuts it down after idle.
+Set VLLM_ON_DEMAND=false to use an externally managed vLLM instead.
+Set VLLM_IDLE_TIMEOUT=300 (seconds) to control the idle shutdown window.
+"""
+import os
+import subprocess
+import threading
+import time
+import requests
+from loguru import logger
+# Manager settings, all read once from the environment at import time.
+# ON_DEMAND is true only for the literal value "true" (case-insensitive).
+# A non-empty DOCKER_CONTAINER selects the "docker exec" launch path; set it
+# to an empty string to launch vLLM as a direct subprocess instead.
+VLLM_MODEL       = os.getenv("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
+VLLM_PORT        = int(os.getenv("VLLM_PORT", "8000"))
+IDLE_TIMEOUT     = int(os.getenv("VLLM_IDLE_TIMEOUT", "300"))   # 5 min default
+ON_DEMAND        = os.getenv("VLLM_ON_DEMAND", "true").lower() == "true"
+DOCKER_CONTAINER = os.getenv("VLLM_DOCKER_CONTAINER", "rocm")  # container that has vllm installed
+class _VLLMManager:
+    def __init__(self):
+        self._proc: subprocess.Popen | None = None
+        self._lock = threading.Lock()
+        self._last_used = 0.0
+        threading.Thread(target=self._watchdog, daemon=True, name="vllm-watchdog").start()
+    # ── Public ────────────────────────────────────────────────────────────
+    def is_running(self) -> bool:
+        if not ON_DEMAND or DOCKER_CONTAINER:
+            # Docker mode or external vLLM: rely solely on health endpoint
+            return self._check_health()
+        with self._lock:
+            if self._proc is None or self._proc.poll() is not None:
+                return False
+            return self._check_health()
+    def ensure_running(self, progress_cb=None) -> None:
+        """Start vLLM if not running. Blocks until healthy (max 3 min)."""
+        if not ON_DEMAND:
+            return
+        with self._lock:
+            if self._check_health():
+                self._last_used = time.time()
+                return
+            self._start(progress_cb)
+    def stop(self) -> None:
+        if not ON_DEMAND:
+            return
+        with self._lock:
+            self._stop_locked()
+    def touch(self) -> None:
+        """Reset idle timer — call after each successful vLLM API call."""
+        self._last_used = time.time()
+    def status(self) -> dict:
+        running = self.is_running()
+        idle = round(time.time() - self._last_used, 1) if self._last_used else None
+        return {
+            "running":      running,
+            "on_demand":    ON_DEMAND,
+            "idle_seconds": idle,
+            "idle_timeout": IDLE_TIMEOUT,
+            "model":        VLLM_MODEL,
+        }
+    # ── Internal ──────────────────────────────────────────────────────────
+    def _health_url(self) -> str:
+        return f"http://localhost:{VLLM_PORT}/health"
+    def _check_health(self) -> bool:
+        try:
+            return requests.get(self._health_url(), timeout=2).status_code == 200
+        except Exception:
+            return False
+    def _start(self, progress_cb=None) -> None:
+        logger.info("vLLM: starting on demand…")
+        if progress_cb:
+            progress_cb("Starting AI model (Qwen2.5-VL)… ~2 min first time")
+        # Try Docker container first (vLLM may only be installed inside a container)
+        if DOCKER_CONTAINER:
+            self._start_via_docker(progress_cb)
+        else:
+            self._start_via_subprocess(progress_cb)
+    def _start_via_docker(self, progress_cb=None) -> None:
+        """Start vLLM inside an existing Docker container via docker exec."""
+        cmd = (
+            f"vllm serve {VLLM_MODEL} "
+            f"--host 0.0.0.0 --port {VLLM_PORT} "
+            f"--gpu-memory-utilization 0.85 --max-model-len 4096 "
+            f"> /tmp/vllm_server.log 2>&1"
+        )
+        subprocess.Popen(
+            ["docker", "exec", "-d", DOCKER_CONTAINER, "bash", "-c", cmd],
+            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+        )
+        self._proc = None  # process lives inside container, tracked by health check
+        deadline = time.time() + 200
+        tick = 0
+        while time.time() < deadline:
+            time.sleep(5)
+            tick += 1
+            if self._check_health():
+                self._last_used = time.time()
+                logger.info(f"vLLM (docker) ready after {tick * 5}s")
+                return
+            if progress_cb and tick % 6 == 0:
+                progress_cb(f"AI model loading… {tick * 5}s")
+        raise RuntimeError("vLLM did not start within 200s")
+    def _start_via_subprocess(self, progress_cb=None) -> None:
+        """Start vLLM as a direct subprocess (vllm must be in current Python env)."""
+        import sys
+        self._proc = subprocess.Popen(
+            [
+                sys.executable, "-m", "vllm.entrypoints.openai.api_server",
+                "--model", VLLM_MODEL,
+                "--device", "rocm",
+                "--port", str(VLLM_PORT),
+                "--gpu-memory-utilization", "0.85",
+                "--max-model-len", "4096",
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+        )
+        deadline = time.time() + 200
+        tick = 0
+        while time.time() < deadline:
+            time.sleep(5)
+            tick += 1
+            if self._proc.poll() is not None:
+                err = self._proc.stderr.read().decode()[-600:]
+                raise RuntimeError(f"vLLM exited during startup: {err}")
+            if self._check_health():
+                self._last_used = time.time()
+                logger.info(f"vLLM ready after {tick * 5}s")
+                return
+            if progress_cb and tick % 6 == 0:
+                progress_cb(f"AI model loading… {tick * 5}s")
+        raise RuntimeError("vLLM did not start within 200s")
+    def _stop_locked(self) -> None:
+        if DOCKER_CONTAINER:
+            subprocess.run(
+                ["docker", "exec", DOCKER_CONTAINER, "pkill", "-f", "vllm"],
+                capture_output=True,
+            )
+            self._proc = None
+        elif self._proc and self._proc.poll() is None:
+            self._proc.terminate()
+            try:
+                self._proc.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                self._proc.kill()
+            self._proc = None
+        logger.info("vLLM stopped")
+    def _watchdog(self) -> None:
+        while True:
+            time.sleep(60)
+            if not ON_DEMAND or IDLE_TIMEOUT <= 0:
+                continue
+            with self._lock:
+                if (self._proc
+                        and self._proc.poll() is None
+                        and self._last_used > 0
+                        and time.time() - self._last_used > IDLE_TIMEOUT):
+                    logger.info(
+                        f"vLLM idle {IDLE_TIMEOUT}s → shutting down to save GPU credits"
+                    )
+                    self._stop_locked()
+_manager = _VLLMManager()
+# ── Module-level helpers ──────────────────────────────────────────────────────
+def ensure_vllm_running(progress_cb=None) -> None:
+    """Delegate to the shared manager's ensure_running(), forwarding ``progress_cb``."""
+    _manager.ensure_running(progress_cb)
+def vllm_touch() -> None:
+    """Delegate to the shared manager's touch(), which resets the idle timer."""
+    _manager.touch()
+def vllm_stop() -> None:
+    """Delegate to the shared manager's stop()."""
+    _manager.stop()
+def vllm_is_running() -> bool:
+    """Return the shared manager's is_running() result."""
+    return _manager.is_running()
+def vllm_status() -> dict:
+    """Return the shared manager's status() dict."""
+    return _manager.status()

backend/src/ingestion/__init__.py ADDED Viewed

File without changes

backend/src/ingestion/uploader.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""Handle file upload from user."""
+import shutil
+from pathlib import Path
+from fastapi import UploadFile
+from loguru import logger
+# Lower-case file suffixes accepted by save_upload().
+ALLOWED_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v"}
+# Upper bound on the number of bytes save_upload() writes before rejecting the file.
+MAX_SIZE_BYTES = 2 * 1024 * 1024 * 1024  # 2 GB
+async def save_upload(
+    file: UploadFile,
+    output_dir: Path,
+    session_id: str,
+) -> Path:
+    """Save uploaded video file to disk."""
+    suffix = Path(file.filename or "video.mp4").suffix.lower()
+    if suffix not in ALLOWED_EXTENSIONS:
+        raise ValueError(f"Unsupported file type: {suffix}. Allowed: {ALLOWED_EXTENSIONS}")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    dest = output_dir / f"{session_id}_input{suffix}"
+    size = 0
+    with open(dest, "wb") as f:
+        while chunk := await file.read(1024 * 1024):  # 1MB chunks
+            size += len(chunk)
+            if size > MAX_SIZE_BYTES:
+                dest.unlink(missing_ok=True)
+                raise ValueError("File too large (max 2 GB)")
+            f.write(chunk)
+    logger.info(f"Saved upload: {dest} ({size / 1024 / 1024:.1f} MB)")
+    return dest

backend/src/ingestion/youtube.py ADDED Viewed

	@@ -0,0 +1,147 @@

+"""YouTube video downloader using yt-dlp."""
+import asyncio
+import subprocess
+from pathlib import Path
+from typing import Optional, Callable
+import yt_dlp
+from loguru import logger
+def _progress_hook(callback: Optional[Callable] = None):
+    def hook(d: dict):
+        if d["status"] == "downloading" and callback:
+            pct = d.get("_percent_str", "0%").strip().replace("%", "")
+            try:
+                callback(float(pct))
+            except ValueError:
+                pass
+    return hook
+def download_video(
+    url: str,
+    output_dir: Path,
+    session_id: str,
+    progress_callback: Optional[Callable] = None,
+    max_height: int = 1080,
+) -> Path:
+    """Download video from YouTube (or any yt-dlp-supported site).
+    Returns path to downloaded MP4 file.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_template = str(output_dir / f"{session_id}_input.%(ext)s")
+    ydl_opts = {
+        "format": (
+            f"bestvideo[vcodec^=avc1][height<={max_height}]+bestaudio/"
+            f"bestvideo[vcodec^=avc][height<={max_height}]+bestaudio/"
+            f"bestvideo[vcodec!^=av01][height<={max_height}]+bestaudio/"
+            f"best[height<={max_height}]/best"
+        ),
+        "format_sort": ["vcodec:h264"],
+        "outtmpl": output_template,
+        "merge_output_format": "mp4",
+        "quiet": True,
+        "no_warnings": True,
+        "progress_hooks": [_progress_hook(progress_callback)],
+        "postprocessors": [{
+            "key": "FFmpegVideoConvertor",
+            "preferedformat": "mp4",
+        }],
+        # Use iOS/Android clients to bypass datacenter IP bot-detection
+        "extractor_args": {
+            "youtube": {
+                "player_client": ["ios", "android", "tv_embedded"],
+            }
+        },
+    }
+    _inject_cookies(ydl_opts)
+    logger.info(f"Downloading: {url}")
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=True)
+        title = info.get("title", "video")
+        duration = info.get("duration", 0)
+        logger.info(f"Downloaded: '{title}' ({duration}s)")
+    output_path = output_dir / f"{session_id}_input.mp4"
+    if not output_path.exists():
+        for f in output_dir.glob(f"{session_id}_input.*"):
+            output_path = f
+            break
+    # Safety: transcode AV1 → h264 if yt-dlp still picked it
+    output_path = _ensure_h264(output_path)
+    return output_path
+def _ensure_h264(video_path: Path) -> Path:
+    """Transcode to h264 if video codec is AV1 (not supported by PySceneDetect on this server)."""
+    probe = subprocess.run(
+        ["ffprobe", "-v", "error", "-select_streams", "v:0",
+         "-show_entries", "stream=codec_name", "-of", "csv=p=0", str(video_path)],
+        capture_output=True, text=True,
+    )
+    codec = probe.stdout.strip().lower()
+    if codec not in ("av1", "av01"):
+        return video_path
+    logger.warning(f"AV1 detected ({video_path.name}), transcoding to h264...")
+    out = video_path.with_name(video_path.stem + "_h264.mp4")
+    result = subprocess.run(
+        ["ffmpeg", "-y", "-i", str(video_path), "-c:v", "libx264", "-preset", "fast",
+         "-crf", "23", "-c:a", "aac", "-b:a", "128k", str(out)],
+        capture_output=True, text=True,
+    )
+    if result.returncode == 0:
+        logger.info(f"Transcoded to h264: {out.name}")
+        return out
+    logger.error(f"Transcode failed: {result.stderr[-200:]}")
+    return video_path
+_COOKIES_PATH = Path("/root/cookies.txt")
+def _inject_cookies(opts: dict) -> None:
+    """Add cookiefile to ydl_opts if cookies.txt exists on server."""
+    if _COOKIES_PATH.exists():
+        opts["cookiefile"] = str(_COOKIES_PATH)
+        logger.debug(f"Using cookies: {_COOKIES_PATH}")
+def get_video_info(url: str) -> dict:
+    """Return metadata without downloading."""
+    ydl_opts = {
+        "quiet": True,
+        "no_warnings": True,
+        "skip_download": True,
+        "extractor_args": {
+            "youtube": {"player_client": ["ios", "android", "tv_embedded"]}
+        },
+    }
+    _inject_cookies(ydl_opts)
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+        return {
+            "title": info.get("title", ""),
+            "duration": info.get("duration", 0),
+            "thumbnail": info.get("thumbnail", ""),
+            "channel": info.get("channel", ""),
+            "view_count": info.get("view_count", 0),
+            "description": info.get("description", "")[:500],
+        }
+async def download_video_async(
+    url: str,
+    output_dir: Path,
+    session_id: str,
+    progress_callback: Optional[Callable] = None,
+) -> Path:
+    """Async wrapper for download_video."""
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(
+        None, lambda: download_video(url, output_dir, session_id, progress_callback)
+    )

backend/src/processing/__init__.py ADDED Viewed

File without changes

backend/src/processing/clip_extractor.py ADDED Viewed

	@@ -0,0 +1,131 @@

+"""Extract video clips using ffmpeg-python."""
+import asyncio
+import subprocess
+from pathlib import Path
+from loguru import logger
+def extract_clip(
+    video_path: Path,
+    start: float,
+    end: float,
+    output_path: Path,
+    use_hw_encode: bool = True,
+    vertical: bool = True,
+    face_bbox: list = None,
+    **kwargs,
+) -> Path:
+    """Cut a clip and convert to 9:16 vertical (1080x1920) for TikTok.
+    face_bbox: [x1, y1, x2, y2] in pixels from Qwen2.5-VL — used to center
+    the crop on the face. Falls back to center crop when None.
+    Uses AMD AMF hardware encoder when available.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    encoders = ["h264_amf", "libx264"] if use_hw_encode else ["libx264"]
+    # 9:16 vertical conversion filter
+    vf_filters = []
+    if vertical:
+        aspect_mode = kwargs.get("aspect_mode", "crop")
+        if aspect_mode == "letterbox":
+            # Fit entire 16:9 frame into 9:16, black bars top+bottom
+            vf_filters.append(
+                "scale=1080:1920:force_original_aspect_ratio=decrease,"
+                "pad=1080:1920:(ow-iw)/2:(oh-ih)/2:black"
+            )
+        else:
+            # Crop: scale to 1920 height first, then center-crop to 1080 wide
+            # Optionally center on face_bbox x when available
+            if face_bbox and len(face_bbox) == 4:
+                x1, _, x2, _ = face_bbox
+                face_cx = int((x1 + x2) / 2)
+                crop = f"scale=-1:1920,crop=1080:1920:max(0\\,min(iw-1080\\,{face_cx}*iw/in_w-540)):0"
+            else:
+                crop = "scale=-1:1920,crop=1080:1920:(iw-1080)/2:0"
+            vf_filters.append(crop)
+    for encoder in encoders:
+        cmd = ["ffmpeg", "-y", "-ss", str(start), "-to", str(end), "-i", str(video_path)]
+        if vf_filters:
+            cmd += ["-vf", ",".join(vf_filters)]
+        cmd += ["-c:v", encoder, "-c:a", "aac", "-b:a", "128k", "-movflags", "+faststart", str(output_path)]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode == 0:
+            if encoder == "h264_amf":
+                logger.info(f"Encoded 9:16 with AMD AMF: {output_path.name}")
+            return output_path
+        elif encoder == "h264_amf":
+            logger.debug("AMD AMF not available, falling back to libx264")
+    raise RuntimeError(f"All encoders failed for clip {output_path.name}")
+def burn_subtitles(
+    clip_path: Path,
+    ass_path: Path,
+    output_path: Path,
+    use_hw_encode: bool = True,
+) -> Path:
+    """Burn ASS subtitles into video using ffmpeg.
+    Returns path to output video with burned-in subtitles.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    ass_str = str(ass_path).replace("\\", "/").replace(":", "\\:")
+    encoders = ["h264_amf", "libx264"] if use_hw_encode else ["libx264"]
+    for encoder in encoders:
+        cmd = [
+            "ffmpeg", "-y",
+            "-i", str(clip_path),
+            "-vf", f"ass='{ass_str}'",
+            "-c:v", encoder,
+            "-c:a", "copy",
+            "-movflags", "+faststart",
+            str(output_path),
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode == 0:
+            return output_path
+        elif encoder == "h264_amf":
+            logger.debug("AMD AMF burn-sub failed, using libx264")
+    raise RuntimeError(f"Subtitle burn-in failed for {clip_path.name}\n{result.stderr[-500:]}")
+def extract_all_clips(
+    video_path: Path,
+    selected_clips: list[dict],
+    output_dir: Path,
+    session_id: str,
+    aspect_mode: str = "crop",
+) -> list[dict]:
+    """Extract all selected clips from video. Returns list with added 'clip_path'."""
+    results = []
+    for i, clip in enumerate(selected_clips):
+        out_path = output_dir / f"{session_id}_clip_{i+1:02d}_raw.mp4"
+        face_bbox = clip.get("vision_analysis", {}).get("face_bbox")
+        try:
+            extract_clip(video_path, clip["start"], clip["end"], out_path, face_bbox=face_bbox, aspect_mode=aspect_mode)
+            results.append({**clip, "clip_index": i + 1, "clip_path": str(out_path)})
+            logger.info(f"Extracted clip {i+1}: {clip['start']:.1f}s–{clip['end']:.1f}s → {out_path.name}")
+        except Exception as e:
+            logger.error(f"Failed to extract clip {i+1}: {e}")
+            results.append({**clip, "clip_index": i + 1, "clip_path": None, "error": str(e)})
+    return results
+async def extract_all_clips_async(
+    video_path: Path,
+    selected_clips: list[dict],
+    output_dir: Path,
+    session_id: str,
+    aspect_mode: str = "crop",
+) -> list[dict]:
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(
+        None,
+        lambda: extract_all_clips(video_path, selected_clips, output_dir, session_id, aspect_mode)
+    )

backend/src/processing/emoji_overlay.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""Emoji and text overlay utilities for HRE pipeline."""
+import subprocess
+from pathlib import Path
+from loguru import logger
+def add_emoji_overlay(
+    video_path: Path,
+    emoji: str,
+    output_path: Path,
+    x: str = "w-100",
+    y: str = "50",
+    size: int = 80,
+    start_sec: float = 0.0,
+    end_sec: float = 3.0,
+) -> Path:
+    """Add emoji text overlay to video using ffmpeg drawtext."""
+    escaped = emoji.replace("'", "\\'").replace(":", "\\:")
+    vf = (
+        f"drawtext=text='{escaped}'"
+        f":fontsize={size}:x={x}:y={y}"
+        f":enable='between(t,{start_sec},{end_sec})'"
+    )
+    cmd = [
+        "ffmpeg", "-y", "-i", str(video_path),
+        "-vf", vf,
+        "-c:v", "libx264", "-c:a", "copy",
+        str(output_path),
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode == 0 and output_path.exists():
+        return output_path
+    logger.warning(f"Emoji overlay failed: {result.stderr[-200:]}")
+    return video_path  # fallback to original

backend/src/processing/high_retention.py ADDED Viewed

	@@ -0,0 +1,491 @@

+"""High-Retention Editing pipeline — per-segment AI decisions.
+Each 3-5s segment gets its own zoom direction, subtitle position,
+and caption color driven by Qwen2.5-VL analyzing one frame per segment.
+Pipeline per clip:
+  1. Segment clip at speech pauses (3-5s chunks)
+  2. Extract midpoint frame from each segment
+  3. Qwen2.5-VL analyzes each frame → zoom + subtitle decisions
+  4. ffmpeg filter_complex: per-segment zoompan + concat
+  5. ASS subtitles with per-segment alignment/color/size override tags
+"""
+import subprocess
+import tempfile
+from pathlib import Path
+from loguru import logger
+# ─── Video metadata ────────────────────────────────────────────────────────────
+def _probe_dimensions(video_path: Path) -> tuple[int, int]:
+    probe = subprocess.run(
+        ["ffprobe", "-v", "error", "-select_streams", "v:0",
+         "-show_entries", "stream=width,height", "-of", "csv=p=0",
+         str(video_path)],
+        capture_output=True, text=True,
+    )
+    try:
+        w, h = map(int, probe.stdout.strip().split(","))
+        return w, h
+    except Exception:
+        return 1080, 1920
+def _probe_duration(video_path: Path) -> float:
+    probe = subprocess.run(
+        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+         "-of", "csv=p=0", str(video_path)],
+        capture_output=True, text=True,
+    )
+    try:
+        return float(probe.stdout.strip())
+    except Exception:
+        return 0.0
+def _has_audio_stream(video_path: Path) -> bool:
+    probe = subprocess.run(
+        ["ffprobe", "-v", "error", "-select_streams", "a",
+         "-show_entries", "stream=codec_type", "-of", "csv=p=0",
+         str(video_path)],
+        capture_output=True, text=True,
+    )
+    return bool(probe.stdout.strip())
+# ─── Segmentation ─────────────────────────────────────────────────────────────
+def _segment_clip(
+    duration: float,
+    transcript: dict,
+    clip_start: float,
+    max_seg: float = 4.5,
+) -> list[dict]:
+    """Divide clip into segments at speech pauses, max_seg seconds each."""
+    words: list[dict] = []
+    for seg in transcript.get("segments", []):
+        words.extend(seg.get("words", []))
+    if clip_start > 0:
+        words = [
+            {**w, "start": max(0.0, w["start"] - clip_start),
+                  "end":   max(0.0, w["end"]   - clip_start)}
+            for w in words
+        ]
+    words = [w for w in words if w["end"] > 0 and w["start"] < duration]
+    # Collect pause midpoints as candidate cut times
+    cuts = [0.0]
+    for i in range(len(words) - 1):
+        gap = words[i + 1]["start"] - words[i]["end"]
+        if gap > 0.2:
+            cuts.append((words[i]["end"] + words[i + 1]["start"]) / 2.0)
+    cuts.append(duration)
+    cuts = sorted(set(cuts))
+    # Merge short intervals, split long ones
+    segs: list[dict] = []
+    start = 0.0
+    for cut in cuts[1:]:
+        seg_len = cut - start
+        if seg_len < 1.5 and cut < duration:
+            continue  # too short — extend to next cut
+        if seg_len > max_seg:
+            t = start
+            while t + max_seg < cut:
+                segs.append({"start": t, "end": t + max_seg})
+                t += max_seg
+            if cut - t > 0.5:
+                segs.append({"start": t, "end": cut})
+            start = cut
+        else:
+            segs.append({"start": start, "end": cut})
+            start = cut
+    # Fallback: split evenly if not enough segments
+    if len(segs) < 2:
+        n = max(2, round(duration / 4.0))
+        d = duration / n
+        segs = [{"start": i * d, "end": min((i + 1) * d, duration)} for i in range(n)]
+    return segs
+# ─── Frame extraction ─────────────────────────────────────────────────────────
+def _extract_frame(video_path: Path, t: float, out_path: Path) -> bool:
+    cmd = [
+        "ffmpeg", "-y", "-ss", f"{t:.3f}", "-i", str(video_path),
+        "-vframes", "1", "-q:v", "3", str(out_path),
+    ]
+    result = subprocess.run(cmd, capture_output=True, timeout=30)
+    return result.returncode == 0 and out_path.exists()
+# ─── Per-segment AI analysis ──────────────────────────────────────────────────
+def _analyze_segment(
+    video_path: Path,
+    seg: dict,
+    seg_idx: int,
+    n_total: int,
+    transcript: dict,
+    clip_start: float,
+    tmp_dir: Path,
+) -> dict:
+    from src.analysis.vision import analyze_frame_for_hre, _default_hre_analysis
+    mid_t = (seg["start"] + seg["end"]) / 2.0
+    frame_path = tmp_dir / f"seg_{seg_idx:03d}.jpg"
+    if not _extract_frame(video_path, mid_t, frame_path):
+        return _default_hre_analysis(seg_idx, n_total)
+    words_all: list[dict] = []
+    for s in transcript.get("segments", []):
+        words_all.extend(s.get("words", []))
+    abs_start = seg["start"] + clip_start
+    abs_end   = seg["end"]   + clip_start
+    context = " ".join(
+        w.get("word", w.get("text", ""))
+        for w in words_all
+        if w.get("start", 0) < abs_end and w.get("end", 0) > abs_start
+    ).strip()
+    return analyze_frame_for_hre(frame_path, context, seg_idx, n_total)
+# ─── Zoom expression builders ─────────────────────────────────────────────────
+def _build_zoom_exprs(
+    analysis: dict,
+    w: int,
+    h: int,
+) -> tuple[str, str, str]:
+    """Return (z_expr, x_expr, y_expr) for ffmpeg zoompan from HRE analysis.
+    Note: \\, escapes comma inside ffmpeg filter expressions.
+    """
+    direction     = analysis.get("zoom_direction", "in")
+    speed         = analysis.get("zoom_speed", "slow")
+    face_detected = bool(analysis.get("face_detected", False))
+    face_cx       = float(analysis.get("face_cx") or 0.5)
+    face_cy       = float(analysis.get("face_cy") or 0.38)
+    if direction == "in":
+        if speed == "fast":
+            z_expr, max_zoom = "min(1.2+n*0.0014\\,1.6)", 1.6
+        else:
+            z_expr, max_zoom = "min(1.05+n*0.0006\\,1.35)", 1.35
+    elif direction == "out":
+        if speed == "fast":
+            z_expr, max_zoom = "max(1.6-n*0.0016\\,1.0)", 1.6
+        else:
+            z_expr, max_zoom = "max(1.4-n*0.0010\\,1.0)", 1.4
+    else:  # hold
+        z_expr, max_zoom = "1.1", 1.1
+    if face_detected and direction == "in" and max_zoom > 1.05:
+        raw_cx = int(face_cx * w - w / (max_zoom * 2))
+        raw_cy = int(face_cy * h - h / (max_zoom * 2))
+        safe_cx = max(0, min(w - int(w / max_zoom), raw_cx))
+        safe_cy = max(0, min(h - int(h / max_zoom), raw_cy))
+        ctr_x = w / 2 - w / (max_zoom * 2)
+        ctr_y = h / 2 - h / (max_zoom * 2)
+        x_expr = (
+            f"(iw/2-(iw/zoom/2))+({safe_cx}-{ctr_x:.1f})*(zoom-1)/({max_zoom}-1)"
+        )
+        y_expr = (
+            f"(ih/2-(ih/zoom/2))+({safe_cy}-{ctr_y:.1f})*(zoom-1)/({max_zoom}-1)"
+        )
+    else:
+        x_expr = "iw/2-(iw/zoom/2)"
+        if direction == "in":
+            y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
+            y_expr = f"ih*{y_bias:.2f}-(ih/zoom/2)"
+        else:
+            y_expr = "ih/2-(ih/zoom/2)"
+    return z_expr, x_expr, y_expr
+# ─── Per-segment zoom via filter_complex ──────────────────────────────────────
+def _apply_per_segment_zoom(
+    input_path: Path,
+    segments: list[dict],
+    analyses: list[dict],
+    w: int,
+    h: int,
+    output_path: Path,
+    has_audio: bool = True,
+) -> Path:
+    """Apply different zoompan to each segment, concat into single stream."""
+    filter_parts: list[str] = []
+    v_labels: list[str] = []
+    a_labels: list[str] = []
+    for i, (seg, analysis) in enumerate(zip(segments, analyses)):
+        s = f"{seg['start']:.3f}"
+        e = f"{seg['end']:.3f}"
+        z, x, y = _build_zoom_exprs(analysis, w, h)
+        zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30"
+        filter_parts.append(f"[0:v]trim={s}:{e},setpts=PTS-STARTPTS,{zp}[v{i}]")
+        v_labels.append(f"[v{i}]")
+        if has_audio:
+            filter_parts.append(f"[0:a]atrim={s}:{e},asetpts=PTS-STARTPTS[a{i}]")
+            a_labels.append(f"[a{i}]")
+    n = len(segments)
+    filter_parts.append("".join(v_labels) + f"concat=n={n}:v=1:a=0[vout]")
+    if has_audio:
+        filter_parts.append("".join(a_labels) + f"concat=n={n}:v=0:a=1[aout]")
+    cmd = [
+        "ffmpeg", "-y", "-i", str(input_path),
+        "-filter_complex", ";".join(filter_parts),
+        "-map", "[vout]",
+    ]
+    if has_audio:
+        cmd += ["-map", "[aout]", "-c:a", "aac"]
+    cmd += ["-c:v", "libx264", "-movflags", "+faststart", str(output_path)]
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
+    if result.returncode == 0 and output_path.exists():
+        logger.info(f"Per-segment zoom: {n} segments, {w}x{h}")
+        return output_path
+    logger.warning(f"Per-segment zoom failed: {result.stderr[-800:]}")
+    return input_path
+# ─── Per-segment ASS subtitles ────────────────────────────────────────────────
+# Caption colour names mapped to ASS colour literals. The hex digits after
+# "&H" are ordered alpha, blue, green, red (e.g. yellow = R+G = 0000FFFF).
+_ASS_COLORS = {
+    "white":  "&H00FFFFFF",
+    "yellow": "&H0000FFFF",
+    "cyan":   "&H00FFFF00",
+    "orange": "&H000066FF",
+    "green":  "&H0000FF00",
+    "red":    "&H000000FF",
+}
+def _ts(t: float) -> str:
+    h = int(t // 3600)
+    m = int((t % 3600) // 60)
+    s = t % 60
+    return f"{h}:{m:02d}:{s:06.3f}"
+def _generate_per_segment_subtitles(
+    transcript: dict,
+    ass_path: Path,
+    clip_start: float,
+    segments: list[dict],
+    analyses: list[dict],
+) -> None:
+    """Write ASS with per-segment alignment, color, and font-size overrides."""
+    events: list[dict] = []
+    # Word-level events
+    for seg in transcript.get("segments", []):
+        for w in seg.get("words", []):
+            t0 = max(0.0, float(w.get("start", 0)) - clip_start)
+            t1 = max(0.0, float(w.get("end",   0)) - clip_start)
+            text = w.get("word", w.get("text", "")).strip()
+            if text and t1 > 0:
+                events.append({"start": t0, "end": max(t1, t0 + 0.08), "text": text})
+    # Sentence-level fallback (split into 3-word chunks)
+    if not events:
+        for seg in transcript.get("segments", []):
+            t0 = max(0.0, float(seg.get("start", 0)) - clip_start)
+            t1 = max(0.0, float(seg.get("end",   0)) - clip_start)
+            text = seg.get("text", "").strip()
+            if not text or t1 <= 0:
+                continue
+            wlist = text.split()
+            chunk = 3
+            n_ch = max(1, (len(wlist) + chunk - 1) // chunk)
+            dur = (t1 - t0) / n_ch
+            for j in range(n_ch):
+                events.append({
+                    "start": t0 + j * dur,
+                    "end":   t0 + (j + 1) * dur,
+                    "text":  " ".join(wlist[j * chunk:(j + 1) * chunk]),
+                })
+    def get_an(t: float) -> dict:
+        for seg, an in zip(segments, analyses):
+            if seg["start"] <= t < seg["end"]:
+                return an
+        return analyses[-1] if analyses else {}
+    lines = [
+        "[Script Info]",
+        "ScriptType: v4.00+",
+        "PlayResX: 1080",
+        "PlayResY: 1920",
+        "ScaledBorderAndShadow: yes",
+        "",
+        "[V4+ Styles]",
+        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
+        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
+        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
+        "Alignment, MarginL, MarginR, MarginV, Encoding",
+        "Style: Default,Impact,90,&H00FFFFFF,&H0000FFFF,&H00000000,&H80000000,"
+        "-1,0,0,0,100,100,0,0,1,4,0,2,40,40,200,1",
+        "",
+        "[Events]",
+        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
+    ]
+    for ev in events:
+        an      = get_an(ev["start"])
+        color   = _ASS_COLORS.get(an.get("subtitle_color", "white"), "&H00FFFFFF")
+        pos     = an.get("subtitle_position", "bottom")
+        energy  = an.get("energy_level", "medium")
+        moment  = an.get("moment_type", "context")
+        alignment = 8 if pos == "top" else 2
+        margin_v  = 120 if pos == "top" else 200
+        fs = (108 if energy == "high" or moment in ("hook", "punchline")
+              else 80 if energy == "low" else 92)
+        # Pop animation: start 130% scale, shrink to 100% in 120ms
+        pop = "{\\fscx130\\fscy130\\t(0,120,\\fscx100\\fscy100)}"
+        tag = f"{{\\an{alignment}\\1c{color}&\\fs{fs}\\b1}}{pop}"
+        lines.append(
+            f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])},"
+            f"Default,,0,0,{margin_v},,{tag}{ev['text'].upper()}"
+        )
+    ass_path.write_text("\n".join(lines), encoding="utf-8")
+    logger.debug(f"ASS: {len(events)} events across {len(segments)} segments")
+# ─── Emoji ─────────────────────────────────────────────────────────────────────
+def _get_emoji(clip_data: dict, analyses: list[dict] | None = None) -> str:
+    if analyses:
+        energy_rank = {"high": 3, "medium": 2, "low": 1}
+        best = max(analyses, key=lambda a: energy_rank.get(a.get("energy_level", "low"), 1))
+        moment_emoji = {
+            "hook": "🔥", "punchline": "😂", "reaction": "😲",
+            "context": "💡", "transition": "✨",
+        }
+        if emoji := moment_emoji.get(best.get("moment_type", "")):
+            return emoji
+    a = clip_data.get("vision_analysis", {})
+    emotion = a.get("emotion", "excited")
+    action  = a.get("action_type", "entertainment")
+    transcript_text = clip_data.get("transcript_text", "")
+    if transcript_text:
+        try:
+            from src.analysis.vision import get_emoji_for_scene
+            return get_emoji_for_scene(transcript_text, emotion, action)
+        except Exception:
+            pass
+    fb = {"happy": "😄", "excited": "🔥", "funny": "😂", "surprised": "😲",
+          "gaming": "🎮", "tutorial": "📚", "angry": "😤", "sad": "😢"}
+    return fb.get(emotion, fb.get(action, "⚡"))
+# ─── Final render ─────────────────────────────────────────────────────────────
+def _render_final(
+    video_path: Path,
+    ass_path: Path,
+    emoji: str,
+    output_path: Path,
+) -> None:
+    ass_str = str(ass_path).replace("\\", "/").replace(":", "\\:")
+    emoji_filter = (
+        f"drawtext=text='{emoji}':fontsize=80:x=w-100:y=50"
+        f":enable='between(t\\,0\\,3)'"
+    )
+    vf = f"ass='{ass_str}',{emoji_filter}"
+    cmd = [
+        "ffmpeg", "-y", "-i", str(video_path),
+        "-vf", vf, "-c:v", "libx264", "-c:a", "copy",
+        "-movflags", "+faststart", str(output_path),
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
+    if result.returncode != 0:
+        cmd2 = [
+            "ffmpeg", "-y", "-i", str(video_path),
+            "-vf", f"ass='{ass_str}'",
+            "-c:v", "libx264", "-c:a", "copy", str(output_path),
+        ]
+        result2 = subprocess.run(cmd2, capture_output=True, text=True, timeout=300)
+        if result2.returncode != 0:
+            logger.error(f"HRE render failed: {result2.stderr[-300:]}")
+            return
+    logger.info(f"HRE render complete → {output_path.name}")
+# ─── Main pipeline ────────────────────────────────────────────────────────────
+def apply_hre(
+    clip_path: Path,
+    clip_data: dict,
+    transcript: dict,
+    output_path: Path,
+) -> Path:
+    """Apply per-segment AI-driven HRE: each 3-5s chunk gets its own zoom + subtitle style."""
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    clip_start = clip_data.get("start", 0.0)
+    with tempfile.TemporaryDirectory() as _tmp:
+        tmp_dir    = Path(_tmp)
+        tmp_zoomed = tmp_dir / "zoomed.mp4"
+        w, h      = _probe_dimensions(clip_path)
+        duration  = _probe_duration(clip_path)
+        if duration <= 0:
+            duration = float(clip_data.get("end", clip_start + 30)) - clip_start
+        has_audio = _has_audio_stream(clip_path)
+        # 1. Segment at speech pauses
+        segments = _segment_clip(duration, transcript, clip_start)
+        n = len(segments)
+        logger.info(
+            f"HRE clip {clip_data.get('index', '?')}: "
+            f"{duration:.1f}s → {n} segments (AI analyzing each)"
+        )
+        # 2. Qwen2.5-VL analyzes each segment
+        analyses = [
+            _analyze_segment(clip_path, seg, i, n, transcript, clip_start, tmp_dir)
+            for i, seg in enumerate(segments)
+        ]
+        for i, (seg, an) in enumerate(zip(segments, analyses)):
+            logger.info(
+                f"  [{seg['start']:.1f}s-{seg['end']:.1f}s] "
+                f"zoom={an.get('zoom_direction')}({an.get('zoom_speed')}) "
+                f"sub={an.get('subtitle_position')}/{an.get('subtitle_color')} "
+                f"type={an.get('moment_type')} energy={an.get('energy_level')}"
+            )
+        # 3. Per-segment zoom via filter_complex
+        zoomed = _apply_per_segment_zoom(
+            clip_path, segments, analyses, w, h, tmp_zoomed, has_audio=has_audio
+        )
+        # 4. Per-segment ASS subtitles
+        ass_path = output_path.with_suffix(".ass")
+        _generate_per_segment_subtitles(transcript, ass_path, clip_start, segments, analyses)
+        # 5. Emoji from highest-energy segment
+        emoji = _get_emoji(clip_data, analyses)
+        # 6. Render
+        _render_final(zoomed, ass_path, emoji, output_path)
+    return output_path

backend/src/processing/subtitle.py ADDED Viewed

	@@ -0,0 +1,291 @@

+"""Generate ASS subtitles using pysubs2.
+Supports: word-by-word, sentence, karaoke, fade, pop, typewriter animations.
+Full ASS spec: font, size, 4-color layers, border, shadow, position, alignment.
+Handles Thai/Chinese character-level splitting.
+"""
+from pathlib import Path
+from typing import Optional
+import pysubs2
+from pysubs2 import SSAFile, SSAEvent, SSAStyle
+from loguru import logger
+# Languages that split by character rather than word
+CHAR_LEVEL_LANGUAGES = {"th", "zh", "ja", "km", "lo"}
+# Default font per language
+DEFAULT_FONTS = {
+    "th": "Noto Sans Thai",
+    "zh": "Noto Sans SC",
+    "zh-tw": "Noto Sans TC",
+    "ja": "Noto Sans JP",
+    "ko": "Noto Sans KR",
+    "en": "Montserrat",
+    "default": "Noto Sans",
+}
+# Animation presets (ASS override tags)
+def _fade_tags(fade_in_ms: int = 200, fade_out_ms: int = 200) -> str:
+    return f"{{\\fade({fade_in_ms},{fade_out_ms})}}"
+def _pop_tags() -> str:
+    return "{\\t(0,100,\\fscx120\\fscy120)\\t(100,200,\\fscx100\\fscy100)}"
+def _typewriter_per_char(char: str, delay_ms: int) -> str:
+    return f"{{\\alpha&HFF&\\t({delay_ms},{delay_ms+80},\\alpha&H00&)}}{char}"
+def _bounce_tags() -> str:
+    return "{\\t(0,150,\\frz-5)\\t(150,300,\\frz5)\\t(300,400,\\frz0)}"
+def _color_to_ass(hex_color: str, alpha: int = 0) -> str:
+    """Convert #RRGGBB hex to ASS &HAABBGGRR format."""
+    hex_color = hex_color.lstrip("#")
+    if len(hex_color) == 6:
+        r, g, b = hex_color[0:2], hex_color[2:4], hex_color[4:6]
+    else:
+        r, g, b = "FF", "FF", "FF"
+    aa = f"{alpha:02X}"
+    return f"&H{aa}{b}{g}{r}"
+def build_style(
+    font_family: str = "Noto Sans",
+    font_size: int = 72,
+    primary_color: str = "#FFFFFF",
+    secondary_color: str = "#FFFF00",
+    outline_color: str = "#000000",
+    shadow_color: str = "#000000",
+    primary_alpha: int = 0,
+    outline_alpha: int = 0,
+    shadow_alpha: int = 80,
+    bold: bool = True,
+    italic: bool = False,
+    underline: bool = False,
+    outline_size: float = 4.0,
+    shadow_size: float = 2.0,
+    alignment: int = 2,  # 2=bottom-center, 8=top-center
+    margin_l: int = 40,
+    margin_r: int = 40,
+    margin_v: int = 250,
+    scale_x: int = 100,
+    scale_y: int = 100,
+    spacing: float = 0.0,
+    angle: float = 0.0,
+) -> SSAStyle:
+    """Build a pysubs2 ``SSAStyle`` from flat keyword settings.
+    Colours are ``#RRGGBB`` strings converted through ``_hex_to_rgba``; the
+    ``*_alpha`` arguments are passed as that colour's alpha component (0 is
+    opaque per ``_hex_to_rgba``). ``shadow_color``/``shadow_alpha`` populate
+    the style's back colour. The secondary colour always uses alpha 0.
+    NOTE(review): ``apply_global_style_override`` filters its kwargs using this
+    function's code-object variable names, so renaming parameters or locals
+    here affects which config keys it forwards.
+    """
+    style = SSAStyle()
+    style.fontname = font_family
+    style.fontsize = font_size
+    # Four colour layers: text fill, secondary (karaoke), outline, shadow/back.
+    style.primarycolor = pysubs2.Color(*_hex_to_rgba(primary_color, primary_alpha))
+    style.secondarycolor = pysubs2.Color(*_hex_to_rgba(secondary_color, 0))
+    style.outlinecolor = pysubs2.Color(*_hex_to_rgba(outline_color, outline_alpha))
+    style.backcolor = pysubs2.Color(*_hex_to_rgba(shadow_color, shadow_alpha))
+    style.bold = bold
+    style.italic = italic
+    style.underline = underline
+    style.outline = outline_size
+    style.shadow = shadow_size
+    style.alignment = alignment
+    style.marginl = margin_l
+    style.marginr = margin_r
+    style.marginv = margin_v
+    style.scalex = scale_x
+    style.scaley = scale_y
+    style.spacing = spacing
+    style.angle = angle
+    style.borderstyle = 1  # outline + shadow
+    return style
+def _hex_to_rgba(hex_color: str, alpha_0_255: int = 0):
+    """Convert #RRGGBB to (R, G, B, A) where A=0 is opaque."""
+    hex_color = hex_color.lstrip("#")
+    if len(hex_color) == 6:
+        r = int(hex_color[0:2], 16)
+        g = int(hex_color[2:4], 16)
+        b = int(hex_color[4:6], 16)
+    else:
+        r, g, b = 255, 255, 255
+    return r, g, b, alpha_0_255
+def generate_subtitles(
+    transcript: dict,
+    output_path: Path,
+    style_config: dict,
+    clip_start_offset: float = 0.0,
+) -> Path:
+    """Generate .ass subtitle file from transcript.
+    Args:
+        transcript: Output from whisper.py
+        output_path: Where to save the .ass file
+        style_config: Dict with font/color/animation settings from frontend
+        clip_start_offset: Shift all timestamps (for sub-clips from longer video)
+    """
+    subs = SSAFile()
+    subs.info["PlayResX"] = "1080"
+    subs.info["PlayResY"] = "1920"
+    subs.info["ScaledBorderAndShadow"] = "yes"
+    subs.info["WrapStyle"] = "0"
+    display_mode = style_config.get("display_mode", "word")  # "word" or "sentence"
+    animation = style_config.get("animation", "none")  # none|fade|karaoke|pop|typewriter|bounce
+    subtitle_lang = style_config.get("subtitle_language", "en")
+    char_level = transcript.get("char_level", False) or subtitle_lang in CHAR_LEVEL_LANGUAGES
+    font_family = style_config.get("font_family") or DEFAULT_FONTS.get(subtitle_lang, DEFAULT_FONTS["default"])
+    style = build_style(
+        font_family=font_family,
+        font_size=style_config.get("font_size", 72),
+        primary_color=style_config.get("primary_color", "#FFFFFF"),
+        secondary_color=style_config.get("secondary_color", "#FFFF00"),
+        outline_color=style_config.get("outline_color", "#000000"),
+        shadow_color=style_config.get("shadow_color", "#000000"),
+        primary_alpha=style_config.get("primary_alpha", 0),
+        outline_alpha=style_config.get("outline_alpha", 0),
+        shadow_alpha=style_config.get("shadow_alpha", 80),
+        bold=style_config.get("bold", True),
+        italic=style_config.get("italic", False),
+        underline=style_config.get("underline", False),
+        outline_size=style_config.get("outline_size", 4.0),
+        shadow_size=style_config.get("shadow_size", 2.0),
+        alignment=style_config.get("alignment", 2),
+        margin_l=style_config.get("margin_l", 40),
+        margin_r=style_config.get("margin_r", 40),
+        margin_v=style_config.get("margin_v", 250),
+        scale_x=style_config.get("scale_x", 100),
+        scale_y=style_config.get("scale_y", 100),
+        spacing=style_config.get("spacing", 0.0),
+        angle=style_config.get("angle", 0.0),
+    )
+    subs.styles["Default"] = style
+    segments = transcript.get("segments", [])
+    for seg in segments:
+        words = seg.get("words", [])
+        seg_end = seg["end"] - clip_start_offset
+        if seg_end <= 0:
+            continue  # segment ends before clip starts — skip entirely
+        seg_start = max(0.0, seg["start"] - clip_start_offset)
+        if display_mode == "sentence" or not words:
+            _add_sentence_event(subs, seg["text"], seg_start, seg_end, animation, style_config)
+        else:
+            if animation == "karaoke":
+                _add_karaoke_line(subs, words, seg_start, seg_end, clip_start_offset, char_level)
+            else:
+                _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_start_offset)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    subs.save(str(output_path), encoding="utf-8")
+    logger.info(f"Generated {len(subs)} subtitle events → {output_path.name}")
+    return output_path
+def _add_sentence_event(subs, text, start, end, animation, style_config):
+    tags = ""
+    if animation == "fade":
+        fi = style_config.get("fade_in_ms", 200)
+        fo = style_config.get("fade_out_ms", 200)
+        tags = _fade_tags(fi, fo)
+    elif animation == "pop":
+        tags = _pop_tags()
+    elif animation == "bounce":
+        tags = _bounce_tags()
+    event = SSAEvent(
+        start=pysubs2.make_time(s=start),
+        end=pysubs2.make_time(s=end),
+        text=tags + text.strip(),
+    )
+    subs.append(event)
+def _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_offset=0.0):
+    """Add one SSAEvent per word (word-by-word mode)."""
+    unit_list = []
+    for w in words:
+        if char_level:
+            for ch in w["word"]:
+                unit_list.append({"word": ch, "start": w["start"], "end": w["end"]})
+        else:
+            unit_list.append(w)
+    for i, unit in enumerate(unit_list):
+        start = unit["start"] - clip_offset
+        end = (unit["end"] - clip_offset) if unit["end"] > unit["start"] else start + 0.3
+        if start < 0:
+            continue
+        tags = ""
+        if animation == "fade":
+            fi = style_config.get("fade_in_ms", 150)
+            fo = style_config.get("fade_out_ms", 100)
+            tags = _fade_tags(fi, fo)
+        elif animation == "pop":
+            tags = _pop_tags()
+        elif animation == "typewriter":
+            delay = int((start - seg_start) * 1000)
+            tags = _typewriter_per_char("", delay)
+        event = SSAEvent(
+            start=pysubs2.make_time(s=start),
+            end=pysubs2.make_time(s=end),
+            text=tags + unit["word"].strip(),
+        )
+        subs.append(event)
+def _add_karaoke_line(subs, words, seg_start, seg_end, clip_offset, char_level):
+    """Add karaoke-style line: full line visible, words highlight in sequence."""
+    karaoke_text = ""
+    for w in words:
+        duration_cs = int((w["end"] - w["start"]) * 100)
+        word_text = w["word"].strip()
+        if char_level:
+            for ch in word_text:
+                karaoke_text += f"{{\\kf{duration_cs // max(len(word_text), 1)}}}{ch}"
+        else:
+            karaoke_text += f"{{\\kf{duration_cs}}}{word_text} "
+    event = SSAEvent(
+        start=pysubs2.make_time(s=seg_start),
+        end=pysubs2.make_time(s=seg_end),
+        text=karaoke_text.strip(),
+    )
+    subs.append(event)
+def update_subtitle_event(
+    ass_path: Path,
+    event_index: int,
+    updates: dict,
+) -> Path:
+    """Update a single subtitle event (for editor patches)."""
+    subs = SSAFile.load(str(ass_path))
+    if event_index >= len(subs):
+        raise IndexError(f"Event index {event_index} out of range")
+    evt = subs[event_index]
+    if "text" in updates:
+        evt.text = updates["text"]
+    if "start" in updates:
+        evt.start = pysubs2.make_time(s=updates["start"])
+    if "end" in updates:
+        evt.end = pysubs2.make_time(s=updates["end"])
+    subs.save(str(ass_path), encoding="utf-8")
+    return ass_path
+def apply_global_style_override(ass_path: Path, style_config: dict) -> Path:
+    """Re-apply global style overrides to all events (for live preview)."""
+    subs = SSAFile.load(str(ass_path))
+    new_style = build_style(**{k: v for k, v in style_config.items() if k in build_style.__code__.co_varnames})
+    subs.styles["Default"] = new_style
+    subs.save(str(ass_path), encoding="utf-8")
+    return ass_path

backend/src/transcription/__init__.py ADDED Viewed

File without changes

backend/src/transcription/whisper.py ADDED Viewed

	@@ -0,0 +1,234 @@

+"""Speech-to-text using insanely-fast-whisper with ROCm support.
+Word-level timestamps for subtitle generation.
+Supports transcription (same language) and translation (→ English then to target).
+"""
+import asyncio
+import subprocess
+import json
+import os
+from pathlib import Path
+from typing import Optional
+from loguru import logger
+# Language codes supported by Whisper
+WHISPER_LANGUAGES = {
+    "thai": "th",
+    "english": "en",
+    "chinese": "zh",
+    "japanese": "ja",
+    "korean": "ko",
+    "french": "fr",
+    "german": "de",
+    "spanish": "es",
+    "portuguese": "pt",
+    "russian": "ru",
+    "arabic": "ar",
+    "hindi": "hi",
+    "vietnamese": "vi",
+    "indonesian": "id",
+    "malay": "ms",
+}
+# Languages that need character-level splitting (no word spaces)
+CHAR_LEVEL_LANGUAGES = {"th", "zh", "ja", "km", "lo", "my"}
+def extract_audio(video_path: Path, audio_path: Path) -> Path:
+    """Extract mono 16kHz audio from video using ffmpeg."""
+    cmd = [
+        "ffmpeg", "-y", "-i", str(video_path),
+        "-ac", "1", "-ar", "16000",
+        "-vn", "-f", "wav", str(audio_path)
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg audio extraction failed: {result.stderr}")
+    return audio_path
+def transcribe(
+    audio_path: Path,
+    clip_language: str = "auto",
+    subtitle_language: str = "en",
+    model_size: str = "large-v3",
+    device: str = "cuda",
+    batch_size: int = 16,
+) -> dict:
+    """Transcribe audio and return word-level timestamps.
+    Returns:
+        {
+            "text": str,
+            "segments": [{"start": float, "end": float, "text": str, "words": [...]}],
+            "language": str,
+            "char_level": bool,
+        }
+    """
+    clip_lang_code = WHISPER_LANGUAGES.get(clip_language.lower(), None)
+    sub_lang_code = WHISPER_LANGUAGES.get(subtitle_language.lower(), "en")
+    # Determine whisper task
+    task = "transcribe"
+    if clip_lang_code and sub_lang_code and clip_lang_code != sub_lang_code:
+        if sub_lang_code == "en":
+            task = "translate"  # Whisper built-in translate → English
+        else:
+            task = "transcribe"  # Non-English targets keep transcription in the selected language.
+    logger.info(f"Whisper: task={task}, clip_lang={clip_lang_code}, sub_lang={sub_lang_code}, model={model_size}")
+    try:
+        from transformers import pipeline
+        import torch
+        # AMD ROCm: float16 triggers HIPBLAS_STATUS_INTERNAL_ERROR on some models.
+        # Use float32 for stability; bfloat16 as middle ground if available.
+        if device == "cuda":
+            try:
+                name = torch.cuda.get_device_name(0).lower()
+                is_amd = any(k in name for k in ("amd", "radeon", "instinct", "mi"))
+            except Exception:
+                is_amd = True  # default safe
+            dtype = torch.bfloat16 if is_amd else torch.float16
+        else:
+            dtype = torch.float32
+        def _run_on_cpu(gk):
+            logger.warning("Whisper: running on CPU (GPU unavailable or OOM)")
+            pipe_cpu = pipeline(
+                "automatic-speech-recognition",
+                model=f"openai/whisper-{model_size}",
+                torch_dtype=torch.float32,
+                device="cpu",
+            )
+            return pipe_cpu(str(audio_path), batch_size=1,
+                            return_timestamps="word", generate_kwargs=gk)
+        generate_kwargs = {"task": task}
+        if clip_lang_code:
+            generate_kwargs["language"] = clip_lang_code
+        # Check free VRAM — if GPU is nearly full, go straight to CPU
+        use_gpu = device == "cuda"
+        if use_gpu:
+            try:
+                free_bytes = torch.cuda.mem_get_info(0)[0]
+                if free_bytes < 8 * 1024 ** 3:  # < 8 GB free
+                    logger.warning(f"Whisper: only {free_bytes/1024**3:.1f} GB free — using CPU")
+                    use_gpu = False
+            except Exception:
+                pass
+        pipe = None
+        try:
+            if not use_gpu:
+                result = _run_on_cpu(generate_kwargs)
+            else:
+                pipe = pipeline(
+                    "automatic-speech-recognition",
+                    model=f"openai/whisper-{model_size}",
+                    torch_dtype=dtype,
+                    device=device,
+                    model_kwargs={"attn_implementation": "sdpa"},
+                )
+                result = pipe(str(audio_path), batch_size=batch_size,
+                              return_timestamps="word", generate_kwargs=generate_kwargs)
+        except (RuntimeError, Exception) as e:
+            err = str(e)
+            if any(k in err for k in ("HIPBLAS", "HIP", "out of memory", "OutOfMemory", "CUDA")):
+                logger.warning(f"GPU error in Whisper ({err[:120]}), retrying on CPU")
+                result = _run_on_cpu(generate_kwargs)
+            else:
+                raise
+        finally:
+            if pipe is not None:
+                del pipe
+                try:
+                    torch.cuda.empty_cache()
+                except Exception:
+                    pass
+        segments = _build_segments(result, sub_lang_code)
+        char_level = sub_lang_code in CHAR_LEVEL_LANGUAGES
+        return {
+            "text": result.get("text", ""),
+            "segments": segments,
+            "language": clip_lang_code or "auto",
+            "char_level": char_level,
+            "task": task,
+        }
+    except ImportError:
+        logger.warning("transformers not available, using stub transcription")
+        return _stub_transcription(str(audio_path))
+def _build_segments(whisper_result: dict, target_lang: str) -> list:
+    """Convert Whisper output to segment list with word timestamps."""
+    segments = []
+    chunks = whisper_result.get("chunks", [])
+    if not chunks:
+        # Fallback: single segment
+        return [{"start": 0.0, "end": 30.0, "text": whisper_result.get("text", ""), "words": []}]
+    current_seg = {"start": None, "end": None, "text": "", "words": []}
+    SEGMENT_GAP = 1.5  # seconds gap to split into new segment
+    for chunk in chunks:
+        ts = chunk.get("timestamp", [0, 0])
+        start, end = (ts[0] or 0.0), (ts[1] or ts[0] or 0.0)
+        text = chunk.get("text", "").strip()
+        if not text:
+            continue
+        if current_seg["start"] is None:
+            current_seg["start"] = start
+        if current_seg["words"] and start - current_seg["end"] > SEGMENT_GAP:
+            segments.append(current_seg)
+            current_seg = {"start": start, "end": end, "text": text, "words": []}
+        else:
+            current_seg["text"] += (" " if current_seg["text"] else "") + text
+        current_seg["words"].append({"word": text, "start": start, "end": end})
+        current_seg["end"] = end
+    if current_seg["start"] is not None:
+        segments.append(current_seg)
+    return segments
+def _stub_transcription(audio_path: str) -> dict:
+    """Return minimal stub when Whisper is unavailable (dev/CPU mode)."""
+    return {
+        "text": "[Transcription not available — Whisper model not loaded]",
+        "segments": [{"start": 0.0, "end": 5.0, "text": "Sample subtitle", "words": [
+            {"word": "Sample", "start": 0.0, "end": 0.5},
+            {"word": "subtitle", "start": 0.6, "end": 1.0},
+        ]}],
+        "language": "en",
+        "char_level": False,
+        "task": "transcribe",
+    }
+async def transcribe_async(
+    audio_path: Path,
+    clip_language: str = "auto",
+    subtitle_language: str = "en",
+    model_size: str = "large-v3",
+    device: str = "cuda",
+) -> dict:
+    """Async wrapper for transcribe."""
+    loop = asyncio.get_event_loop()
+    from src.gpu.rocm_utils import get_optimal_batch_size
+    batch_size = get_optimal_batch_size("whisper")
+    return await loop.run_in_executor(
+        None,
+        lambda: transcribe(audio_path, clip_language, subtitle_language, model_size, device, batch_size)
+    )

deploy/setup_droplet.sh ADDED Viewed

	@@ -0,0 +1,87 @@

+#!/bin/bash
+# ElevenClip AI — Full AMD Droplet Setup Script
+# Run once after fresh boot: bash /root/setup_droplet.sh
+set -e
+LOG=/tmp/elevnclip_setup.log
+exec > >(tee -a "$LOG") 2>&1
+echo "=== ElevenClip AI Droplet Setup $(date) ==="
+# ── 1. Update repo ────────────────────────────────────────────────────────────
+echo "[1/5] Pulling latest code..."
+cd /root/ElevenClip-AI
+git pull origin master
+# ── 2. Python venv + pip install ─────────────────────────────────────────────
+echo "[2/5] Installing Python dependencies..."
+if [ ! -f /root/venv/bin/activate ]; then
+    python3 -m venv /root/venv
+fi
+source /root/venv/bin/activate
+pip install --upgrade pip -q
+pip install -r backend/requirements.txt -q
+echo "PACKAGES_DONE"
+# ── 3. Start vLLM inside Docker container ────────────────────────────────────
+echo "[3/5] Starting vLLM with Qwen2.5-VL-7B-Instruct..."
+docker start rocm 2>/dev/null || true
+sleep 3
+# Kill any stale vllm process
+docker exec rocm bash -c "pkill -f 'vllm serve' 2>/dev/null || true"
+sleep 2
+# Start vLLM detached
+docker exec -d rocm bash -c '
+    vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
+        --port 8000 \
+        --dtype float16 \
+        --trust-remote-code \
+        --max-model-len 4096 \
+        --gpu-memory-utilization 0.7 \
+        --limit-mm-per-prompt "image=3" \
+        > /tmp/vllm.log 2>&1
+'
+echo "vLLM started in background (downloading model, may take 5-15 min)"
+# ── 4. Start FastAPI backend on port 8080 ────────────────────────────────────
+echo "[4/5] Starting FastAPI backend on :8080..."
+pkill -f "uvicorn backend.main" 2>/dev/null || true
+sleep 1
+cd /root/ElevenClip-AI
+VLLM_BASE_URL=http://localhost:8000/v1 \
+VLLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct \
+WORK_DIR=/tmp/elevnclip \
+NEXT_PUBLIC_API_URL=http://localhost:8080 \
+nohup /root/venv/bin/uvicorn backend.main:app \
+    --host 0.0.0.0 \
+    --port 8080 \
+    --workers 1 \
+    --log-level info \
+    > /tmp/fastapi.log 2>&1 &
+echo "FastAPI PID: $!"
+echo "FASTAPI_STARTED"
+# ── 5. Poll vLLM health ───────────────────────────────────────────────────────
+echo "[5/5] Waiting for vLLM to load model..."
+for i in $(seq 1 180); do
+    if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
+        echo "vLLM READY after $((i * 5))s!"
+        echo "VLLM_READY"
+        break
+    fi
+    if [ $((i % 12)) -eq 0 ]; then
+        echo "  Still loading... $((i * 5))s elapsed"
+        docker exec rocm bash -c "tail -3 /tmp/vllm.log 2>/dev/null"
+    fi
+    sleep 5
+done
+echo ""
+echo "=== Setup complete! ==="
+echo "  FastAPI:  http://129.212.178.101:8080"
+echo "  vLLM API: http://129.212.178.101:8000/v1"
+echo "  Logs:     /tmp/fastapi.log | docker exec rocm cat /tmp/vllm.log"

deploy/start_fastapi.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/bin/bash
+pkill -f uvicorn 2>/dev/null
+sleep 1
+cd /root/ElevenClip-AI/backend
+export VLLM_BASE_URL=http://localhost:8000/v1
+export VLLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct
+export WORK_DIR=/tmp/elevnclip
+mkdir -p /tmp/elevnclip
+nohup /root/venv/bin/uvicorn main:app --host 0.0.0.0 --port 8080 --workers 1 > /tmp/fastapi.log 2>&1 &
+echo "FastAPI PID: $!"