diff --git a/.env.example b/.env.example deleted file mode 100644 index 15770474f38406d3b51842091b1883cdc9aec79b..0000000000000000000000000000000000000000 --- a/.env.example +++ /dev/null @@ -1,18 +0,0 @@ -DEMO_MODE=true -STORAGE_DIR=backend/data -FRONTEND_ORIGIN=http://localhost:5173 - -WHISPER_MODEL_ID=openai/whisper-large-v3 -QWEN_TEXT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct -QWEN_VL_MODEL_ID=Qwen/Qwen2-VL-7B-Instruct -HF_TOKEN= - -TARGET_CLIP_COUNT=5 -MAX_CLIPS=10 -FFMPEG_BINARY=ffmpeg -FFPROBE_BINARY=ffprobe -FFMPEG_VIDEO_CODEC=h264_amf -FFMPEG_CPU_CODEC=libx264 - -REDIS_URL=redis://redis:6379/0 -CELERY_ENABLED=false diff --git a/.github/workflows/sync-to-hf.yml b/.github/workflows/sync-to-hf.yml deleted file mode 100644 index 950a1cd116e379573dc00529d2816a3c92636eaa..0000000000000000000000000000000000000000 --- a/.github/workflows/sync-to-hf.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Sync to Hugging Face Space - -on: - push: - branches: - - main - -jobs: - sync: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - lfs: true - - - name: Push to Hugging Face Space - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - git remote add space https://JakgritB:$HF_TOKEN@huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI - git push space main --force diff --git a/.gitignore b/.gitignore index a3ef6779c5de156b965354564782939a55318fc9..a89e7a8ccaedcba9fdb65e6b3f7c7608a121df5f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,25 +1,37 @@ -.env -.hf-home/ -.venv/ -.python_packages/ +# Python __pycache__/ -*.pyc +*.py[cod] *.egg-info/ -.pytest_cache/ -.mypy_cache/ -.ruff_cache/ - -node_modules/ +.venv/ +venv/ dist/ -.vite/ +build/ -data/ -backend/data/ -tmp/ -pip-tmp/ +# Node +frontend/node_modules/ +frontend/.next/ +frontend/out/ + +# Temp files +graphify-out/ +/tmp/ +*.wav +*.mp4 +*.ass *.log -hf-space-live/ +# Env +.env +.env.local +.env.*.local + +# SSH keys (never commit) +*.pem +*_key +*_key.pub +id_rsa* +id_ed25519* +# 
OS .DS_Store Thumbs.db diff --git a/Dockerfile b/Dockerfile index 42d0f8d6219e51aeefc67a39fb5f1bd164198d48..8d5b5509fde956cead509d271f077f657d62c066 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,58 @@ -FROM python:3.11-slim +# ElevenClip AI — HuggingFace Spaces (AMD ROCm) +FROM rocm/pytorch:rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + WORKDIR /app -RUN pip install --no-cache-dir fastapi uvicorn +# System dependencies +RUN apt-get update && apt-get install -y \ + ffmpeg \ + nginx \ + curl \ + git \ + nodejs \ + npm \ + && rm -rf /var/lib/apt/lists/* + +# ─── Backend Python dependencies ─────────────────────────────────────────── +COPY backend/requirements.txt /app/backend/requirements.txt +RUN pip install --no-cache-dir -r /app/backend/requirements.txt + +# vLLM with ROCm support +RUN pip install --no-cache-dir \ + "vllm>=0.6.0" \ + --extra-index-url https://download.pytorch.org/whl/rocm6.2 + +COPY backend/ /app/backend/ + +# ─── Frontend (Next.js standalone build) ────────────────────────────────── +COPY frontend/package*.json /app/frontend/ +RUN cd /app/frontend && npm ci --production=false + +COPY frontend/ /app/frontend/ -COPY landing.py ./landing.py +# Relative API URL — nginx proxies /api/* and /ws/* to FastAPI :8080 +ENV NEXT_PUBLIC_API_URL="" +ENV NEXT_PUBLIC_DEMO_ENABLED="true" +ENV NEXT_PUBLIC_DEMO_ONLY="true" + +RUN cd /app/frontend && npm run build + +# ─── nginx config ────────────────────────────────────────────────────────── +COPY nginx.conf /app/nginx.conf + +# ─── Runtime directories ─────────────────────────────────────────────────── +RUN mkdir -p /tmp/elevnclip /root/.cache/huggingface /root/ElevenClip-AI/demo_videos + +# ─── Startup ────────────────────────────────────────────────────────────── +COPY start.sh /app/start.sh +RUN chmod +x /app/start.sh EXPOSE 7860 -CMD ["uvicorn", "landing:app", "--host", "0.0.0.0", "--port", "7860"] + +# vLLM managed on-demand by vllm_manager.py (not started at container startup) +ENV 
VLLM_ON_DEMAND="true" +ENV VLLM_PORT="8000" +ENV VLLM_IDLE_TIMEOUT="300" +ENV VLLM_DOCKER_CONTAINER="" + +CMD ["/app/start.sh"] diff --git a/LICENSE b/LICENSE index b4d7db40ef3a031021f04accec7a8d097025917a..3ff6c036e03b7650d97a25560141d5e083cfcb33 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 ElevenClip.AI contributors +Copyright (c) 2026 ElevenClip AI Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index addf132e68473706c1c33dcd3bcfaa9e3f7c57f0..a56911fe468e0b07c526cf26a9cd34266be7c67a 100644 --- a/README.md +++ b/README.md @@ -1,498 +1,255 @@ ---- -title: ElevenClip AI -emoji: 🎬 -colorFrom: purple -colorTo: red -sdk: docker -pinned: false ---- - -# ElevenClip.AI - -ElevenClip.AI is an AI-powered clip studio for turning long-form videos into personalized short-form content for TikTok, YouTube Shorts, and Instagram Reels. - -This project is built for the **AMD Developer Hackathon** on lablab.ai, targeting **Track 3: Vision & Multimodal AI**. The system is designed to run on **AMD Developer Cloud** with **ROCm** and **AMD Instinct MI300X** acceleration, while using **Hugging Face** as the model hub/deployment layer and **Qwen** models for profile-aware highlight reasoning. - -## One-Sentence Pitch - -ElevenClip.AI helps creators convert long videos into ready-to-edit short clips by combining Whisper transcription, Qwen highlight detection, optional Qwen-VL visual understanding, ffmpeg rendering, and a human-in-the-loop clip editor. - -## Problem - -Long-form creators, podcasters, educators, streamers, and marketing teams often publish hours of video but still need short clips for modern discovery platforms. - -The manual workflow is painful: - -- Watch the full video. -- Find high-retention moments. -- Trim each clip. -- Rewrite subtitles. -- Reframe to vertical 9:16. 
-- Export platform-ready MP4 files. - -For a two-hour video, this can take several hours of editing time. The bottleneck is not just cutting video; it is understanding which moments match the creator's audience, channel style, language, and target platform. - -## Solution - -ElevenClip.AI automates the first pass of short-form production: - -1. The creator sets up a reusable channel profile. -2. The creator provides a YouTube URL or uploads a video file. -3. Whisper Large V3 transcribes the video, including Thai and multilingual speech. -4. Qwen2.5 analyzes the transcript and scores candidate highlights based on engagement potential and the creator profile. -5. Optional Qwen2-VL analysis can enrich the scores with visual signals such as reactions, scene changes, and on-screen text. -6. ffmpeg renders vertical clips with subtitle files and burn-in support. -7. The React editor lets the human approve, delete, trim, regenerate, and edit subtitles before download. +# ElevenClip AI ✂️ -The product is intentionally human-AI collaborative: AI finds and prepares the clips quickly, while the creator keeps editorial control. +> **AMD Developer Hackathon 2026 — Track 3: Vision & Multimodal AI** -## Hackathon Alignment +Turn livestream recordings or uploaded videos into TikTok-ready highlight clips using **true multimodal AI** — vision, audio, and text analyzed simultaneously on AMD Instinct MI300X. 
-### Track +[![HuggingFace Space](https://img.shields.io/badge/🤗-HuggingFace%20Space-yellow)](https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI) +[![AMD ROCm](https://img.shields.io/badge/AMD-ROCm%206.3-red)](https://rocm.docs.amd.com/) +[![Qwen2.5-VL](https://img.shields.io/badge/Qwen2.5--VL-7B%20Instruct-blue)](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) +[![License: MIT](https://img.shields.io/badge/License-MIT-green)](LICENSE) -**Track 3: Vision & Multimodal AI** - -ElevenClip.AI processes multiple media types: - -- Audio: speech transcription with Whisper Large V3. -- Text: transcript reasoning and highlight ranking with Qwen2.5. -- Video: frame-aware multimodal analysis with Qwen2-VL as the next pipeline stage. -- Rendered media: ffmpeg exports platform-ready video clips. - -### AMD Technology - -The production target is AMD Developer Cloud: - -- **AMD Instinct MI300X** for high-throughput model inference. -- **ROCm 6.x** as the GPU software stack. -- **PyTorch with ROCm support** for Whisper inference. -- **vLLM ROCm backend** for fast Qwen2.5 inference. -- **Optimum-AMD** as an optimization path for Hugging Face models on AMD hardware. -- **ffmpeg hardware acceleration hooks** for faster video encoding where available. - -The app has a local `DEMO_MODE=true` path so judges and teammates can inspect the UI/API without downloading large models. On AMD Developer Cloud, set `DEMO_MODE=false` to activate the real model stack. +--- -### Hugging Face Integration +## Demo -Hugging Face is used as the model hub and deployment layer: +> Try it live: [HuggingFace Space](https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI) -- `openai/whisper-large-v3` for transcription. -- `Qwen/Qwen2.5-7B-Instruct` for highlight analysis. -- `Qwen/Qwen2-VL-7B-Instruct` for multimodal video understanding. 
-- Public Hugging Face Space for the hackathon demo page: - `https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI` +--- -### Qwen Integration +## What It Does -Qwen is not used as a generic chatbot. It is part of the core product logic: +ElevenClip AI ingests an uploaded video and automatically finds the best moments to clip for TikTok using three AI modalities working together. The backend also keeps optional yt-dlp/YouTube support, but the public demo focuses on uploads because public video platforms can trigger anti-bot restrictions. -- Reads timestamped transcript segments. -- Considers creator profile settings. -- Scores engagement potential. -- Explains why a segment should become a clip. -- Returns structured JSON with timestamps, titles, scores, reasons, and subtitle text. +| Modality | Model | What it detects | +|---|---|---| +| **Vision** | Qwen2.5-VL-7B on ROCm | Excitement, faces, action type, humor, TikTok potential | +| **Audio** | insanely-fast-whisper (ROCm) | Word-level transcript + language detection | +| **Audio Signal** | librosa | RMS energy → loud/quiet moments | +| **Vision+Text** | Qwen2.5-VL (multimodal) | Frame + transcript context fused together | +| **Text** | Python keyword scorer + Qwen2.5-VL text prompt | Style keyword matching, emoji selection | -## Current MVP Features +### Highlight Scoring Formula -- Channel profile onboarding: - - niche - - preferred clip style - - preferred clip length - - primary language - - target platform -- YouTube URL ingestion through `yt-dlp`. -- Direct video upload endpoint. -- Whisper transcription service boundary. -- Qwen highlight detection service boundary. -- Optional Qwen2-VL multimodal analysis service boundary. -- ffmpeg clip generation with subtitle file creation. -- Vertical 9:16 export path for TikTok, Shorts, and Reels. 
-- Human-AI review UI: - - trim start/end - - edit subtitles inline - - approve clips - - delete clips - - regenerate a clip - - download MP4 output -- Timing logs for benchmark demos. -- Docker and AMD Cloud deployment notes. - -## Architecture - -```mermaid -flowchart LR - A["Creator Profile"] --> D["Qwen2.5 Highlight Scoring"] - B["YouTube URL"] --> C["yt-dlp / Video Input"] - B2["Uploaded Video"] --> C - C --> W["Whisper Large V3 Transcription"] - W --> D - C --> V["Qwen2-VL Visual Analysis (Optional)"] - D --> R["Clip Plan JSON"] - V --> R - R --> F["ffmpeg Clip Rendering + Subtitles"] - F --> E["React Human-AI Editor"] - E --> O["Approved Short-Form Clips"] ``` +final_score = 0.40 × vision_score + 0.35 × audio_energy + 0.25 × text_keywords -## Repository Structure - -```text -. -├── backend/ -│ ├── app/ -│ │ ├── core/ # configuration and timing instrumentation -│ │ ├── models/ # Pydantic request/response schemas -│ │ ├── services/ # ingest, transcription, Qwen scoring, subtitles, rendering -│ │ ├── utils/ # ROCm / accelerator detection -│ │ ├── workers/ # optional Celery wiring -│ │ ├── main.py # FastAPI application -│ │ └── storage.py # file-backed job storage for MVP -│ ├── Dockerfile -│ └── pyproject.toml -├── frontend/ -│ ├── src/ -│ │ ├── App.jsx # creator workflow and clip editor -│ │ ├── main.jsx -│ │ └── styles.css -│ ├── Dockerfile -│ └── package.json -├── infra/ -│ └── amd-cloud.md # AMD Developer Cloud deployment guide -├── scripts/ -│ └── benchmark.py # end-to-end API benchmark helper -├── docker-compose.yml -└── README.md +where: + vision_score = 0.5 × excitement + 0.3 × tiktok_potential + 0.2 × humor_level ``` -## Processing Pipeline - -### 1. Video Input - -The backend accepts: - -- YouTube URL through `POST /api/jobs/youtube` -- Uploaded video file through `POST /api/jobs/upload` - -In production, YouTube videos are downloaded with `yt-dlp`. 
In demo mode, the app can generate a synthetic ffmpeg test video so the workflow can be tested without external downloads. - -### 2. Transcription - -The transcription service is implemented in `backend/app/services/transcription.py`. - -Production target: - -- Model: `openai/whisper-large-v3` -- Runtime: Hugging Face Transformers -- Accelerator: PyTorch ROCm on AMD MI300X -- Language goal: Thai and multilingual support - -### 3. Highlight Detection - -The highlight detector is implemented in `backend/app/services/highlight.py`. - -Production target: - -- Model: `Qwen/Qwen2.5-7B-Instruct` -- Runtime: vLLM with ROCm backend -- Output: strict structured JSON +--- -Highlight scoring considers: +## AI Pipeline + +``` +┌─ Input ──────────────────────────────────────────────────────────┐ +│ Uploaded video file (YouTube backend support is optional) │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─ Audio Extraction (ffmpeg) ──────────────────────────────────────┐ +│ 16kHz mono WAV for Whisper │ +└──────────────────────────────────────────────────────────────────┘ + │ + ┌──────┴──────┐ + │ │ ← PARALLEL on AMD GPU ───────────────────────── + ▼ ▼ +┌─ Scene ┌─ Whisper ROCm ────────────────────────────────────┐ +│ Detection │ insanely-fast-whisper (SDPA attention, 4.45×) │ +│ PyScene │ → transcript + word-level timestamps │ +│ Detect │ → auto language detection │ +└─────┬──────┴───────────────────────────────────────────────────┘ + │ │ + ▼ ▼ +┌─ Frame Sampling ──────────────────────────────────────────────────┐ +│ 3 frames per scene (20%, 50%, 80% of scene) │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ ← CONCURRENT requests to vLLM ────────────────────── +┌─ Qwen2.5-VL Multimodal Analysis ───────────────────────────────────┐ +│ Input per scene: [frame1] [frame2] [frame3] + transcript text │ +│ Output: excitement_score, tiktok_potential, face_bbox, │ +│ emotion, action_type, humor_level, highlight_reason │ +│ 
All scenes sent concurrently — vLLM batches on AMD MI300X │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─ Multi-Signal Scoring ────────────────────────────────────────────┐ +│ score = 0.40×vision + 0.35×audio_energy + 0.25×text_keywords │ +│ Select top-N non-overlapping clips (min 30s gap) │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─ Branch ──────────────────────────────────────────────────────────┐ +│ │ +│ Normal Mode HRE (High-Retention Editing) │ +│ ───────────── ────────────────────────────── │ +│ • pysubs2 ASS • Silence removal (ffmpeg) │ +│ • User style config • Auto-zoom to face (zoompan) │ +│ • Font/color/animation • Jump cuts at boundaries │ +│ • Karaoke/pop/fade • Qwen2.5-VL emoji selection │ +│ • AMD AMF encode • Impact bold captions │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─ Editor (/editor) ────────────────────────────────────────────────┐ +│ • Per-clip subtitle timeline editing │ +│ • Global style override (live preview) │ +│ • Re-render + download MP4 │ +└──────────────────────────────────────────────────────────────────┘ +``` -- questions -- punchlines -- emotional peaks -- key information -- channel niche -- preferred clip style -- target platform -- target clip length +--- -### 4. Multimodal Analysis +## AMD GPU Optimizations -The multimodal service boundary is implemented in `backend/app/services/multimodal.py`. 
+- **ROCm 6.3** — all model inference on AMD Instinct MI300X +- **vLLM** — serves Qwen2.5-VL with continuous batching and PagedAttention +- **SDPA attention** — PyTorch 2.0 Scaled Dot-Product Attention for Whisper (4.45× faster on ROCm) +- **float16 inference** — 7B model fits in ~14 GB VRAM, leaves 50+ GB for large videos +- **h264_amf** — AMD VCE hardware encoder for clip extraction (falls back to libx264) +- **Parallel pipeline** — scene detection (CPU) + Whisper (GPU) run simultaneously +- **Concurrent vLLM requests** — all scenes sent to Qwen2.5-VL in parallel; server batches them -Planned production target: +--- -- Model: `Qwen/Qwen2-VL-7B-Instruct` -- Inputs: sampled video frames, transcript context, and clip candidates -- Visual signals: - - creator or guest reactions - - scene changes - - on-screen text - - high-motion segments +## Two Output Modes + +### Normal Subtitles +Full creative control over: +- Font family (Noto Sans Thai, Noto Sans SC, Montserrat, Impact, ...) +- Font size, bold/italic/underline +- 4-layer ASS colors: primary, secondary, outline, shadow +- Display mode: word-by-word or sentence +- Animation: Fade / Karaoke / Pop / Typewriter / Bounce +- Alignment (3×3 grid) + margin sliders +- Per-subtitle-line style overrides in the editor + +### High-Retention Editing (HRE) +AI chooses everything: +- Silence removal (`ffmpeg silenceremove`) +- Auto-zoom to face region (`ffmpeg zoompan` using Qwen2.5-VL face_bbox) +- Jump cuts at scene boundaries +- Qwen2.5-VL selects contextually-appropriate emoji overlay +- Impact 64px bold white captions, word-by-word, pop animation -This is isolated as a replaceable pipeline step so it can be enabled when AMD Cloud resources are available. +--- -### 5. Clip Generation +## Multilingual Support -Clip rendering is implemented in `backend/app/services/clips.py`. 
+| Layer | Coverage | +|---|---| +| UI language | ไทย · English · 中文 | +| Video input language | Auto-detect + 15+ (Whisper) | +| Subtitle output language | Thai (Noto Sans Thai) · Chinese (Noto Sans SC) · Japanese (Noto Sans JP) · Korean (Noto Sans KR) · English + more | +| Cross-lingual | Whisper translate → English when English subtitles are requested; multilingual transcription/subtitle timing uses Whisper language support | +| Character-level splitting | Thai and Chinese use character-level subtitle timing (no word spaces) | -The ffmpeg stage: +--- -- cuts video by selected timestamps -- exports MP4 -- creates `.srt` subtitle files -- supports subtitle burn-in -- reformats to 9:16 vertical output for short-form platforms -- includes AMD hardware encoder configuration hooks +## Tech Stack + +| Layer | Technology | +|---|---| +| Vision AI | **Qwen2.5-VL-7B-Instruct** (Apache 2.0) via vLLM | +| Speech-to-Text | **insanely-fast-whisper** with PyTorch SDPA on ROCm | +| Audio Analysis | **librosa** — RMS energy per scene | +| Scene Detection | **PySceneDetect** — ContentDetector | +| Video Download | **yt-dlp** | +| Video Processing | **ffmpeg** (AMD AMF hardware encode) | +| Subtitle Engine | **pysubs2** — full ASS format with karaoke tags | +| GPU | **AMD Instinct MI300X** via ROCm 6.3 | +| Frontend | **Next.js 16.2.4** App Router + Tailwind CSS | +| Backend | **FastAPI** + WebSocket (real-time progress) | +| Deployment | HuggingFace Spaces public demo + AMD GPU Cloud backend | -### 6. Human-AI Collaborative Editing +--- -The frontend editor lets creators review AI-generated clips and make final decisions: +## Judge Demo -- adjust start and end timestamps -- edit subtitle text -- delete weak clips -- approve good clips -- regenerate a specific clip -- download the result +Public visitors can open the HuggingFace Space and click **Try Demo** to see a simulated flow without using AMD GPU credits. 
Full AMD MI300X generation is protected by an access code shared only in the lablab.ai submission notes for judges. -## API Overview +Recommended judging flow: +1. Open the HuggingFace Space. +2. Click **Try Demo** for the instant public demo. +3. Enter the judge access code from the lablab.ai submission notes to run real generation on AMD GPU Cloud. +4. Upload a short MP4 sample for the real run. -| Method | Endpoint | Description | -| --- | --- | --- | -| `GET` | `/health` | Returns service health and accelerator detection. | -| `POST` | `/api/jobs/youtube` | Creates a processing job from a YouTube URL. | -| `POST` | `/api/jobs/upload` | Creates a processing job from an uploaded video. | -| `GET` | `/api/jobs/{job_id}` | Returns status, transcript, clips, timings, and errors. | -| `PATCH` | `/api/jobs/{job_id}/clips/{clip_id}` | Updates trim times, subtitles, approval, or deletion state. | -| `POST` | `/api/jobs/{job_id}/clips/{clip_id}/regenerate` | Re-renders one clip with updated parameters. | -| `GET` | `/api/jobs/{job_id}/clips/{clip_id}/download` | Downloads an exported clip. | +--- ## Local Development -### Requirements - -- Python 3.11+ -- Node.js 20+ -- ffmpeg - -### Backend +For the real development/demo path, run the frontend locally and point it at the AMD GPU Cloud backend: -```bash -cd backend -python -m venv .venv -. .venv/bin/activate -pip install -e . -uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 -``` - -On Windows PowerShell: - -```powershell -cd backend -python -m venv .venv -.\.venv\Scripts\Activate.ps1 -pip install -e . 
-uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +```env +# frontend/.env.local +NEXT_PUBLIC_API_URL=http://129.212.178.101:8080 +NEXT_PUBLIC_DEMO_ENABLED=true +NEXT_PUBLIC_DEMO_ONLY=false ``` -### Frontend - ```bash cd frontend npm install -npm run dev +npm run dev # http://localhost:3000 ``` -Open: - -```text -http://localhost:5173 -``` +The AMD GPU Cloud backend runs FastAPI on `:8080` and vLLM/Qwen2.5-VL on `:8000`. For development without a GPU, the backend can still run with fallback stubs (stubbed Whisper, fallback vision scores). -### Demo Mode - -By default, the project runs in demo mode: - -```env -DEMO_MODE=true -``` +--- -Demo mode avoids downloading multi-GB AI models and returns deterministic mock transcript/highlight data while still exercising the API, UI, job state, timing logs, subtitle generation, and ffmpeg rendering path. +## Safe Public Demo Setup -## AMD Developer Cloud Deployment +ElevenClip AI supports three deployment modes: -See [infra/amd-cloud.md](infra/amd-cloud.md) for a focused deployment guide. 
+| Mode | Frontend runs on | Backend/vLLM runs on | Use when | +|---|---|---|---| +| Local dev | Your laptop (`localhost:3000`) | AMD GPU Cloud (`129.212.178.101:8080`) | Iterating quickly while using MI300X remotely | +| HF public shell | HuggingFace Space CPU | AMD GPU Cloud | Public hackathon page, real generation gated by access code | +| HF self-contained GPU | HuggingFace Space | HuggingFace Space GPU | Only if the Space has suitable ROCm/AMD GPU hardware | -High-level steps: - -```bash -git clone https://github.com/JakgritB/ElevenClip.AI.git -cd ElevenClip.AI -cp .env.example .env -``` - -Edit `.env`: +For the current CPU Basic HuggingFace Space, use it as the public UI and keep real generation on AMD GPU Cloud: ```env -DEMO_MODE=false -HF_TOKEN=your_huggingface_token -WHISPER_MODEL_ID=openai/whisper-large-v3 -QWEN_TEXT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct -QWEN_VL_MODEL_ID=Qwen/Qwen2-VL-7B-Instruct -``` - -Install the AI/ROCm stack on the AMD instance: - -```bash -cd backend -pip install -e ".[ai,rocm-inference]" -``` - -Start the API: - -```bash -uvicorn app.main:app --host 0.0.0.0 --port 8000 -``` - -Validate accelerator detection: - -```bash -curl http://localhost:8000/health -``` - -Expected on AMD Cloud: - -- `torch_available: true` -- `cuda_api_available: true` -- `rocm_hip_version` populated -- MI300X visible as the active device - -## Docker - -```bash -docker compose up --build +# frontend/.env.local for local development +NEXT_PUBLIC_API_URL=http://129.212.178.101:8080 +NEXT_PUBLIC_DEMO_ENABLED=true +NEXT_PUBLIC_DEMO_ONLY=false ``` -For AMD Developer Cloud with ROCm extras: +On the AMD GPU Cloud backend, protect expensive GPU endpoints before exposing the demo: ```bash -docker compose build --build-arg INSTALL_EXTRAS=.[ai,rocm-inference] backend -docker compose up +export DEMO_ACCESS_CODE="share-this-only-with-judges" +export MAX_CONCURRENT_JOBS=1 +export MAX_UPLOAD_MB=300 +export VLLM_IDLE_TIMEOUT=300 ``` -The compose file mounts AMD GPU devices 
(`/dev/kfd`, `/dev/dri`) and uses host IPC for large-model inference. - -## Benchmark Plan - -The hackathon judges care about technology application and real-world performance. ElevenClip.AI includes step-level timing logs so the demo can show why AMD acceleration matters. - -Run a benchmark against a running API: - -```bash -python scripts/benchmark.py \ - --api http://localhost:8000 \ - --youtube-url "https://youtube.com/watch?v=..." -``` - -Recommended benchmark comparison: - -| Scenario | Hardware | Expected Purpose | -| --- | --- | --- | -| CPU baseline | CPU-only runtime | Show the pain of long-form video processing without acceleration. | -| AMD GPU run | AMD Instinct MI300X + ROCm | Show high-throughput transcription and Qwen inference. | - -Metrics captured: - -- input/download time -- transcription time -- highlight detection time -- multimodal analysis time -- clip generation time -- total wall-clock time -- number of clips generated +When `DEMO_ACCESS_CODE` is set, `/api/process`, `/api/video-info`, and vLLM start/stop endpoints require the `X-Demo-Key` header. The frontend shows a Demo Access Code field and sends that header automatically. Leave `DEMO_ACCESS_CODE` unset only for private/local testing. -Demo target: +For a self-contained HuggingFace GPU Space, leave `NEXT_PUBLIC_API_URL=""` so nginx routes `/api`, `/ws`, and `/downloads` to FastAPI inside the same Space. Only use this mode if the Space hardware is actually GPU-capable. -- input: two-hour creator video -- output: 10 subtitle-ready clips -- goal: under 10 minutes on MI300X +For the public HuggingFace Space, set `NEXT_PUBLIC_DEMO_ONLY=true`. Visitors can open the UI and run the simulated demo without touching AMD GPU credits. Judges can enter the access code to run real generation against the protected AMD GPU Cloud backend. 
-## Submission Assets Checklist - -The lablab.ai submission asks for: - -- Project title: `ElevenClip.AI` -- Short description -- Long description -- Technology and category tags -- Cover image -- Video presentation -- Slide presentation -- Public GitHub repository -- Demo application platform -- Application URL - -Prepared submission docs: - -- `docs/SUBMISSION.md` - copy-ready project text for lablab.ai. -- `docs/DEMO_SCRIPT.md` - draft and final recording script. -- `docs/PITCH_DECK.md` - slide outline for the presentation deck. -- `docs/BUILD_IN_PUBLIC.md` - social post drafts and AMD feedback notes. -- `docs/AMD_CREDIT_RUNBOOK.md` - checklist for the first MI300X run. - -Recommended tags: - -```text -AMD, ROCm, MI300X, AMD Developer Cloud, Vision AI, Multimodal AI, Video AI, Whisper, Qwen, Qwen-VL, Hugging Face, FastAPI, React -``` - -## Suggested Short Description - -```text -ElevenClip.AI turns long-form videos into personalized short-form clips using Whisper, Qwen, Hugging Face, and AMD ROCm on MI300X. -``` - -## Suggested Long Description - -```text -ElevenClip.AI is a human-AI collaborative clip studio for creators. It takes a YouTube URL or uploaded long-form video, transcribes it with Whisper Large V3, uses Qwen2.5 to identify high-engagement highlight moments based on a reusable channel profile, optionally enriches candidates with Qwen2-VL visual analysis, and renders short-form MP4 clips with subtitles using ffmpeg. The React editor lets creators trim, edit subtitles, approve, delete, regenerate, and download final clips. The project is designed for AMD Developer Cloud with ROCm and AMD Instinct MI300X acceleration, demonstrating how high-throughput multimodal AI can reduce hours of manual editing into a fast creator workflow. -``` - -## Judging Criteria Mapping - -### Application of Technology - -ElevenClip.AI integrates Whisper, Qwen2.5, Qwen2-VL, Hugging Face, ROCm, vLLM, and AMD Developer Cloud into an end-to-end video processing product. 
- -### Presentation - -The demo is designed to be visual and easy to understand: input a long video, watch AI create candidates, edit clips, and download platform-ready MP4 files. - -### Business Value - -The product targets a real creator economy workflow. Creators, agencies, podcasters, educators, and streamers all need short-form repurposing, and manual editing is expensive. - -### Originality - -The system goes beyond generic clipping by personalizing highlight selection to a creator's niche, style, language, clip length, and platform. It also preserves human editorial control instead of fully automating final publishing. - -## Build-in-Public Plan - -The hackathon includes a build-in-public challenge. Suggested updates: - -1. Share the architecture and first local demo. -2. Share AMD Cloud/ROCm setup notes and benchmark results. -3. Publish meaningful feedback about ROCm, AMD Developer Cloud, or inference setup. - -Suggested hashtags/topics: +--- -```text -#AMDDeveloperHackathon #ROCm #MI300X #HuggingFace #Qwen #VideoAI #MultimodalAI -``` +## Hackathon Compliance -## Roadmap +| Requirement | Status | +|---|---| +| Track 3: Vision & Multimodal AI | ✅ Qwen2.5-VL processes frames + audio simultaneously | +| AMD Developer Cloud | ✅ All inference on AMD Instinct MI300X via ROCm 6.3 | +| ROCm acceleration | ✅ vLLM + SDPA Whisper + h264_amf encoder | +| Qwen partner integration | ✅ Qwen2.5-VL as primary multimodal model and text/emoji prompt model | +| HuggingFace Space | ✅ `lablab-ai-amd-developer-hackathon/ElevenClip-AI` | +| Public GitHub repo | ✅ `JakgritB/ElevenClip-AI` | +| Ship It challenge | ✅ Social posts tagging @AIatAMD + @lablab | +| MIT license | ✅ | -- Real Whisper Large V3 run on AMD Developer Cloud. -- Real Qwen2.5 vLLM ROCm inference path. -- Qwen2-VL frame sampling and visual scoring. -- Batch export for 10+ clips. -- Subtitle styling presets per platform. -- Creator profile memory and reusable brand presets. 
-- Hugging Face Space screenshot and richer project media. -- CPU vs MI300X benchmark report after AMD credits arrive. +--- ## License -MIT. See [LICENSE](LICENSE). +MIT — see [LICENSE](LICENSE) diff --git a/backend/Dockerfile b/backend/Dockerfile deleted file mode 100644 index ae8ae78fdc566b745e023c830d40d55414cf664c..0000000000000000000000000000000000000000 --- a/backend/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -ARG ROCM_PYTORCH_IMAGE=rocm/pytorch:latest -FROM ${ROCM_PYTORCH_IMAGE} - -WORKDIR /app - -RUN apt-get update \ - && apt-get install -y --no-install-recommends ffmpeg git curl \ - && rm -rf /var/lib/apt/lists/* - -COPY pyproject.toml ./ -ARG INSTALL_EXTRAS=. -RUN pip install --upgrade pip && pip install -e "${INSTALL_EXTRAS}" - -COPY app ./app - -EXPOSE 8000 -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/backend/app/__init__.py b/backend/app/__init__.py deleted file mode 100644 index 4ceb363cf661693e08dc1aa362b4713d9514a2d3..0000000000000000000000000000000000000000 --- a/backend/app/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""AI Clip Studio backend.""" diff --git a/backend/app/core/__init__.py b/backend/app/core/__init__.py deleted file mode 100644 index 5cc273bfa5be7330a96f7e9df046d7aded224d91..0000000000000000000000000000000000000000 --- a/backend/app/core/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Core configuration and instrumentation.""" diff --git a/backend/app/core/config.py b/backend/app/core/config.py deleted file mode 100644 index b8eb7ee4ca8e45853def357112fab0a99743db77..0000000000000000000000000000000000000000 --- a/backend/app/core/config.py +++ /dev/null @@ -1,68 +0,0 @@ -from functools import lru_cache -import os -from pathlib import Path - -from pydantic import Field -from pydantic import BaseModel - - -class Settings(BaseModel): - app_name: str = "ElevenClip.AI" - demo_mode: bool = True - storage_dir: Path = Path("data") - frontend_origin: str = "http://localhost:5173" - - whisper_model_id: str = 
"openai/whisper-large-v3" - qwen_text_model_id: str = "Qwen/Qwen2.5-7B-Instruct" - qwen_vl_model_id: str = "Qwen/Qwen2-VL-7B-Instruct" - hf_token: str | None = None - preferred_torch_dtype: str = "bfloat16" - - target_clip_count: int = Field(default=5, ge=1, le=20) - max_clips: int = Field(default=10, ge=1, le=50) - - ffmpeg_binary: str = "ffmpeg" - ffprobe_binary: str = "ffprobe" - ffmpeg_video_codec: str = "h264_amf" - ffmpeg_cpu_codec: str = "libx264" - - redis_url: str = "redis://redis:6379/0" - celery_enabled: bool = False - - -@lru_cache -def get_settings() -> Settings: - settings = Settings( - demo_mode=_bool_env("DEMO_MODE", True), - storage_dir=Path(os.getenv("STORAGE_DIR", "data")), - frontend_origin=os.getenv("FRONTEND_ORIGIN", "http://localhost:5173"), - whisper_model_id=os.getenv("WHISPER_MODEL_ID", "openai/whisper-large-v3"), - qwen_text_model_id=os.getenv("QWEN_TEXT_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct"), - qwen_vl_model_id=os.getenv("QWEN_VL_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct"), - hf_token=os.getenv("HF_TOKEN") or None, - preferred_torch_dtype=os.getenv("TORCH_DTYPE", "bfloat16"), - target_clip_count=_int_env("TARGET_CLIP_COUNT", 5), - max_clips=_int_env("MAX_CLIPS", 10), - ffmpeg_binary=os.getenv("FFMPEG_BINARY", "ffmpeg"), - ffprobe_binary=os.getenv("FFPROBE_BINARY", "ffprobe"), - ffmpeg_video_codec=os.getenv("FFMPEG_VIDEO_CODEC", "h264_amf"), - ffmpeg_cpu_codec=os.getenv("FFMPEG_CPU_CODEC", "libx264"), - redis_url=os.getenv("REDIS_URL", "redis://redis:6379/0"), - celery_enabled=_bool_env("CELERY_ENABLED", False), - ) - settings.storage_dir.mkdir(parents=True, exist_ok=True) - return settings - - -def _bool_env(name: str, default: bool) -> bool: - value = os.getenv(name) - if value is None: - return default - return value.strip().lower() in {"1", "true", "yes", "on"} - - -def _int_env(name: str, default: int) -> int: - value = os.getenv(name) - if value is None: - return default - return int(value) diff --git a/backend/app/core/timing.py 
b/backend/app/core/timing.py deleted file mode 100644 index 4cadb8581f7d865521b82172fddd4eaddd3c7dcf..0000000000000000000000000000000000000000 --- a/backend/app/core/timing.py +++ /dev/null @@ -1,20 +0,0 @@ -from collections.abc import Iterator -from contextlib import contextmanager -from time import perf_counter - - -class TimingLog: - def __init__(self) -> None: - self._steps: dict[str, float] = {} - - @contextmanager - def measure(self, name: str) -> Iterator[None]: - started = perf_counter() - try: - yield - finally: - self._steps[name] = round(perf_counter() - started, 3) - - def to_dict(self) -> dict[str, float]: - total = round(sum(self._steps.values()), 3) - return {**self._steps, "total": total} diff --git a/backend/app/main.py b/backend/app/main.py deleted file mode 100644 index 73f244d373cada8d46a1fc45a8c5f48073709dc1..0000000000000000000000000000000000000000 --- a/backend/app/main.py +++ /dev/null @@ -1,240 +0,0 @@ -from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import FileResponse -from fastapi.staticfiles import StaticFiles - -from app.core.config import get_settings -from app.models.schemas import ( - ChannelProfile, - ClipCandidate, - ClipPatch, - HealthResponse, - JobSnapshot, - PolishSubtitlesRequest, - RegenerateClipRequest, - SubtitleCue, - TranslateSubtitlesRequest, - YoutubeJobRequest, -) -from app.services.highlight import QwenHighlightDetector -from app.services.pipeline import VideoPipeline -from app.services.transcription import WhisperTranscriber -from app.services.video_input import save_upload -from app.storage import JobStore -from app.utils.rocm import detect_accelerator - -settings = get_settings() -store = JobStore(settings) -pipeline = VideoPipeline(settings, store) -highlight_detector = QwenHighlightDetector(settings) -transcriber = WhisperTranscriber(settings) - -app = FastAPI(title=settings.app_name, version="0.1.0") 
-app.add_middleware( - CORSMiddleware, - allow_origins=[settings.frontend_origin, "http://localhost:5173", "http://127.0.0.1:5173"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -app.mount("/media", StaticFiles(directory=settings.storage_dir), name="media") - - -@app.get("/health", response_model=HealthResponse) -async def health() -> HealthResponse: - return HealthResponse( - ok=True, - app=settings.app_name, - demo_mode=settings.demo_mode, - accelerator=detect_accelerator(), - ) - - -@app.post("/api/jobs/youtube", response_model=JobSnapshot) -async def create_youtube_job( - request: YoutubeJobRequest, background_tasks: BackgroundTasks -) -> JobSnapshot: - snapshot = store.create_job( - request.profile, {"kind": "youtube", "url": str(request.youtube_url)} - ) - background_tasks.add_task( - pipeline.process_source, snapshot.id, "youtube", str(request.youtube_url), request.profile - ) - return snapshot - - -@app.post("/api/jobs/upload", response_model=JobSnapshot) -async def create_upload_job( - background_tasks: BackgroundTasks, - profile_json: str = Form(...), - file: UploadFile = File(...), -) -> JobSnapshot: - try: - profile = ChannelProfile.model_validate_json(profile_json) - except Exception as exc: - raise HTTPException(status_code=422, detail=f"Invalid profile JSON: {exc}") from exc - - snapshot = store.create_job(profile, {"kind": "upload", "filename": file.filename}) - source_path = await save_upload(file, store.job_dir(snapshot.id)) - background_tasks.add_task(pipeline.process_source, snapshot.id, "upload", str(source_path), profile) - return snapshot - - -@app.get("/api/jobs/{job_id}", response_model=JobSnapshot) -async def get_job(job_id: str) -> JobSnapshot: - try: - return store.get_job(job_id) - except FileNotFoundError as exc: - raise HTTPException(status_code=404, detail="Job not found") from exc - - -@app.patch("/api/jobs/{job_id}/clips/{clip_id}", response_model=ClipCandidate) -async def update_clip(job_id: str, 
clip_id: str, patch: ClipPatch) -> ClipCandidate: - try: - return pipeline.patch_clip(job_id, clip_id, patch.model_dump()) - except FileNotFoundError as exc: - raise HTTPException(status_code=404, detail="Job not found") from exc - except KeyError as exc: - raise HTTPException(status_code=404, detail="Clip not found") from exc - - -@app.post("/api/jobs/{job_id}/clips/{clip_id}/regenerate", response_model=ClipCandidate) -async def regenerate_clip( - job_id: str, clip_id: str, request: RegenerateClipRequest -) -> ClipCandidate: - try: - return pipeline.regenerate_clip( - job_id, - clip_id, - clip_style=request.clip_style, - clip_length_seconds=request.clip_length_seconds, - subtitle_text=request.subtitle_text, - ) - except FileNotFoundError as exc: - raise HTTPException(status_code=404, detail="Source video not found") from exc - except KeyError as exc: - raise HTTPException(status_code=404, detail="Clip not found") from exc - - -@app.get("/api/jobs/{job_id}/clips/{clip_id}/download") -async def download_clip(job_id: str, clip_id: str) -> FileResponse: - snapshot = store.get_job(job_id) - clip = next((item for item in snapshot.clips if item.id == clip_id), None) - if clip is None or clip.download_url is None: - raise HTTPException(status_code=404, detail="Clip not found") - filename = clip.download_url.rsplit("/", 1)[-1] - path = store.job_dir(job_id) / filename - if not path.exists(): - raise HTTPException(status_code=404, detail="Clip file not found") - return FileResponse(path, media_type="video/mp4", filename=filename) - - -# ───────────────────────────────────────────────────────────────── -# AI subtitle endpoints — work in demo mode immediately, switch to -# real Qwen / Whisper output once DEMO_MODE=false on AMD GPU cloud. -# ───────────────────────────────────────────────────────────────── - - -def _resolve_clip_cues(snapshot: JobSnapshot, clip: ClipCandidate) -> list[SubtitleCue]: - """Return the cue list to operate on. 
Prefer explicit subtitle_cues; fall - back to splitting subtitle_text into evenly-spaced cues.""" - if clip.subtitle_cues: - return [SubtitleCue(**cue.model_dump()) for cue in clip.subtitle_cues] - duration = max(0.5, clip.end_seconds - clip.start_seconds) - text = clip.subtitle_text.strip() - if not text: - return [SubtitleCue(start_seconds=0.0, end_seconds=duration, text="")] - # Reuse Whisper aligner's deterministic chunking for fallback - return transcriber._demo_align_words(text, 0.0, duration) - - -@app.post( - "/api/jobs/{job_id}/clips/{clip_id}/subtitle/polish", - response_model=ClipCandidate, -) -async def polish_clip_subtitles( - job_id: str, clip_id: str, request: PolishSubtitlesRequest -) -> ClipCandidate: - try: - snapshot = store.get_job(job_id) - except FileNotFoundError as exc: - raise HTTPException(status_code=404, detail="Job not found") from exc - clip = next((c for c in snapshot.clips if c.id == clip_id), None) - if clip is None: - raise HTTPException(status_code=404, detail="Clip not found") - - cues_in = _resolve_clip_cues(snapshot, clip) - polished = highlight_detector.polish_subtitles(cues_in, style=request.style) - return pipeline.patch_clip( - job_id, - clip_id, - { - "subtitle_cues": [cue.model_dump() for cue in polished], - "subtitle_text": " ".join(cue.text for cue in polished if cue.text), - }, - ) - - -@app.post( - "/api/jobs/{job_id}/clips/{clip_id}/subtitle/translate", - response_model=ClipCandidate, -) -async def translate_clip_subtitles( - job_id: str, clip_id: str, request: TranslateSubtitlesRequest -) -> ClipCandidate: - try: - snapshot = store.get_job(job_id) - except FileNotFoundError as exc: - raise HTTPException(status_code=404, detail="Job not found") from exc - clip = next((c for c in snapshot.clips if c.id == clip_id), None) - if clip is None: - raise HTTPException(status_code=404, detail="Clip not found") - - cues_in = _resolve_clip_cues(snapshot, clip) - translated = highlight_detector.translate_subtitles(cues_in, 
request.target_language) - return pipeline.patch_clip( - job_id, - clip_id, - { - "subtitle_cues": [cue.model_dump() for cue in translated], - "subtitle_text": " ".join(cue.text for cue in translated if cue.text), - }, - ) - - -@app.post( - "/api/jobs/{job_id}/clips/{clip_id}/subtitle/auto-time", - response_model=ClipCandidate, -) -async def auto_time_clip_subtitles(job_id: str, clip_id: str) -> ClipCandidate: - try: - snapshot = store.get_job(job_id) - except FileNotFoundError as exc: - raise HTTPException(status_code=404, detail="Job not found") from exc - clip = next((c for c in snapshot.clips if c.id == clip_id), None) - if clip is None: - raise HTTPException(status_code=404, detail="Clip not found") - - text = clip.subtitle_text or " ".join( - (cue.text for cue in (clip.subtitle_cues or []) if cue.text) - ) - # Best-effort: production mode uses the actual source video on disk; demo - # mode uses synthetic chunking that doesn't require the file at all. - source_path = "" - try: - for entry in store.job_dir(job_id).iterdir(): - if entry.suffix.lower() in {".mp4", ".mkv", ".mov", ".webm"}: - source_path = str(entry) - break - except Exception: - source_path = "" - - timed = transcriber.align_words(source_path, text, clip.start_seconds, clip.end_seconds) - return pipeline.patch_clip( - job_id, - clip_id, - { - "subtitle_cues": [cue.model_dump() for cue in timed], - "subtitle_text": " ".join(cue.text for cue in timed if cue.text), - }, - ) diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py deleted file mode 100644 index 7e7e0baec71a634c0056d316380a2d3f55b3adc1..0000000000000000000000000000000000000000 --- a/backend/app/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Pydantic models.""" diff --git a/backend/app/models/schemas.py b/backend/app/models/schemas.py deleted file mode 100644 index e889da11ce074006417b42c936fca45d8fb67b6b..0000000000000000000000000000000000000000 --- a/backend/app/models/schemas.py +++ /dev/null @@ -1,127 +0,0 
@@ -from datetime import datetime, timezone -from enum import Enum -from typing import Any, Literal - -from pydantic import BaseModel, Field, HttpUrl, field_validator - - -def utc_now() -> datetime: - return datetime.now(timezone.utc) - - -class TargetPlatform(str, Enum): - tiktok = "tiktok" - youtube_shorts = "youtube_shorts" - instagram_reels = "instagram_reels" - - -class ChannelProfile(BaseModel): - niche: str = Field(default="education", min_length=2, max_length=80) - niche_custom: str = Field(default="", max_length=80) - channel_description: str = Field(default="", max_length=700) - clip_style: str = Field(default="informative", min_length=2, max_length=80) - clip_length_seconds: int = Field(default=60, ge=15, le=180) - clip_count: int = Field(default=5, ge=1, le=20) - primary_language: str = Field(default="Thai", min_length=2, max_length=40) - target_platform: TargetPlatform = TargetPlatform.tiktok - - @field_validator("niche", "niche_custom", "channel_description", "clip_style", "primary_language") - @classmethod - def clean_text(cls, value: str) -> str: - return value.strip() - - -class YoutubeJobRequest(BaseModel): - youtube_url: HttpUrl - profile: ChannelProfile - - -class TranscriptSegment(BaseModel): - id: str - start_seconds: float = Field(ge=0) - end_seconds: float = Field(ge=0) - text: str - language: str | None = None - - -class SubtitleCue(BaseModel): - """A single subtitle line with explicit timing relative to clip start.""" - - start_seconds: float = Field(ge=0) - end_seconds: float = Field(ge=0) - text: str = "" - - -class SkipRange(BaseModel): - """A range to splice out of the middle of a clip (relative to clip start).""" - - start_seconds: float = Field(ge=0) - end_seconds: float = Field(ge=0) - - -class ClipCandidate(BaseModel): - id: str - start_seconds: float = Field(ge=0) - end_seconds: float = Field(ge=0) - title: str - reason: str - score: float = Field(ge=0, le=100) - subtitle_text: str = "" - subtitle_cues: list[SubtitleCue] | None = 
None - skip_ranges: list[SkipRange] | None = None - video_url: str | None = None - download_url: str | None = None - approved: bool = False - deleted: bool = False - metadata: dict[str, Any] = Field(default_factory=dict) - - -class ClipPatch(BaseModel): - start_seconds: float | None = Field(default=None, ge=0) - end_seconds: float | None = Field(default=None, ge=0) - subtitle_text: str | None = None - subtitle_cues: list[SubtitleCue] | None = None - skip_ranges: list[SkipRange] | None = None - approved: bool | None = None - deleted: bool | None = None - - -class RegenerateClipRequest(BaseModel): - clip_style: str | None = None - clip_length_seconds: int | None = Field(default=None, ge=15, le=180) - subtitle_text: str | None = None - - -class TranslateSubtitlesRequest(BaseModel): - target_language: str = Field(min_length=2, max_length=40) - - -class PolishSubtitlesRequest(BaseModel): - style: str | None = None - - -class JobSnapshot(BaseModel): - id: str - status: Literal["queued", "running", "completed", "failed"] - progress: float = Field(ge=0, le=1) - message: str - current_step: str = "" - step_index: int = Field(default=0, ge=0) - step_total: int = Field(default=6, ge=1) - active_clip_index: int = Field(default=0, ge=0) - active_clip_total: int = Field(default=0, ge=0) - source: dict[str, Any] - profile: ChannelProfile - transcript: list[TranscriptSegment] = Field(default_factory=list) - clips: list[ClipCandidate] = Field(default_factory=list) - timings: dict[str, float] = Field(default_factory=dict) - error: str | None = None - created_at: datetime = Field(default_factory=utc_now) - updated_at: datetime = Field(default_factory=utc_now) - - -class HealthResponse(BaseModel): - ok: bool - app: str - demo_mode: bool - accelerator: dict[str, Any] diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py deleted file mode 100644 index 42fa497d20196761c1687b2e7e09031c1abd61e0..0000000000000000000000000000000000000000 --- 
a/backend/app/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Pipeline services.""" diff --git a/backend/app/services/clips.py b/backend/app/services/clips.py deleted file mode 100644 index 20fe9cbde681900dfe7282fa01b670c6a885192d..0000000000000000000000000000000000000000 --- a/backend/app/services/clips.py +++ /dev/null @@ -1,219 +0,0 @@ -import shutil -import subprocess -from pathlib import Path -from typing import Callable - -from app.core.config import Settings -from app.models.schemas import ChannelProfile, ClipCandidate, TranscriptSegment -from app.services.subtitles import write_single_caption_srt, write_srt, write_srt_from_cues -from app.storage import JobStore - - -class ClipGenerator: - def __init__(self, settings: Settings, store: JobStore) -> None: - self.settings = settings - self.store = store - - def generate( - self, - job_id: str, - video_path: Path, - clips: list[ClipCandidate], - transcript: list[TranscriptSegment], - profile: ChannelProfile, - progress_callback: Callable[[int, int], None] | None = None, - ) -> list[ClipCandidate]: - rendered: list[ClipCandidate] = [] - total = len(clips) - for index, clip in enumerate(clips, start=1): - if progress_callback: - progress_callback(index, total) - rendered.append(self.render_one(job_id, video_path, clip, transcript, profile, index)) - return rendered - - def render_one( - self, - job_id: str, - video_path: Path, - clip: ClipCandidate, - transcript: list[TranscriptSegment], - profile: ChannelProfile, - index: int = 1, - ) -> ClipCandidate: - job_dir = self.store.job_dir(job_id) - output_name = f"clip_{index:02}_{clip.id[:8]}.mp4" - subtitle_name = f"clip_{index:02}_{clip.id[:8]}.srt" - output_path = job_dir / output_name - subtitle_path = job_dir / subtitle_name - - duration = max(1.0, clip.end_seconds - clip.start_seconds) - if clip.subtitle_cues: - subtitle_cues = write_srt_from_cues(subtitle_path, clip.subtitle_cues) - elif clip.subtitle_text.strip(): - subtitle_cues = 
write_single_caption_srt(subtitle_path, duration, clip.subtitle_text) - else: - subtitle_cues = write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript) - self._run_ffmpeg(video_path, output_path, subtitle_path, clip, profile) - - clip.video_url = self.store.media_url(job_id, output_name) - clip.download_url = clip.video_url - clip.metadata["subtitle_file"] = self.store.media_url(job_id, subtitle_name) - clip.metadata["subtitle_cues"] = subtitle_cues - return clip - - def _run_ffmpeg( - self, - video_path: Path, - output_path: Path, - subtitle_path: Path, - clip: ClipCandidate, - profile: ChannelProfile, - ) -> None: - ffmpeg = shutil.which(self.settings.ffmpeg_binary) - if not ffmpeg or not video_path.exists() or video_path.stat().st_size == 0: - output_path.write_bytes(b"") - return - - keep_ranges = self._compute_keep_ranges(clip) - post_filters = [self._platform_filter(profile), self._subtitle_filter(subtitle_path)] - post_chain = ",".join(post_filters) - - if len(keep_ranges) <= 1: - start, end = keep_ranges[0] - command = [ - ffmpeg, - "-y", - "-ss", - f"{start:.3f}", - "-i", - str(video_path), - "-t", - f"{max(0.5, end - start):.3f}", - "-vf", - post_chain, - "-c:v", - self.settings.ffmpeg_video_codec, - "-c:a", - "aac", - "-b:a", - "160k", - "-movflags", - "+faststart", - str(output_path), - ] - else: - # Build concat filter that keeps multiple segments and skips middle ranges - parts = [] - labels_v = [] - labels_a = [] - for i, (start, end) in enumerate(keep_ranges): - parts.append( - f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}]" - ) - parts.append( - f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}]" - ) - labels_v.append(f"[v{i}]") - labels_a.append(f"[a{i}]") - concat_inputs = "".join( - f"{labels_v[i]}{labels_a[i]}" for i in range(len(keep_ranges)) - ) - parts.append( - f"{concat_inputs}concat=n={len(keep_ranges)}:v=1:a=1[vc][ac]" - ) - parts.append(f"[vc]{post_chain}[vout]") - 
filter_complex = ";".join(parts) - command = [ - ffmpeg, - "-y", - "-i", - str(video_path), - "-filter_complex", - filter_complex, - "-map", - "[vout]", - "-map", - "[ac]", - "-c:v", - self.settings.ffmpeg_video_codec, - "-c:a", - "aac", - "-b:a", - "160k", - "-movflags", - "+faststart", - str(output_path), - ] - - try: - subprocess.run(command, check=True, capture_output=True, text=True, timeout=180) - return - except Exception: - fallback = command.copy() - try: - fallback[fallback.index(self.settings.ffmpeg_video_codec)] = ( - self.settings.ffmpeg_cpu_codec - ) - except ValueError: - pass - try: - subprocess.run(fallback, check=True, capture_output=True, text=True, timeout=180) - return - except Exception: - output_path.write_bytes(b"") - - def _compute_keep_ranges(self, clip: ClipCandidate) -> list[tuple[float, float]]: - """Return absolute video time ranges to keep, after subtracting skip_ranges.""" - clip_start = float(clip.start_seconds) - clip_end = float(clip.end_seconds) - if not clip.skip_ranges: - return [(clip_start, clip_end)] - - # Skip ranges are relative to clip start. Convert to absolute and sort. 
- skips: list[tuple[float, float]] = [] - for skip in clip.skip_ranges: - s = clip_start + max(0.0, float(skip.start_seconds)) - e = clip_start + max(0.0, float(skip.end_seconds)) - if e > s: - skips.append((min(s, clip_end), min(e, clip_end))) - skips.sort() - - # Merge overlapping - merged: list[tuple[float, float]] = [] - for s, e in skips: - if merged and s <= merged[-1][1]: - merged[-1] = (merged[-1][0], max(merged[-1][1], e)) - else: - merged.append((s, e)) - - # Compute keep segments - keeps: list[tuple[float, float]] = [] - cursor = clip_start - for s, e in merged: - if s > cursor: - keeps.append((cursor, s)) - cursor = max(cursor, e) - if cursor < clip_end: - keeps.append((cursor, clip_end)) - - return keeps if keeps else [(clip_start, clip_end)] - - def _platform_filter(self, profile: ChannelProfile) -> str: - if profile.target_platform.value in {"tiktok", "youtube_shorts", "instagram_reels"}: - return "scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920" - return "scale=1280:720:force_original_aspect_ratio=decrease,pad=1280:720:(ow-iw)/2:(oh-ih)/2" - - def _subtitle_filter(self, subtitle_path: Path) -> str: - escaped = str(subtitle_path.resolve()).replace("\\", "/").replace(":", "\\:") - style = ( - "Fontname=Arial," - "Fontsize=22," - "PrimaryColour=&H00FFFFFF," - "OutlineColour=&H00000000," - "BorderStyle=1," - "Outline=2," - "Shadow=1," - "Alignment=2," - "MarginV=210" - ) - return f"subtitles='{escaped}':force_style='{style}'" diff --git a/backend/app/services/highlight.py b/backend/app/services/highlight.py deleted file mode 100644 index b8e3c286e37d62a519c410e0d2c301b0ecdacaf5..0000000000000000000000000000000000000000 --- a/backend/app/services/highlight.py +++ /dev/null @@ -1,434 +0,0 @@ -import json -import re -from uuid import uuid4 - -from app.core.config import Settings -from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment - - -class QwenHighlightDetector: - def __init__(self, settings: 
Settings) -> None: - self.settings = settings - self._llm = None - - def detect( - self, transcript: list[TranscriptSegment], profile: ChannelProfile - ) -> list[ClipCandidate]: - if self.settings.demo_mode: - return self._heuristic_detect(transcript, profile) - - try: - return self._qwen_detect(transcript, profile) - except Exception: - return self._heuristic_detect(transcript, profile) - - def _qwen_detect( - self, transcript: list[TranscriptSegment], profile: ChannelProfile - ) -> list[ClipCandidate]: - try: - from vllm import LLM, SamplingParams - except Exception as exc: - raise RuntimeError("vLLM with ROCm backend is required for Qwen inference") from exc - - if self._llm is None: - self._llm = LLM( - model=self.settings.qwen_text_model_id, - dtype=self.settings.preferred_torch_dtype, - trust_remote_code=True, - ) - - transcript_text = "\n".join( - f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}" - for segment in transcript - ) - niche = _effective_niche(profile) - channel_description = profile.channel_description or "No extra channel description provided." - clip_count = min(profile.clip_count, self.settings.max_clips) - prompt = f""" -You are selecting short-form clips for a creator. -Profile: -- niche: {niche} -- creator description: {channel_description} -- style: {profile.clip_style} -- target length seconds: {profile.clip_length_seconds} -- target number of clips: {clip_count} -- language: {profile.primary_language} -- platform: {profile.target_platform.value} - -Return strict JSON only. 
Shape: -[ - {{ - "start_seconds": 12.0, - "end_seconds": 72.0, - "title": "short title", - "reason": "why this will engage viewers", - "score": 91, - "subtitle_text": "clean subtitle text" - }} -] - -Transcript: -{transcript_text} -""".strip() - sampling = SamplingParams(temperature=0.2, max_tokens=1200) - outputs = self._llm.generate([prompt], sampling) - text = outputs[0].outputs[0].text - payload = self._parse_json_array(text) - clips = [ - ClipCandidate( - id=uuid4().hex, - start_seconds=float(item["start_seconds"]), - end_seconds=float(item["end_seconds"]), - title=str(item.get("title") or "Highlight"), - reason=str(item.get("reason") or "High engagement potential"), - score=float(item.get("score") or 75), - subtitle_text=str(item.get("subtitle_text") or ""), - metadata={"model": self.settings.qwen_text_model_id}, - ) - for item in payload[:clip_count] - ] - return clips or self._heuristic_detect(transcript, profile) - - def _parse_json_array(self, text: str) -> list[dict]: - match = re.search(r"\[[\s\S]*\]", text) - if not match: - raise ValueError("No JSON array in Qwen response") - payload = json.loads(match.group(0)) - if not isinstance(payload, list): - raise ValueError("Qwen response is not a list") - return payload - - # ────────────────────────────────────────────────────────────── - # AI subtitle actions (Polish, Translate) - # ────────────────────────────────────────────────────────────── - - def polish_subtitles( - self, cues: list[SubtitleCue], style: str | None = None - ) -> list[SubtitleCue]: - """Rewrite cue text to be punchier and more readable on short-form video. - - Demo mode returns deterministic polished text so the UX is testable - without GPU. Production mode calls Qwen2.5. 
- """ - if self.settings.demo_mode: - return self._heuristic_polish(cues, style) - try: - return self._qwen_polish(cues, style) - except Exception: - return self._heuristic_polish(cues, style) - - def translate_subtitles( - self, cues: list[SubtitleCue], target_language: str - ) -> list[SubtitleCue]: - """Translate cue text to target_language while preserving timing.""" - if self.settings.demo_mode: - return self._heuristic_translate(cues, target_language) - try: - return self._qwen_translate(cues, target_language) - except Exception: - return self._heuristic_translate(cues, target_language) - - # ────────────────────────────────────────────────────────────── - # Demo / fallback implementations - # ────────────────────────────────────────────────────────────── - - def _heuristic_polish( - self, cues: list[SubtitleCue], style: str | None - ) -> list[SubtitleCue]: - """Apply simple text transformations that look like an AI polish.""" - polished: list[SubtitleCue] = [] - for cue in cues: - text = (cue.text or "").strip() - if not text: - polished.append(cue.model_copy()) - continue - # Shorten redundant phrasing (heuristic) - text = re.sub(r"\s+", " ", text) - text = re.sub(r"^(so|well|like|um|uh|you know|i mean)[,\s]+", "", text, flags=re.IGNORECASE) - text = text.rstrip(" ,.;:") - # Add light emphasis based on style - if style and style.lower() == "dramatic" and not text.endswith("!"): - text = text + "!" 
- polished.append( - SubtitleCue( - start_seconds=cue.start_seconds, - end_seconds=cue.end_seconds, - text=text, - ) - ) - return polished - - def _heuristic_translate( - self, cues: list[SubtitleCue], target_language: str - ) -> list[SubtitleCue]: - """Demo translation: append a marker so the UX shows the action ran.""" - marker = f"[{target_language[:2].upper()}]" - translated: list[SubtitleCue] = [] - for cue in cues: - text = (cue.text or "").strip() - translated.append( - SubtitleCue( - start_seconds=cue.start_seconds, - end_seconds=cue.end_seconds, - text=f"{marker} {text}" if text else "", - ) - ) - return translated - - # ────────────────────────────────────────────────────────────── - # Production Qwen calls (used when DEMO_MODE=false on AMD GPU) - # ────────────────────────────────────────────────────────────── - - def _ensure_llm(self): - try: - from vllm import LLM - except Exception as exc: - raise RuntimeError("vLLM with ROCm backend is required for Qwen") from exc - if self._llm is None: - self._llm = LLM( - model=self.settings.qwen_text_model_id, - dtype=self.settings.preferred_torch_dtype, - trust_remote_code=True, - ) - return self._llm - - def _qwen_polish( - self, cues: list[SubtitleCue], style: str | None - ) -> list[SubtitleCue]: - from vllm import SamplingParams - - llm = self._ensure_llm() - joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues)) - prompt = f""" -Rewrite each subtitle line to be punchier and easier to read on short-form vertical video. -Keep the same number of lines and the same approximate length per line. -Style preference: {style or 'natural'}. -Return one rewritten line per row, prefixed with the original index. No commentary. 
- -Input: -{joined} -""".strip() - outputs = llm.generate([prompt], SamplingParams(temperature=0.3, max_tokens=800)) - raw = outputs[0].outputs[0].text - rewritten = self._parse_indexed_lines(raw, expected=len(cues)) - return [ - SubtitleCue( - start_seconds=cue.start_seconds, - end_seconds=cue.end_seconds, - text=rewritten[i] if i < len(rewritten) else cue.text, - ) - for i, cue in enumerate(cues) - ] - - def _qwen_translate( - self, cues: list[SubtitleCue], target_language: str - ) -> list[SubtitleCue]: - from vllm import SamplingParams - - llm = self._ensure_llm() - joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues)) - prompt = f""" -Translate each subtitle line into {target_language}. Preserve line count and order. -Return one translated line per row, prefixed with the original index. No commentary. - -Input: -{joined} -""".strip() - outputs = llm.generate([prompt], SamplingParams(temperature=0.2, max_tokens=1000)) - raw = outputs[0].outputs[0].text - translated = self._parse_indexed_lines(raw, expected=len(cues)) - return [ - SubtitleCue( - start_seconds=cue.start_seconds, - end_seconds=cue.end_seconds, - text=translated[i] if i < len(translated) else cue.text, - ) - for i, cue in enumerate(cues) - ] - - def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]: - lines = [] - for line in raw.splitlines(): - stripped = line.strip() - if not stripped: - continue - match = re.match(r"^\s*\d+[.)\s-]+\s*(.*)$", stripped) - lines.append(match.group(1).strip() if match else stripped) - if len(lines) >= expected: - break - return lines - - def _heuristic_detect( - self, transcript: list[TranscriptSegment], profile: ChannelProfile - ) -> list[ClipCandidate]: - style_terms = { - "funny": ["react", "punchy", "mistake", "surprising"], - "informative": ["important", "practical", "takeaway", "explanation"], - "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"], - "educational": ["question", "answer", "context", 
"takeaway"], - } - preferred_terms = style_terms.get(profile.clip_style.lower(), []) - niche = _effective_niche(profile) - profile_terms = [ - term - for term in f"{niche} {profile.channel_description}".lower().split()[:30] - if len(term) > 2 - ] - scored: list[tuple[float, TranscriptSegment]] = [] - for segment in transcript: - text = segment.text.lower() - score = 45.0 - score += 12 if "?" in segment.text else 0 - score += 8 if any(term in text for term in preferred_terms) else 0 - score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0 - score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0 - score += 5 if any(term in text for term in profile_terms) else 0 - score += min(len(segment.text) / 12, 10) - scored.append((min(score, 100), segment)) - - scored.sort(key=lambda item: item[0], reverse=True) - clips: list[ClipCandidate] = [] - clip_count = min(profile.clip_count, self.settings.max_clips) - for score, segment in scored[:clip_count]: - start = max(0.0, segment.start_seconds - 5.0) - end = start + float(profile.clip_length_seconds) - clips.append( - ClipCandidate( - id=uuid4().hex, - start_seconds=start, - end_seconds=end, - title=self._title_for(segment.text), - reason=self._reason_for(profile, niche), - score=round(score, 1), - subtitle_text=segment.text, - metadata={"model": "heuristic-fallback"}, - ) - ) - return sorted(clips, key=lambda clip: clip.start_seconds) - - def _title_for(self, text: str) -> str: - clean = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'") - words = clean.split() - if len(words) > 1: - title = " ".join(words[:7]) - else: - title = clean[:48] - return title[:72].rstrip() or "Highlight" - - def _reason_for(self, profile: ChannelProfile, niche: str) -> str: - language = profile.primary_language.lower() - style = _localized_profile_word(profile.clip_style, language, "style") - niche_label = _localized_profile_word(niche, language, "niche") - if "thai" in 
language: - return f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}" - if "japanese" in language: - return f"{niche_label} の視聴者に合う {style} スタイルの候補です。" - if "chinese" in language: - return f"符合 {niche_label} 受众期待的 {style} 风格。" - if "korean" in language: - return f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다." - return f"Matches the {profile.clip_style} style for a {niche} audience." - - -def _effective_niche(profile: ChannelProfile) -> str: - if profile.niche.lower() == "other" and profile.niche_custom: - return profile.niche_custom - return profile.niche - - -def _localized_profile_word(value: str, language: str, group: str) -> str: - key = value.lower().replace(" ", "_") - localized = { - "thai": { - "niche": { - "education": "การศึกษา", - "gaming": "เกม", - "podcast": "พอดแคสต์", - "commentary": "เล่า/วิเคราะห์", - "cars": "รถยนต์", - "beauty": "บิวตี้", - "fitness": "ฟิตเนส", - "finance": "การเงิน", - "tech": "เทคโนโลยี", - "lifestyle": "ไลฟ์สไตล์", - "music": "ดนตรี", - }, - "style": { - "informative": "ให้ข้อมูล", - "funny": "ตลก", - "dramatic": "ดราม่า", - "educational": "สอนเข้าใจง่าย", - "commentary": "วิเคราะห์", - }, - }, - "japanese": { - "niche": { - "education": "教育", - "gaming": "ゲーム", - "podcast": "ポッドキャスト", - "commentary": "解説", - "cars": "車", - "beauty": "美容", - "fitness": "フィットネス", - "finance": "金融", - "tech": "テック", - "lifestyle": "ライフスタイル", - "music": "音楽", - }, - "style": { - "informative": "情報性の高い", - "funny": "ユーモアのある", - "dramatic": "ドラマチックな", - "educational": "学びやすい", - "commentary": "解説型の", - }, - }, - "chinese": { - "niche": { - "education": "教育", - "gaming": "游戏", - "podcast": "播客", - "commentary": "解说", - "cars": "汽车", - "beauty": "美妆", - "fitness": "健身", - "finance": "金融", - "tech": "科技", - "lifestyle": "生活方式", - "music": "音乐", - }, - "style": { - "informative": "信息量高", - "funny": "有趣", - "dramatic": "戏剧化", - "educational": "教学型", - "commentary": "评论型", - }, - }, - "korean": { - "niche": { - "education": "교육", - "gaming": "게임", - 
"podcast": "팟캐스트", - "commentary": "해설", - "cars": "자동차", - "beauty": "뷰티", - "fitness": "피트니스", - "finance": "금융", - "tech": "테크", - "lifestyle": "라이프스타일", - "music": "음악", - }, - "style": { - "informative": "정보형", - "funny": "재미있는", - "dramatic": "극적인", - "educational": "교육형", - "commentary": "해설형", - }, - }, - } - for language_key, groups in localized.items(): - if language_key in language: - return groups.get(group, {}).get(key, value) - return value diff --git a/backend/app/services/multimodal.py b/backend/app/services/multimodal.py deleted file mode 100644 index f9b25b0e50e13f9f31e6b2babe1880beb008196b..0000000000000000000000000000000000000000 --- a/backend/app/services/multimodal.py +++ /dev/null @@ -1,200 +0,0 @@ -import os -import subprocess -import tempfile - -from app.core.config import Settings -from app.models.schemas import ClipCandidate - -_DEMO_VISUALS = [ - ("High-energy scene with strong visual contrast and clear subject focus.", 88.0), - ("Close-up with expressive reactions — excellent engagement framing.", 92.0), - ("Dynamic motion sequence; subject well-lit with clean background.", 84.0), - ("Text-overlay-friendly composition with natural colour grading.", 79.0), - ("Wide establishing shot; strong emotional beat in middle frames.", 81.0), -] - - -class QwenVisualAnalyzer: - def __init__(self, settings: Settings) -> None: - self.settings = settings - self._model = None - self._processor = None - - def enrich(self, video_path: str, clips: list[ClipCandidate]) -> list[ClipCandidate]: - if self.settings.demo_mode: - return self._demo_enrich(clips) - try: - return self._qwen_enrich(video_path, clips) - except Exception: - return clips - - # ------------------------------------------------------------------ - # Demo mode - # ------------------------------------------------------------------ - - def _demo_enrich(self, clips: list[ClipCandidate]) -> list[ClipCandidate]: - enriched = [] - for i, clip in enumerate(clips): - note, vscore = _DEMO_VISUALS[i 
% len(_DEMO_VISUALS)] - enriched.append( - clip.model_copy( - update={ - "metadata": { - **clip.metadata, - "visual_model": "demo", - "visual_note": note, - "visual_score": vscore, - } - } - ) - ) - return enriched - - # ------------------------------------------------------------------ - # Production mode — Qwen2-VL on ROCm - # ------------------------------------------------------------------ - - def _load_model(self) -> None: - try: - import torch - from transformers import AutoProcessor, Qwen2VLForConditionalGeneration - except ImportError as exc: - raise RuntimeError("transformers + ROCm PyTorch are required for Qwen2-VL") from exc - - dtype = getattr(torch, self.settings.preferred_torch_dtype, torch.bfloat16) - self._model = Qwen2VLForConditionalGeneration.from_pretrained( - self.settings.qwen_vl_model_id, - torch_dtype=dtype, - device_map="auto", - trust_remote_code=True, - token=self.settings.hf_token or None, - ) - self._processor = AutoProcessor.from_pretrained( - self.settings.qwen_vl_model_id, - trust_remote_code=True, - token=self.settings.hf_token or None, - ) - - def _qwen_enrich(self, video_path: str, clips: list[ClipCandidate]) -> list[ClipCandidate]: - if self._model is None: - self._load_model() - - enriched = [] - for clip in clips: - try: - frames = _sample_frames(video_path, clip.start_seconds, clip.end_seconds, self.settings.ffmpeg_binary) - if not frames: - enriched.append(clip) - continue - note, vscore = self._analyze(frames, clip.title) - enriched.append( - clip.model_copy( - update={ - "metadata": { - **clip.metadata, - "visual_model": self.settings.qwen_vl_model_id, - "visual_note": note, - "visual_score": vscore, - } - } - ) - ) - except Exception: - enriched.append( - clip.model_copy( - update={ - "metadata": { - **clip.metadata, - "visual_model": self.settings.qwen_vl_model_id, - "visual_status": "analysis_failed", - } - } - ) - ) - return enriched - - def _analyze(self, frames: list, title: str) -> tuple[str, float]: - import torch 
- - messages = [ - { - "role": "user", - "content": [ - *[{"type": "image", "image": f} for f in frames], - { - "type": "text", - "text": ( - f'These frames are from a clip titled "{title}". ' - "Describe the visual quality and short-form engagement potential in 1-2 sentences. " - "Then output exactly: SCORE: " - ), - }, - ], - } - ] - text = self._processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - inputs = self._processor(text=[text], images=frames, return_tensors="pt").to(self._model.device) - with torch.no_grad(): - ids = self._model.generate(**inputs, max_new_tokens=140) - reply = self._processor.batch_decode( - ids[:, inputs["input_ids"].shape[1]:], - skip_special_tokens=True, - )[0].strip() - - vscore = 75.0 - for line in reversed(reply.splitlines()): - upper = line.strip().upper() - if upper.startswith("SCORE:"): - try: - vscore = float(upper.split(":", 1)[1].strip()) - except ValueError: - pass - break - - note = reply.split("SCORE:")[0].strip() or reply - return note, min(max(vscore, 0.0), 100.0) - - -# ------------------------------------------------------------------ -# Frame extraction helper -# ------------------------------------------------------------------ - -def _sample_frames(video_path: str, start: float, end: float, ffmpeg: str, n: int = 4) -> list: - try: - from PIL import Image - except ImportError: - return [] - - duration = max(end - start, 1.0) - timestamps = [start + duration * i / max(n - 1, 1) for i in range(n)] - frames = [] - tmp_files = [] - try: - for ts in timestamps: - fd, tmp = tempfile.mkstemp(suffix=".jpg") - os.close(fd) - tmp_files.append(tmp) - result = subprocess.run( - [ - ffmpeg, - "-ss", f"{ts:.3f}", - "-i", video_path, - "-vframes", "1", - "-q:v", "2", - "-y", tmp, - ], - capture_output=True, - timeout=15, - ) - if result.returncode == 0: - try: - frames.append(Image.open(tmp).convert("RGB")) - except Exception: - pass - finally: - for tmp in tmp_files: - try: - os.unlink(tmp) - except 
OSError: - pass - return frames diff --git a/backend/app/services/pipeline.py b/backend/app/services/pipeline.py deleted file mode 100644 index b98ae81a483d8c0c3649c5b8a3ee48c881ff5952..0000000000000000000000000000000000000000 --- a/backend/app/services/pipeline.py +++ /dev/null @@ -1,236 +0,0 @@ -import asyncio -from pathlib import Path - -from app.core.config import Settings -from app.core.timing import TimingLog -from app.models.schemas import ChannelProfile, ClipCandidate -from app.services.clips import ClipGenerator -from app.services.highlight import QwenHighlightDetector -from app.services.multimodal import QwenVisualAnalyzer -from app.services.transcription import WhisperTranscriber -from app.services.video_input import resolve_youtube_url -from app.storage import JobStore - - -class VideoPipeline: - def __init__(self, settings: Settings, store: JobStore) -> None: - self.settings = settings - self.store = store - self.transcriber = WhisperTranscriber(settings) - self.highlight_detector = QwenHighlightDetector(settings) - self.visual_analyzer = QwenVisualAnalyzer(settings) - self.clip_generator = ClipGenerator(settings, store) - - async def process_source( - self, - job_id: str, - source_kind: str, - source_value: str, - profile: ChannelProfile, - ) -> None: - timings = TimingLog() - try: - self.store.update_job( - job_id, - status="running", - progress=0.04, - message="Preparing video input", - current_step="input", - step_index=1, - step_total=6, - ) - with timings.measure("input"): - if source_kind == "youtube": - video_path = await resolve_youtube_url( - source_value, self.store.job_dir(job_id), self.settings - ) - else: - video_path = Path(source_value) - - self.store.update_job( - job_id, - progress=0.18, - message="Transcribing with Whisper Large V3", - current_step="transcription", - step_index=2, - step_total=6, - ) - with timings.measure("transcription"): - transcript = await asyncio.to_thread( - self.transcriber.transcribe, str(video_path), 
profile - ) - self.store.write_json( - job_id, - "transcript.json", - [segment.model_dump(mode="json") for segment in transcript], - ) - self.store.update_job( - job_id, - progress=0.42, - message="Transcript ready", - transcript=transcript, - timings=timings.to_dict(), - ) - - self.store.update_job( - job_id, - progress=0.48, - message="Scoring highlights with Qwen", - current_step="highlight_detection", - step_index=3, - step_total=6, - ) - with timings.measure("highlight_detection"): - clips = await asyncio.to_thread(self.highlight_detector.detect, transcript, profile) - - self.store.update_job( - job_id, - progress=0.62, - message="Checking visual highlights", - current_step="multimodal_analysis", - step_index=4, - step_total=6, - ) - with timings.measure("multimodal_analysis"): - clips = await asyncio.to_thread(self.visual_analyzer.enrich, str(video_path), clips) - - clip_total = len(clips) - self.store.update_job( - job_id, - progress=0.72, - message=f"Preparing to render {clip_total} clips", - current_step="clip_generation", - step_index=5, - step_total=6, - active_clip_index=0, - active_clip_total=clip_total, - ) - - def update_render_progress(index: int, total: int) -> None: - progress = 0.72 + (0.22 * ((index - 1) / max(total, 1))) - self.store.update_job( - job_id, - progress=min(progress, 0.94), - message=f"Rendering clip {index}/{total}", - current_step="clip_generation", - step_index=5, - step_total=6, - active_clip_index=index, - active_clip_total=total, - timings=timings.to_dict(), - ) - - with timings.measure("clip_generation"): - rendered = await asyncio.to_thread( - self.clip_generator.generate, - job_id, - video_path, - clips, - transcript, - profile, - update_render_progress, - ) - - self.store.update_job( - job_id, - progress=0.97, - message="Finalizing clips", - current_step="finalizing", - step_index=6, - step_total=6, - active_clip_index=clip_total, - active_clip_total=clip_total, - timings=timings.to_dict(), - ) - self.store.write_json( - 
job_id, "clips.json", [clip.model_dump(mode="json") for clip in rendered] - ) - self.store.update_job( - job_id, - status="completed", - progress=1, - message="Clips ready", - current_step="completed", - step_index=6, - step_total=6, - active_clip_index=clip_total, - active_clip_total=clip_total, - transcript=transcript, - clips=rendered, - timings=timings.to_dict(), - ) - except Exception as exc: - self.store.update_job( - job_id, - status="failed", - progress=1, - message="Processing failed", - current_step="failed", - error=str(exc), - timings=timings.to_dict(), - ) - - def patch_clip(self, job_id: str, clip_id: str, updates: dict) -> ClipCandidate: - snapshot = self.store.get_job(job_id) - patched: ClipCandidate | None = None - clips: list[ClipCandidate] = [] - for clip in snapshot.clips: - if clip.id == clip_id: - clean_updates = {key: value for key, value in updates.items() if value is not None} - clip = clip.model_copy(update=clean_updates) - if clip.end_seconds <= clip.start_seconds: - clip = clip.model_copy(update={"end_seconds": clip.start_seconds + 1}) - patched = clip - clips.append(clip) - if patched is None: - raise KeyError(clip_id) - self.store.update_job(job_id, clips=clips) - return patched - - def regenerate_clip( - self, - job_id: str, - clip_id: str, - clip_style: str | None = None, - clip_length_seconds: int | None = None, - subtitle_text: str | None = None, - ) -> ClipCandidate: - snapshot = self.store.get_job(job_id) - source_path = self._source_path(job_id) - clips: list[ClipCandidate] = [] - regenerated: ClipCandidate | None = None - for index, clip in enumerate(snapshot.clips, start=1): - if clip.id == clip_id: - profile = snapshot.profile.model_copy( - update={ - key: value - for key, value in { - "clip_style": clip_style, - "clip_length_seconds": clip_length_seconds, - }.items() - if value is not None - } - ) - if clip_length_seconds is not None: - clip = clip.model_copy( - update={"end_seconds": clip.start_seconds + 
clip_length_seconds} - ) - if subtitle_text is not None: - clip = clip.model_copy(update={"subtitle_text": subtitle_text}) - clip = self.clip_generator.render_one( - job_id, source_path, clip, snapshot.transcript, profile, index - ) - clip.metadata["regenerated"] = True - regenerated = clip - clips.append(clip) - if regenerated is None: - raise KeyError(clip_id) - self.store.update_job(job_id, clips=clips) - return regenerated - - def _source_path(self, job_id: str) -> Path: - job_dir = self.store.job_dir(job_id) - matches = sorted(job_dir.glob("source.*")) - if not matches: - raise FileNotFoundError("source video missing") - return matches[0] diff --git a/backend/app/services/subtitles.py b/backend/app/services/subtitles.py deleted file mode 100644 index 9e4127c08191cd52c63fc75c528ddff3b9574477..0000000000000000000000000000000000000000 --- a/backend/app/services/subtitles.py +++ /dev/null @@ -1,151 +0,0 @@ -import re -from pathlib import Path - -from app.models.schemas import TranscriptSegment - - -def seconds_to_srt_time(value: float) -> str: - millis = int(round(value * 1000)) - hours, remainder = divmod(millis, 3_600_000) - minutes, remainder = divmod(remainder, 60_000) - seconds, millis = divmod(remainder, 1000) - return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}" - - -def write_srt( - path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment] -) -> list[dict]: - cues: list[dict] = [] - rows: list[str] = [] - index = 1 - for segment in segments: - if segment.end_seconds < clip_start or segment.start_seconds > clip_end: - continue - start = max(0.0, segment.start_seconds - clip_start) - end = min(clip_end - clip_start, segment.end_seconds - clip_start) - for cue in split_timed_caption(segment.text, start, max(end, start + 1.2)): - rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"])) - cues.append(cue) - index += 1 - if not rows: - cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}] - rows 
= _srt_row(1, 0.0, 3.0, "") - path.write_text("\n".join(rows), encoding="utf-8") - return cues - - -def write_single_caption_srt(path: Path, duration: float, text: str) -> list[dict]: - safe_duration = max(duration, 1.0) - cues = split_timed_caption(text, 0.0, safe_duration) - rows: list[str] = [] - for index, cue in enumerate(cues, start=1): - rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"])) - if not rows: - cues = [{"start_seconds": 0.0, "end_seconds": min(safe_duration, 3.0), "text": ""}] - rows = _srt_row(1, cues[0]["start_seconds"], cues[0]["end_seconds"], "") - path.write_text("\n".join(rows), encoding="utf-8") - return cues - - -def write_srt_from_cues(path: Path, cues: list) -> list[dict]: - """Write SRT using user-supplied per-cue timing (preferred over auto-distribution). - - Accepts list of objects with .start_seconds / .end_seconds / .text attributes - (Pydantic SubtitleCue) or dicts with the same keys. - """ - rows: list[str] = [] - out_cues: list[dict] = [] - index = 1 - for cue in cues: - start = float(getattr(cue, "start_seconds", None) or cue.get("start_seconds", 0)) - end = float(getattr(cue, "end_seconds", None) or cue.get("end_seconds", 0)) - text = str(getattr(cue, "text", None) or cue.get("text", "")) - if end <= start: - end = start + 1.0 - clean_text = text.strip() - if not clean_text: - continue - rows.extend(_srt_row(index, start, end, clean_text)) - out_cues.append({"start_seconds": round(start, 3), "end_seconds": round(end, 3), "text": clean_text}) - index += 1 - if not rows: - out_cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}] - rows = _srt_row(1, 0.0, 3.0, "") - path.write_text("\n".join(rows), encoding="utf-8") - return out_cues - - -def split_timed_caption(text: str, start: float, end: float) -> list[dict]: - phrases = split_caption_text(text) - if not phrases: - return [] - - total_duration = max(end - start, 1.2) - max_cues = max(1, int(total_duration / 1.2)) - if len(phrases) > 
max_cues: - phrases = _merge_phrases(phrases, max_cues) - - cue_duration = min(4.0, max(1.2, total_duration / len(phrases))) - cues: list[dict] = [] - cursor = start - for index, phrase in enumerate(phrases): - remaining = len(phrases) - index - max_end = end - ((remaining - 1) * 1.2) - cue_end = min(max_end, cursor + cue_duration) - cue_end = max(cue_end, cursor + 1.2) - if index == len(phrases) - 1: - cue_end = end - cues.append( - { - "start_seconds": round(cursor, 3), - "end_seconds": round(max(cue_end, cursor + 0.8), 3), - "text": phrase, - } - ) - cursor = cue_end - return cues - - -def split_caption_text(text: str, max_chars: int = 42, max_words: int = 7) -> list[str]: - clean = re.sub(r"\s+", " ", text.strip()) - if not clean: - return [] - - words = clean.split() - if len(words) <= 1: - return [clean[index : index + max_chars] for index in range(0, len(clean), max_chars)] - - phrases: list[str] = [] - current: list[str] = [] - for word in words: - candidate = " ".join([*current, word]).strip() - punctuation_break = bool(current and re.search(r"[,.!?;:]$", current[-1])) - if current and (len(candidate) > max_chars or len(current) >= max_words or punctuation_break): - phrases.append(" ".join(current)) - current = [word] - else: - current.append(word) - if current: - phrases.append(" ".join(current)) - return phrases - - -def _merge_phrases(phrases: list[str], target_count: int) -> list[str]: - if target_count <= 1: - return [" ".join(phrases)] - merged: list[str] = [] - bucket_size = len(phrases) / target_count - for index in range(target_count): - start = round(index * bucket_size) - end = round((index + 1) * bucket_size) - merged.append(" ".join(phrases[start:end]).strip()) - return [phrase for phrase in merged if phrase] - - -def _srt_row(index: int, start: float, end: float, text: str) -> list[str]: - return [ - str(index), - f"{seconds_to_srt_time(start)} --> {seconds_to_srt_time(end)}", - text.strip(), - "", - ] diff --git 
a/backend/app/services/transcription.py b/backend/app/services/transcription.py deleted file mode 100644 index 678240592a9a2d6b970331066cd5a878964cb73e..0000000000000000000000000000000000000000 --- a/backend/app/services/transcription.py +++ /dev/null @@ -1,366 +0,0 @@ -from pathlib import Path -from uuid import uuid4 - -from app.core.config import Settings -from app.models.schemas import ChannelProfile, SubtitleCue, TranscriptSegment -from app.utils.rocm import torch_device_index - - -class WhisperTranscriber: - def __init__(self, settings: Settings) -> None: - self.settings = settings - self._pipeline = None - - def transcribe(self, video_path: str, profile: ChannelProfile) -> list[TranscriptSegment]: - if self.settings.demo_mode: - return self._demo_transcript(profile) - - try: - from transformers import pipeline - except Exception as exc: - raise RuntimeError("transformers is required for Whisper transcription") from exc - - if self._pipeline is None: - self._pipeline = pipeline( - task="automatic-speech-recognition", - model=self.settings.whisper_model_id, - device=torch_device_index(), - token=self.settings.hf_token, - chunk_length_s=30, - return_timestamps=True, - ) - - generate_kwargs = {"task": "transcribe"} - if profile.primary_language and profile.primary_language.lower() != "auto": - generate_kwargs["language"] = profile.primary_language.lower() - - result = self._pipeline(str(video_path), generate_kwargs=generate_kwargs) - chunks = result.get("chunks") or [] - if not chunks: - text = result.get("text", "").strip() - return [ - TranscriptSegment( - id=uuid4().hex, - start_seconds=0, - end_seconds=max(profile.clip_length_seconds, 15), - text=text, - language=profile.primary_language, - ) - ] - - segments: list[TranscriptSegment] = [] - for chunk in chunks: - timestamp = chunk.get("timestamp") or (0, 0) - start = float(timestamp[0] or 0) - end = float(timestamp[1] or start + 5) - text = (chunk.get("text") or "").strip() - if text: - segments.append( - 
TranscriptSegment( - id=uuid4().hex, - start_seconds=start, - end_seconds=max(end, start + 1), - text=text, - language=profile.primary_language, - ) - ) - return segments - - def align_words( - self, - video_path: str | Path, - text: str, - clip_start: float, - clip_end: float, - ) -> list[SubtitleCue]: - """Estimate per-word/per-phrase timing within [clip_start, clip_end]. - - Demo mode: split the text into chunks of ~3 words, distribute timings - across the clip duration. Production: run Whisper word-level timestamps. - - Returns SubtitleCues with timing relative to clip_start. - """ - if self.settings.demo_mode or not text.strip(): - return self._demo_align_words(text, clip_start, clip_end) - try: - return self._whisper_align_words(video_path, text, clip_start, clip_end) - except Exception: - return self._demo_align_words(text, clip_start, clip_end) - - def _demo_align_words( - self, text: str, clip_start: float, clip_end: float - ) -> list[SubtitleCue]: - clean = " ".join(text.split()) - if not clean: - return [SubtitleCue(start_seconds=0.0, end_seconds=2.0, text="")] - words = clean.split() - # Group into ~3 word chunks (typical for short-form caption pacing) - chunk_size = max(2, min(4, max(1, len(words) // 6))) - chunks: list[str] = [] - for i in range(0, len(words), chunk_size): - chunks.append(" ".join(words[i : i + chunk_size])) - duration = max(0.5, clip_end - clip_start) - per = duration / len(chunks) - cues: list[SubtitleCue] = [] - for i, chunk in enumerate(chunks): - cue_start = round(i * per, 3) - cue_end = round((i + 1) * per, 3) - cues.append( - SubtitleCue( - start_seconds=cue_start, - end_seconds=max(cue_end, cue_start + 0.4), - text=chunk, - ) - ) - return cues - - def _whisper_align_words( - self, video_path: str | Path, text: str, clip_start: float, clip_end: float - ) -> list[SubtitleCue]: - try: - from transformers import pipeline - except Exception as exc: - raise RuntimeError("transformers is required for word-level timestamps") from exc 
- - if self._pipeline is None: - self._pipeline = pipeline( - task="automatic-speech-recognition", - model=self.settings.whisper_model_id, - device=torch_device_index(), - token=self.settings.hf_token, - chunk_length_s=30, - return_timestamps="word", - ) - - result = self._pipeline( - str(video_path), - generate_kwargs={"task": "transcribe"}, - return_timestamps="word", - ) - chunks = result.get("chunks") or [] - # Filter to chunks inside [clip_start, clip_end] and convert to relative time - cues: list[SubtitleCue] = [] - buffer_words: list[tuple[str, float, float]] = [] - for chunk in chunks: - ts = chunk.get("timestamp") or (0, 0) - start = float(ts[0] or 0) - end = float(ts[1] or start + 0.3) - word = (chunk.get("text") or "").strip() - if not word: - continue - if end < clip_start or start > clip_end: - continue - buffer_words.append( - (word, max(0.0, start - clip_start), min(clip_end - clip_start, end - clip_start)) - ) - - # Group into ~3 word phrases - chunk_size = 3 - for i in range(0, len(buffer_words), chunk_size): - group = buffer_words[i : i + chunk_size] - text_chunk = " ".join(w for w, _, _ in group) - cue_start = group[0][1] - cue_end = group[-1][2] - cues.append( - SubtitleCue( - start_seconds=round(cue_start, 3), - end_seconds=round(max(cue_end, cue_start + 0.4), 3), - text=text_chunk, - ) - ) - return cues if cues else self._demo_align_words(text, clip_start, clip_end) - - def _demo_transcript(self, profile: ChannelProfile) -> list[TranscriptSegment]: - style = profile.clip_style.lower() - language = profile.primary_language.lower() - niche_value = ( - profile.niche_custom - if profile.niche.lower() == "other" and profile.niche_custom - else profile.niche - ) - niche = niche_value.lower() - creator_context = ( - profile.channel_description - or "The creator wants clips that feel useful and easy to share." 
- ) - lines = _demo_lines( - language, - _localized_profile_word(niche, language, "niche"), - _localized_profile_word(style, language, "style"), - creator_context, - ) - segments: list[TranscriptSegment] = [] - cursor = 0.0 - for line in lines: - end = cursor + 15.0 - segments.append( - TranscriptSegment( - id=uuid4().hex, - start_seconds=cursor, - end_seconds=end, - text=line, - language=profile.primary_language, - ) - ) - cursor = end - return segments - - -def _demo_lines(language: str, niche: str, style: str, creator_context: str) -> list[str]: - if "thai" in language: - return [ - "ช่วงเปิดนี้วางปัญหาหลักของครีเอเตอร์ เวลาวิดีโอยาวซ่อนช่วงที่ดีที่สุดไว้", - "นี่คือความผิดพลาดที่หลายทีมทำ คือเลือกคลิปจากยอดวิวอย่างเดียว", - "คำถามสำคัญคือ ช่วงไหนที่จะทำให้คนหยุดเลื่อนหน้าจอได้ทันที", - f"สำหรับช่องแนว {niche} คำตอบจะเปลี่ยน เพราะผู้ชมคาดหวังจังหวะที่ {style}", - f"บริบทของช่องคือ {creator_context}", - "ช่วงนี้อธิบายได้ชัดที่สุด และมีภาพเปรียบเทียบก่อนกับหลังที่แรง", - "จากนั้นแขกรับเชิญตอบสนองด้วยประโยคสั้นที่เหมาะมากสำหรับ hook", - "ตรงนี้มีข้อคิดที่เอาไปใช้ได้ทันที และยืนเป็นคลิปสั้นได้ด้วยตัวเอง", - "ช่วงท้ายสรุปไอเดียด้วยประโยคชัด ๆ ที่ทำซับได้ง่าย", - ] - if "japanese" in language: - return [ - "この冒頭では、長い動画に最高の瞬間が埋もれてしまう問題を示しています。", - "多くのチームが再生数だけでクリップを選ぶという意外なミスをしています。", - "大事な問いは、この瞬間が今すぐスクロールを止めさせるかどうかです。", - f"{niche} チャンネルでは、視聴者が {style} なテンポを期待するため答えが変わります。", - f"チャンネルの文脈はこうです。{creator_context}", - "この部分は説明が最も明確で、ビフォーアフターの対比も強いです。", - "その後、ゲストが短いフックとして使いやすい一言で反応します。", - "ここには単独の短尺クリップとして成立する実用的な学びがあります。", - "最後の部分は字幕にしやすい明確な一言でアイデアをまとめます。", - ] - if "chinese" in language: - return [ - "这个开头点出了创作者常遇到的问题:长视频里藏着最好的瞬间。", - "很多团队都会犯一个意外错误,只根据播放量来选择剪辑片段。", - "关键问题很简单:哪个瞬间能让观众立刻停下滑动?", - f"对于 {niche} 频道,答案会不同,因为观众期待 {style} 的节奏。", - f"频道背景是:{creator_context}", - "这一段解释最清楚,并且有很强的前后对比。", - "接着嘉宾给出一句有冲击力的回应,很适合作为短视频 hook。", - "这里有一个实用结论,足够独立成为一个短视频片段。", - "最后一段用一句清晰的话收束观点,也很适合做字幕。", - ] - if "korean" in language: - return [ - "이 오프닝은 긴 영상 속 좋은 순간이 묻히는 문제를 
보여줍니다.", - "많은 팀이 조회수만 보고 클립을 고르는 의외의 실수를 합니다.", - "핵심 질문은 간단합니다. 어떤 순간이 시청자의 스크롤을 멈추게 할까요?", - f"{niche} 채널에서는 시청자가 {style} 리듬을 기대하기 때문에 답이 달라집니다.", - f"채널 맥락은 다음과 같습니다. {creator_context}", - "이 부분은 설명이 가장 명확하고 전후 대비도 강합니다.", - "그다음 게스트가 짧은 훅으로 쓰기 좋은 강한 한마디를 합니다.", - "여기에는 단독 숏폼 클립으로도 충분한 실용적인 takeaway가 있습니다.", - "마지막 부분은 자막으로 만들기 쉬운 명확한 문장으로 아이디어를 정리합니다.", - ] - return [ - "This opening sets up the main problem creators face when a long video hides the best moments.", - "Here is the surprising mistake most teams make when they choose clips only by view count.", - "The important question is simple: which moment would make someone stop scrolling right now?", - f"For a {niche} channel, the answer changes because the audience expects a {style} rhythm.", - f"The channel context is simple: {creator_context}", - "This section has the clearest explanation and a strong before-and-after contrast.", - "Then the guest reacts with a punchy line that works well as a short hook.", - "A practical takeaway lands here, with enough context to stand alone as a sixty second clip.", - "The final segment wraps the idea with a direct callout that is easy to subtitle.", - ] - - -def _localized_profile_word(value: str, language: str, group: str) -> str: - key = value.lower().replace(" ", "_") - localized = { - "thai": { - "niche": { - "education": "การศึกษา", - "gaming": "เกม", - "podcast": "พอดแคสต์", - "commentary": "เล่า/วิเคราะห์", - "cars": "รถยนต์", - "beauty": "บิวตี้", - "fitness": "ฟิตเนส", - "finance": "การเงิน", - "tech": "เทคโนโลยี", - "lifestyle": "ไลฟ์สไตล์", - "music": "ดนตรี", - }, - "style": { - "informative": "ให้ข้อมูล", - "funny": "ตลก", - "dramatic": "ดราม่า", - "educational": "สอนเข้าใจง่าย", - "commentary": "วิเคราะห์", - }, - }, - "japanese": { - "niche": { - "education": "教育", - "gaming": "ゲーム", - "podcast": "ポッドキャスト", - "commentary": "解説", - "cars": "車", - "beauty": "美容", - "fitness": "フィットネス", - "finance": "金融", - "tech": "テック", - "lifestyle": "ライフスタイル", - 
"music": "音楽", - }, - "style": { - "informative": "情報性の高い", - "funny": "ユーモアのある", - "dramatic": "ドラマチックな", - "educational": "学びやすい", - "commentary": "解説型の", - }, - }, - "chinese": { - "niche": { - "education": "教育", - "gaming": "游戏", - "podcast": "播客", - "commentary": "解说", - "cars": "汽车", - "beauty": "美妆", - "fitness": "健身", - "finance": "金融", - "tech": "科技", - "lifestyle": "生活方式", - "music": "音乐", - }, - "style": { - "informative": "信息量高", - "funny": "有趣", - "dramatic": "戏剧化", - "educational": "教学型", - "commentary": "评论型", - }, - }, - "korean": { - "niche": { - "education": "교육", - "gaming": "게임", - "podcast": "팟캐스트", - "commentary": "해설", - "cars": "자동차", - "beauty": "뷰티", - "fitness": "피트니스", - "finance": "금융", - "tech": "테크", - "lifestyle": "라이프스타일", - "music": "음악", - }, - "style": { - "informative": "정보형", - "funny": "재미있는", - "dramatic": "극적인", - "educational": "교육형", - "commentary": "해설형", - }, - }, - } - for language_key, groups in localized.items(): - if language_key in language: - return groups.get(group, {}).get(key, value) - return value diff --git a/backend/app/services/video_input.py b/backend/app/services/video_input.py deleted file mode 100644 index 93faf02278219af19636f5ae01a13e3ade8966ae..0000000000000000000000000000000000000000 --- a/backend/app/services/video_input.py +++ /dev/null @@ -1,80 +0,0 @@ -import asyncio -import shutil -import subprocess -from pathlib import Path - -from fastapi import UploadFile - -from app.core.config import Settings - - -async def save_upload(upload: UploadFile, job_dir: Path) -> Path: - suffix = Path(upload.filename or "upload.mp4").suffix or ".mp4" - destination = job_dir / f"source{suffix.lower()}" - with destination.open("wb") as handle: - while chunk := await upload.read(1024 * 1024): - handle.write(chunk) - return destination - - -async def resolve_youtube_url(url: str, job_dir: Path, settings: Settings) -> Path: - if settings.demo_mode: - return await asyncio.to_thread(create_demo_video, job_dir, settings) 
- - try: - import yt_dlp - except Exception as exc: - raise RuntimeError("yt-dlp is required for YouTube ingestion") from exc - - output_template = str(job_dir / "source.%(ext)s") - ydl_opts = { - "outtmpl": output_template, - "format": "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/best", - "merge_output_format": "mp4", - "quiet": True, - "noprogress": True, - } - - def download() -> Path: - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - ydl.download([url]) - matches = sorted(job_dir.glob("source.*")) - if not matches: - raise RuntimeError("yt-dlp finished without producing a video") - return matches[0] - - return await asyncio.to_thread(download) - - -def create_demo_video(job_dir: Path, settings: Settings) -> Path: - destination = job_dir / "source.mp4" - ffmpeg = shutil.which(settings.ffmpeg_binary) - if not ffmpeg: - destination.write_bytes(b"") - return destination - - command = [ - ffmpeg, - "-y", - "-f", - "lavfi", - "-i", - "testsrc2=size=1280x720:rate=30:duration=120", - "-f", - "lavfi", - "-i", - "sine=frequency=660:sample_rate=48000:duration=120", - "-shortest", - "-c:v", - "libx264", - "-pix_fmt", - "yuv420p", - "-c:a", - "aac", - str(destination), - ] - try: - subprocess.run(command, check=True, capture_output=True, text=True, timeout=45) - except Exception: - destination.write_bytes(b"") - return destination diff --git a/backend/app/storage.py b/backend/app/storage.py deleted file mode 100644 index 4e4ad1de7644a6da25d2ce4b85bd105ca9016fef..0000000000000000000000000000000000000000 --- a/backend/app/storage.py +++ /dev/null @@ -1,58 +0,0 @@ -import json -from pathlib import Path -from uuid import uuid4 - -from app.core.config import Settings -from app.models.schemas import ChannelProfile, JobSnapshot, utc_now - - -class JobStore: - def __init__(self, settings: Settings) -> None: - self.settings = settings - self.root = settings.storage_dir - self.jobs_root = self.root / "jobs" - self.jobs_root.mkdir(parents=True, exist_ok=True) - - def create_job(self, profile: 
ChannelProfile, source: dict) -> JobSnapshot: - job_id = uuid4().hex - job_dir = self.job_dir(job_id) - job_dir.mkdir(parents=True, exist_ok=True) - snapshot = JobSnapshot( - id=job_id, - status="queued", - progress=0, - message="Queued", - source=source, - profile=profile, - ) - self.save_job(snapshot) - return snapshot - - def job_dir(self, job_id: str) -> Path: - return self.jobs_root / job_id - - def media_url(self, job_id: str, filename: str) -> str: - return f"/media/jobs/{job_id}/{filename}" - - def save_job(self, snapshot: JobSnapshot) -> JobSnapshot: - snapshot.updated_at = utc_now() - path = self.job_dir(snapshot.id) / "job.json" - path.write_text(snapshot.model_dump_json(indent=2), encoding="utf-8") - return snapshot - - def get_job(self, job_id: str) -> JobSnapshot: - path = self.job_dir(job_id) / "job.json" - if not path.exists(): - raise FileNotFoundError(job_id) - data = json.loads(path.read_text(encoding="utf-8")) - return JobSnapshot.model_validate(data) - - def update_job(self, job_id: str, **updates) -> JobSnapshot: - snapshot = self.get_job(job_id) - updated = snapshot.model_copy(update=updates) - return self.save_job(updated) - - def write_json(self, job_id: str, filename: str, payload: object) -> Path: - path = self.job_dir(job_id) / filename - path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") - return path diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py deleted file mode 100644 index db8f6ca2cdf5d33c46cd6197f82dea6d5d547579..0000000000000000000000000000000000000000 --- a/backend/app/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Runtime helpers.""" diff --git a/backend/app/utils/rocm.py b/backend/app/utils/rocm.py deleted file mode 100644 index eaafafeb4388d8c79c4504500341c86597e5e154..0000000000000000000000000000000000000000 --- a/backend/app/utils/rocm.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Any - - -def detect_accelerator() -> dict[str, Any]: - try: - import 
torch - except Exception as exc: - return { - "torch_available": False, - "cuda_api_available": False, - "rocm_hip_version": None, - "device_name": None, - "error": str(exc), - } - - cuda_available = bool(torch.cuda.is_available()) - device_name = torch.cuda.get_device_name(0) if cuda_available else None - return { - "torch_available": True, - "cuda_api_available": cuda_available, - "rocm_hip_version": getattr(torch.version, "hip", None), - "cuda_version": getattr(torch.version, "cuda", None), - "device_name": device_name, - "device_count": torch.cuda.device_count() if cuda_available else 0, - } - - -def torch_device_index() -> int: - try: - import torch - except Exception: - return -1 - return 0 if torch.cuda.is_available() else -1 diff --git a/backend/app/workers/__init__.py b/backend/app/workers/__init__.py deleted file mode 100644 index 75319bfe62ffe1a54d28364e97272615cdd1dbe4..0000000000000000000000000000000000000000 --- a/backend/app/workers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Optional async workers.""" diff --git a/backend/app/workers/celery_app.py b/backend/app/workers/celery_app.py deleted file mode 100644 index 26edb61ef634a0f2ac22428d1a392a0e3aa8b096..0000000000000000000000000000000000000000 --- a/backend/app/workers/celery_app.py +++ /dev/null @@ -1,15 +0,0 @@ -from celery import Celery - -from app.core.config import get_settings - -settings = get_settings() - -celery_app = Celery("ai_clip_studio", broker=settings.redis_url, backend=settings.redis_url) -celery_app.conf.task_serializer = "json" -celery_app.conf.result_serializer = "json" -celery_app.conf.accept_content = ["json"] - - -@celery_app.task(name="pipeline.process_job") -def process_job(job_id: str) -> str: - return f"Queued job {job_id}. FastAPI background tasks are active by default." 
diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000000000000000000000000000000000000..66281a023ab373c08df66493fc05eabe71687c1f --- /dev/null +++ b/backend/main.py @@ -0,0 +1,466 @@ +"""ElevenClip AI — FastAPI Backend. + +Endpoints: + POST /api/video-info — get YouTube metadata (no download) + POST /api/process — full pipeline (download/upload → clips) + WS /ws/progress/{session} — real-time pipeline progress + GET /api/clips/{session} — list generated clips + PATCH /api/clips/{session}/{index}/subtitles — update subtitle event + PATCH /api/clips/{session}/{index}/style — apply global style override + POST /api/clips/{session}/{index}/render — burn-in subtitles → download + GET /downloads/{session}/{filename} — serve output files +""" +import asyncio +import json +import os +import uuid +from pathlib import Path +from typing import Optional + +from fastapi import FastAPI, UploadFile, File, Form, Header, WebSocket, WebSocketDisconnect, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from pydantic import BaseModel +from loguru import logger + +from src.gpu.rocm_utils import get_device, log_gpu_status +from src.gpu.vllm_manager import ensure_vllm_running, vllm_stop, vllm_status +from src.ingestion.youtube import download_video_async, get_video_info +from src.transcription.whisper import transcribe_async, extract_audio +from src.analysis.scene_detector import detect_scenes, sample_frames +from src.analysis.vision import analyze_scenes_batch_async +from src.analysis.highlight_scorer import score_scenes, select_top_clips +from src.processing.clip_extractor import extract_all_clips_async, burn_subtitles +from src.processing.subtitle import generate_subtitles, update_subtitle_event, apply_global_style_override +from src.processing.high_retention import apply_hre + +app = FastAPI(title="ElevenClip AI", version="1.0.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], 
+ allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +WORK_DIR = Path(os.getenv("WORK_DIR", "/tmp/elevnclip")) +WORK_DIR.mkdir(parents=True, exist_ok=True) + +DEMO_ACCESS_CODE = os.getenv("DEMO_ACCESS_CODE", "").strip() +MAX_CONCURRENT_JOBS = int(os.getenv("MAX_CONCURRENT_JOBS", "1")) +MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "500")) + +app.mount("/downloads", StaticFiles(directory=str(WORK_DIR)), name="downloads") + +# In-memory session store + WebSocket registry +sessions: dict[str, dict] = {} +ws_connections: dict[str, WebSocket] = {} +ws_queues: dict[str, list[dict]] = {} # buffer progress messages until WS connects +active_jobs: set[str] = set() + + +def _require_access(x_demo_key: Optional[str]) -> None: + """Optional public-demo guard for expensive GPU endpoints.""" + if DEMO_ACCESS_CODE and (x_demo_key or "").strip() != DEMO_ACCESS_CODE: + raise HTTPException(403, "Access code required for generation") + + +# ─── Startup ────────────────────────────────────────────────────────────── + +@app.on_event("startup") +async def startup(): + log_gpu_status() + # Pre-populate demo session so /editor?session=demo always works + sessions["demo"] = {"status": "done", "clips": _build_demo_clips()} + + +def _build_demo_clips() -> list[dict]: + return [ + { + "index": 1, "start": 0.0, "end": 45.0, "duration": 45.0, "score": 0.91, + "clip_path": None, "final_path": None, "ass_path": None, + "download_url": None, "raw_url": None, + "highlight_reason": "High-energy moment with peak audience reaction", + "vision_analysis": {"excitement_score": 0.92, "tiktok_potential": 0.89, "emotion": "excited", "action_type": "gaming"}, + }, + { + "index": 2, "start": 90.0, "end": 150.0, "duration": 60.0, "score": 0.83, + "clip_path": None, "final_path": None, "ass_path": None, + "download_url": None, "raw_url": None, + "highlight_reason": "Funny reaction — peak humor level detected", + "vision_analysis": {"excitement_score": 0.78, "tiktok_potential": 0.85, 
"emotion": "funny", "action_type": "reaction"}, + }, + { + "index": 3, "start": 210.0, "end": 270.0, "duration": 60.0, "score": 0.76, + "clip_path": None, "final_path": None, "ass_path": None, + "download_url": None, "raw_url": None, + "highlight_reason": "Educational highlight with strong engagement signal", + "vision_analysis": {"excitement_score": 0.70, "tiktok_potential": 0.80, "emotion": "happy", "action_type": "tutorial"}, + }, + ] + + +# ─── WebSocket Progress ──────────────────────────────────────────────────── + +@app.websocket("/ws/progress/{session_id}") +async def ws_progress(websocket: WebSocket, session_id: str): + await websocket.accept() + ws_connections[session_id] = websocket + + # Flush messages that were sent before the WS connected + for msg in ws_queues.pop(session_id, []): + try: + await websocket.send_json(msg) + except Exception: + break + + try: + while True: + await asyncio.sleep(30) # keep-alive + except WebSocketDisconnect: + ws_connections.pop(session_id, None) + + +async def send_progress(session_id: str, stage: str, pct: int, message: str = ""): + payload = {"stage": stage, "pct": pct, "message": message} + sessions.setdefault(session_id, {})["last_progress"] = payload + + ws = ws_connections.get(session_id) + if ws: + try: + await ws.send_json(payload) + return + except Exception: + ws_connections.pop(session_id, None) + + # WS not yet connected — buffer for flush on connect + ws_queues.setdefault(session_id, []).append(payload) + + +# ─── Models ─────────────────────────────────────────────────────────────── + +class VideoInfoRequest(BaseModel): + url: str + +DEMO_VIDEO_DIR = Path("/root/ElevenClip-AI/demo_videos") +_DEMO_CANDIDATES = ["demo1.mp4", "demo2.mp4", "demo.mp4"] + +def _get_demo_video() -> Path | None: + import random + available = [DEMO_VIDEO_DIR / f for f in _DEMO_CANDIDATES if (DEMO_VIDEO_DIR / f).exists()] + return random.choice(available) if available else None + +class ProcessSettings(BaseModel): + youtube_url: 
# Strong references to in-flight pipeline tasks. The event loop only keeps weak
# references, so an otherwise unreferenced task may be garbage-collected
# before it finishes.
_pipeline_tasks: set = set()


@app.post("/api/process")
async def process(
    settings_json: str = Form(...),
    file: Optional[UploadFile] = File(None),
    x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key"),
):
    """Main pipeline endpoint. Returns session_id immediately; progress via WebSocket.

    Args:
        settings_json: JSON object whose keys populate ``ProcessSettings``.
        file: Optional uploaded video; read fully before the response is sent.
        x_demo_key: Value of the ``X-Demo-Key`` header, passed to
            ``_require_access`` (presumably an access check; confirm there).

    Returns:
        ``{"session_id": <uuid4 string>}``.

    Raises:
        HTTPException: 429 when ``MAX_CONCURRENT_JOBS`` jobs are active,
            422 when ``settings_json`` is not a valid settings object,
            413 when the upload exceeds ``MAX_UPLOAD_MB``.
    """
    _require_access(x_demo_key)
    if len(active_jobs) >= MAX_CONCURRENT_JOBS:
        raise HTTPException(429, "GPU is busy. Please try again in a few minutes.")

    # Bad client input must be a 4xx, not an unhandled 500. JSON decode errors
    # and pydantic validation errors are ValueErrors; a non-object payload
    # (e.g. a JSON list) makes the ** expansion raise TypeError.
    try:
        settings = ProcessSettings(**json.loads(settings_json))
    except (ValueError, TypeError) as exc:
        raise HTTPException(422, f"Invalid settings_json: {exc}") from exc

    session_id = str(uuid.uuid4())

    # Reserve the job slot before the first await so concurrent requests cannot
    # all pass the capacity check while an upload is still being read.
    active_jobs.add(session_id)
    try:
        # Read file bytes NOW — UploadFile becomes invalid once the response is sent
        file_bytes: Optional[bytes] = None
        file_name: Optional[str] = None
        if file:
            max_bytes = MAX_UPLOAD_MB * 1024 * 1024
            too_large = f"File too large. Max upload size is {MAX_UPLOAD_MB} MB."
            # Reject on the declared size when available, before pulling the
            # whole upload into memory.
            declared_size = getattr(file, "size", None)
            if declared_size is not None and declared_size > max_bytes:
                raise HTTPException(413, too_large)
            file_bytes = await file.read()
            file_name = file.filename or "upload.mp4"
            if len(file_bytes) > max_bytes:
                raise HTTPException(413, too_large)

        # Create on-disk and in-memory session state only after validation, so
        # a rejected request leaves nothing behind.
        session_dir = WORK_DIR / session_id
        session_dir.mkdir(parents=True, exist_ok=True)
        sessions[session_id] = {"status": "starting", "clips": []}

        task = asyncio.create_task(
            _run_pipeline(session_id, session_dir, settings, file_bytes, file_name)
        )
    except BaseException:
        # The pipeline never started, so its own cleanup will not release the slot.
        active_jobs.discard(session_id)
        raise

    _pipeline_tasks.add(task)
    task.add_done_callback(_pipeline_tasks.discard)
    return {"session_id": session_id}
+ await send_progress(session_id, "scenes", 35, "Scene detection + Whisper transcription (parallel on AMD ROCm)...") + device = get_device() + + scenes_future = loop.run_in_executor(None, lambda: detect_scenes(video_path)) + transcript_task = transcribe_async( + audio_path, + clip_language=settings.clip_language, + subtitle_language=settings.subtitle_language, + device=device, + ) + scenes, transcript = await asyncio.gather(scenes_future, transcript_task) + + await send_progress( + session_id, "transcribe", 58, + f"Whisper: {len(transcript.get('segments', []))} segments | SceneDetect: {len(scenes)} scenes" + ) + + # Frame sampling (after scenes list is known) + scenes_with_frames = await loop.run_in_executor( + None, lambda: sample_frames(video_path, scenes, frames_dir) + ) + + # ── 5. Qwen2.5-VL multimodal analysis (concurrent requests to vLLM) ─ + n_scenes = len(scenes_with_frames) + await send_progress(session_id, "vision", 58, "Ensuring AI model is running...") + await loop.run_in_executor( + None, + lambda: ensure_vllm_running( + progress_cb=lambda msg: asyncio.run_coroutine_threadsafe( + send_progress(session_id, "vision", 59, msg), loop + ) + ), + ) + await send_progress(session_id, "vision", 60, f"Qwen2.5-VL analyzing {n_scenes} scenes (vision + audio + text fusion)...") + scenes_analyzed = await analyze_scenes_batch_async( + scenes_with_frames, + transcript.get("segments", []), + channel_description=settings.channel_description, + clip_style=settings.clip_style, + ) + await send_progress(session_id, "vision", 76, f"Multimodal analysis complete: {n_scenes} scenes scored") + + # ── 6. Multi-signal scoring ───────────────────────────────────── + await send_progress(session_id, "scoring", 77, "Scoring: 0.40×vision + 0.35×audio_energy + 0.25×text_keywords") + scored = score_scenes(scenes_analyzed, audio_path, settings.clip_style, settings.target_duration) + selected = select_top_clips(scored, settings.clip_count, settings.target_duration) + + # ── 7. 
Extract clips (AMD AMF hardware encoder) ───────────────── + await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...") + clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=settings.aspect_mode) + + # ── 8. Subtitles / HRE (all clips in parallel) ───────────────── + await send_progress(session_id, "subtitles", 86, "Generating subtitles (parallel)...") + + subtitle_tasks = [] + final_clips = [] + + for clip in clips: + if not clip.get("clip_path"): + continue + clip_path = Path(clip["clip_path"]) + i = clip["clip_index"] + + clip_transcript = { + **transcript, + "segments": [ + s for s in transcript.get("segments", []) + if s["start"] < clip["end"] and s["end"] > clip["start"] + ], + } + + ass_path = session_dir / f"{session_id}_clip_{i:02d}.ass" + final_path = session_dir / f"{session_id}_clip_{i:02d}_final.mp4" + + if settings.mode == "hre": + subtitle_tasks.append(loop.run_in_executor( + None, + lambda cp=clip_path, cd=clip, tr=clip_transcript, fp=final_path: + apply_hre(cp, cd, tr, fp) + )) + else: + def _gen_and_burn(cp=clip_path, ap=ass_path, tr=clip_transcript, cs=clip["start"], fp=final_path): + generate_subtitles(tr, ap, settings.style_config, clip_start_offset=cs) + burn_subtitles(cp, ap, fp) + subtitle_tasks.append(loop.run_in_executor(None, _gen_and_burn)) + + final_clips.append({ + "index": i, + "start": clip["start"], + "end": clip["end"], + "duration": clip["end"] - clip["start"], + "score": clip.get("final_score", 0), + "clip_path": str(clip_path), + "final_path": str(final_path), + "ass_path": str(ass_path) if settings.mode == "normal" else None, + "download_url": f"/downloads/{session_id}/{final_path.name}", + "raw_url": f"/downloads/{session_id}/{clip_path.name}", + "vision_analysis": clip.get("vision_analysis", {}), + "highlight_reason": clip.get("vision_analysis", {}).get("highlight_reason", ""), + }) + + if subtitle_tasks: + await asyncio.gather(*subtitle_tasks) + 
@app.post("/api/clips/{session_id}/{clip_index}/render")
async def render_clip(session_id: str, clip_index: int):
    """Re-render a clip with its current subtitle file and return the download URL.

    When the clip has an ``.ass`` file on disk, subtitles are burned into the
    raw clip and written next to it as ``<clip stem>_edited.mp4``. Otherwise
    the previously produced final file is reported unchanged. The clip record
    is updated in place with the resulting ``download_url`` and ``final_path``.

    Args:
        session_id: Session that owns the clip.
        clip_index: Value of the clip's ``index`` field.

    Returns:
        ``{"download_url": "/downloads/<session_id>/<file name>"}``.

    Raises:
        HTTPException: 404 when the session or clip is unknown, or when the
            clip record has no file paths to render from.
    """
    clip = _get_clip_or_404(session_id, clip_index)

    # Clip records may carry None paths (the demo clip records in this file
    # do); Path(None) would otherwise surface as an unhandled TypeError.
    if not clip.get("clip_path"):
        raise HTTPException(404, "No source video for this clip")

    clip_path = Path(clip["clip_path"])
    ass_path = Path(clip["ass_path"]) if clip.get("ass_path") else None

    if ass_path and ass_path.exists():
        final_path = clip_path.parent / f"{clip_path.stem}_edited.mp4"
        loop = asyncio.get_running_loop()
        # burn_subtitles is run in the default executor so the event loop stays free.
        await loop.run_in_executor(None, lambda: burn_subtitles(clip_path, ass_path, final_path))
    elif clip.get("final_path"):
        final_path = Path(clip["final_path"])
    else:
        raise HTTPException(404, "No rendered file for this clip")

    download_url = f"/downloads/{session_id}/{final_path.name}"
    clip["download_url"] = download_url
    clip["final_path"] = str(final_path)
    return {"download_url": download_url}
"python-multipart>=0.0.9", - "yt-dlp>=2025.1.15", - "celery[redis]>=5.4.0", - "redis>=5.0.0" -] - -[project.optional-dependencies] -ai = [ - "transformers>=4.47.0", - "accelerate>=1.2.0", - "sentencepiece>=0.2.0", - "safetensors>=0.4.5", - "Pillow>=10.0.0", - "qwen-vl-utils>=0.0.8" -] -rocm-inference = [ - "vllm>=0.6.6", - "optimum-amd>=0.1.0; platform_system == 'Linux'" -] -dev = [ - "pytest>=8.3.0", - "httpx>=0.27.0", - "ruff>=0.6.0" -] - -[build-system] -requires = ["setuptools>=69.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -include = ["app*"] - -[tool.ruff] -line-length = 100 -target-version = "py311" diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..80377ee46c38598592534ac32aabf1d5e421b113 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,37 @@ +# FastAPI & server +fastapi==0.115.5 +uvicorn[standard]==0.32.1 +python-multipart==0.0.20 +websockets==14.1 +aiofiles==24.1.0 +httpx==0.28.1 + +# Video download +yt-dlp==2025.4.30 + +# Video processing (ffmpeg called via subprocess — no Python wrapper needed) +scenedetect[opencv]==0.6.5.2 +librosa==0.10.2 +soundfile==0.12.1 + +# AI — Whisper STT (ROCm-optimized) +# PyTorch must be installed separately with ROCm wheels: +# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 +transformers==4.47.1 +accelerate==1.2.1 + +# AI — Vision: Qwen2.5-VL via vLLM OpenAI-compatible API +# vLLM installed separately: +# pip install vllm --extra-index-url https://download.pytorch.org/whl/rocm6.2 +openai==1.57.4 + +# Subtitles +pysubs2==1.7.3 + +# Utils +numpy==1.26.4 +pillow==11.0.0 +python-dotenv==1.0.1 +pydantic==2.10.4 +pydantic-settings==2.7.0 +loguru==0.7.3 diff --git a/backend/src/__init__.py b/backend/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/backend/src/analysis/__init__.py b/backend/src/analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/backend/src/analysis/highlight_scorer.py b/backend/src/analysis/highlight_scorer.py new file mode 100644 index 0000000000000000000000000000000000000000..bb6e06771f19ca13ea38b1dee5039b477a7b5ac4 --- /dev/null +++ b/backend/src/analysis/highlight_scorer.py @@ -0,0 +1,166 @@ +"""Multi-signal highlight scoring: Vision + Audio energy + Text keywords.""" +import math +from pathlib import Path +from loguru import logger + + +# Style-specific keyword boosts +STYLE_KEYWORDS = { + "funny": ["haha", "lol", "funny", "joke", "laugh", "omg", "what", "no way", "ตลก", "ฮา", "โอ้โห", "搞笑", "哈哈"], + "serious": ["important", "key", "must", "critical", "สำคัญ", "ต้อง", "หลัก", "重要", "关键"], + "educational": ["learn", "tip", "trick", "how", "why", "เรียน", "วิธี", "ทำไม", "学习", "方法", "技巧"], + "gaming": ["win", "lose", "boss", "kill", "score", "level", "ชนะ", "แพ้", "赢", "输"], + "entertainment": ["wow", "amazing", "incredible", "unbelievable", "เจ๋ง", "เยี่ยม", "厉害", "太棒了"], +} + + +def compute_audio_energy(audio_path: Path, scenes: list[dict]) -> list[float]: + """Compute RMS energy per scene using librosa.""" + try: + import librosa + import numpy as np + + y, sr = librosa.load(str(audio_path), sr=16000, mono=True) + energies = [] + + for scene in scenes: + start_sample = int(scene["start"] * sr) + end_sample = int(scene["end"] * sr) + segment = y[start_sample:end_sample] + if len(segment) == 0: + energies.append(0.0) + continue + rms = float(np.sqrt(np.mean(segment ** 2))) + energies.append(rms) + + # Normalize to 0-1 + if max(energies) > 0: + max_e = max(energies) + energies = [e / max_e for e in energies] + + return energies + + except ImportError: + logger.warning("librosa not installed, using uniform audio energy") + return [0.5] * len(scenes) + except Exception as e: + logger.error(f"Audio 
def compute_text_score(
    transcript_text: str,
    clip_style: str,
    keywords_by_style: "dict[str, list[str]] | None" = None,
) -> float:
    """Score transcript text based on style keywords (0-1).

    Args:
        transcript_text: Text spoken during the scene; may be empty.
        clip_style: Requested style name. Matching is case-insensitive and
            ``"entertaining"`` is treated as ``"entertainment"``.
        keywords_by_style: Optional style -> keyword list table. Defaults to
            the module-level ``STYLE_KEYWORDS``.

    Returns:
        0.3 when there is no text or no keywords for the style; otherwise a
        value in [0.1, 1.0] that grows with the number of distinct keywords
        found and saturates once 20% of the style's keywords (at least one)
        are present.
    """
    if not transcript_text:
        return 0.3

    # Callers in this file default the style to "entertaining", while the
    # keyword table is keyed by "entertainment"; without the alias the default
    # style could never match a keyword.
    style_aliases = {"entertaining": "entertainment"}

    table = STYLE_KEYWORDS if keywords_by_style is None else keywords_by_style
    style = clip_style.strip().lower()
    style = style_aliases.get(style, style)

    keywords = table.get(style, [])
    if not keywords:
        return 0.3

    text_lower = transcript_text.lower()
    # Substring match (not word match): the table includes Thai and Chinese
    # keywords, which are not separated by spaces.
    hits = sum(1 for kw in keywords if kw.lower() in text_lower)
    score = min(1.0, hits / max(len(keywords) * 0.2, 1))
    return max(0.1, score)
def select_top_clips(
    scored_scenes: list[dict],
    count: int,
    target_duration: int,
    min_gap_sec: float = 30.0,
) -> list[dict]:
    """Select top-N non-overlapping clips.

    Walks ``scored_scenes`` in the given order (callers pass them best-first),
    resizes each candidate to roughly ``target_duration`` and keeps it unless
    it collides with a clip that was already kept.

    Args:
        scored_scenes: Scene dicts with at least ``start`` and ``end`` (seconds).
        count: Maximum number of clips to return.
        target_duration: Desired clip length in seconds.
        min_gap_sec: Minimum distance between the start times of two selected
            clips, enforced in addition to the no-overlap rule.

    Returns:
        The selected clips sorted by start time; fewer than ``count`` when not
        enough non-colliding candidates exist.
    """
    selected: list[dict] = []

    for scene in scored_scenes:
        if len(selected) >= count:
            break

        # Resize first: the collision test must see the window that will
        # actually be cut, not the raw scene boundaries.
        clip = _adjust_clip_duration(scene, target_duration)

        # Comparing start times alone lets two target-length clips overlap
        # whenever the target is longer than min_gap_sec, so the time windows
        # themselves are compared as well.
        collides = any(
            (clip["start"] < kept["end"] and kept["start"] < clip["end"])
            or abs(clip["start"] - kept["start"]) < min_gap_sec
            for kept in selected
        )
        if collides:
            continue

        selected.append(clip)

    logger.info(f"Selected {len(selected)}/{count} clips")
    return sorted(selected, key=lambda c: c["start"])


def _adjust_clip_duration(scene: dict, target_sec: int) -> dict:
    """Expand or shrink a scene to approximately target_sec.

    A scene already within 5 seconds of the target is returned unchanged.
    Otherwise a ``target_sec``-long window is centred on the scene midpoint;
    if that would start before 0 the window is shifted to start at 0. The
    window end is not clamped, because the video length is not known here.
    """
    current_dur = scene["end"] - scene["start"]
    if abs(current_dur - target_sec) < 5:
        return scene

    # Center the target window on the scene midpoint
    mid = (scene["start"] + scene["end"]) / 2
    half = target_sec / 2
    new_start = max(0, mid - half)
    new_end = new_start + target_sec

    return {**scene, "start": new_start, "end": new_end, "duration": target_sec}
float = 2.0, +) -> list[dict]: + """Detect scene cuts and return list of scenes with timestamps. + + Returns: + [{"start": float, "end": float, "duration": float}, ...] + """ + try: + from scenedetect import open_video, SceneManager + from scenedetect.detectors import ContentDetector + + video = open_video(str(video_path)) + scene_manager = SceneManager() + scene_manager.add_detector(ContentDetector(threshold=threshold)) + + logger.info(f"Running scene detection on: {video_path.name}") + scene_manager.detect_scenes(video, show_progress=False) + scene_list = scene_manager.get_scene_list() + + scenes = [] + for start_tc, end_tc in scene_list: + start = start_tc.get_seconds() + end = end_tc.get_seconds() + duration = end - start + if duration >= min_scene_len_sec: + scenes.append({"start": start, "end": end, "duration": duration}) + + logger.info(f"Detected {len(scenes)} scenes") + if not scenes: + logger.warning("0 scenes from ContentDetector — using fixed-interval fallback") + return _fixed_interval_scenes(video_path, interval_sec=8.0) + return scenes + + except ImportError: + logger.warning("scenedetect not installed, using fixed-interval fallback") + return _fixed_interval_scenes(video_path, interval_sec=5.0) + except Exception as e: + logger.error(f"Scene detection failed: {e}") + return _fixed_interval_scenes(video_path, interval_sec=5.0) + + +def _fixed_interval_scenes(video_path: Path, interval_sec: float = 5.0) -> list[dict]: + """Fallback: split video into fixed-interval scenes.""" + import subprocess + result = subprocess.run( + ["ffprobe", "-v", "error", "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", str(video_path)], + capture_output=True, text=True + ) + try: + total = float(result.stdout.strip()) + except ValueError: + total = 300.0 + + scenes = [] + t = 0.0 + while t < total: + end = min(t + interval_sec, total) + scenes.append({"start": t, "end": end, "duration": end - t}) + t = end + return scenes + + +def 
sample_frames( + video_path: Path, + scenes: list[dict], + output_dir: Path, + frames_per_scene: int = 3, +) -> list[dict]: + """Extract representative frames from each scene for vision analysis. + + Returns scenes with added 'frame_paths' key. + """ + import subprocess + output_dir.mkdir(parents=True, exist_ok=True) + + result_scenes = [] + for i, scene in enumerate(scenes): + mid = scene["start"] + scene["duration"] / 2 + frame_paths = [] + + # Sample frames at start, middle, end of scene + timestamps = [ + scene["start"] + scene["duration"] * 0.2, + mid, + scene["start"] + scene["duration"] * 0.8, + ][:frames_per_scene] + + for j, ts in enumerate(timestamps): + frame_path = output_dir / f"scene_{i:04d}_frame_{j}.jpg" + cmd = [ + "ffmpeg", "-y", "-ss", str(ts), "-i", str(video_path), + "-vframes", "1", "-q:v", "2", "-vf", "scale=640:-1", + str(frame_path) + ] + subprocess.run(cmd, capture_output=True) + if frame_path.exists(): + frame_paths.append(str(frame_path)) + + result_scenes.append({**scene, "index": i, "frame_paths": frame_paths}) + + return result_scenes diff --git a/backend/src/analysis/vision.py b/backend/src/analysis/vision.py new file mode 100644 index 0000000000000000000000000000000000000000..72f22a826c1b4a64d81d21184d382a5c08a85245 --- /dev/null +++ b/backend/src/analysis/vision.py @@ -0,0 +1,305 @@ +"""Qwen2.5-VL multimodal scene analysis via vLLM OpenAI-compatible API. + +Sends video frames + transcript text together (true multimodal fusion). +Outputs: excitement_score, face_bbox, action_type, humor_level, emotion. +All scenes analyzed concurrently — vLLM handles GPU batching internally. 
+""" +import asyncio +import base64 +import json +import os +from pathlib import Path +from typing import Optional +from loguru import logger + +VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "http://localhost:8000/v1") +VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct") +VLLM_API_KEY = os.getenv("VLLM_API_KEY", "EMPTY") + +ANALYSIS_PROMPT = """You are a TikTok content expert analyzing a livestream segment for highlight potential. +Analyze the provided video frames and transcript text together as a unified multimodal signal. + +Respond ONLY with valid JSON matching this exact schema — no markdown, no explanation: +{{ + "excitement_score": <0.0-1.0>, + "humor_level": <0.0-1.0>, + "emotion": "", + "action_type": "", + "has_face": , + "face_bbox": [, , , ] or null, + "highlight_reason": "", + "tiktok_potential": <0.0-1.0> +}} + +Channel context: {channel_description} +Requested clip style: {clip_style} +""" + + +def _encode_image(image_path: str) -> str: + with open(image_path, "rb") as f: + return base64.b64encode(f.read()).decode("utf-8") + + +def analyze_scene( + scene: dict, + transcript_text: str = "", + channel_description: str = "", + clip_style: str = "entertaining", +) -> dict: + """Analyze a single scene using Qwen2.5-VL (vision + text multimodal fusion). + + Sends up to 3 representative frames + transcript context to vLLM. + Returns analysis dict with excitement_score, face_bbox, etc. 
+ """ + try: + from openai import OpenAI + + client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY) + frame_paths = scene.get("frame_paths", []) + if not frame_paths: + return _default_analysis() + + content = [] + + # Add up to 3 frames as base64 images + for frame_path in frame_paths[:3]: + if Path(frame_path).exists(): + b64 = _encode_image(frame_path) + content.append({ + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, + }) + + if not content: + return _default_analysis() + + prompt = ANALYSIS_PROMPT.format( + channel_description=channel_description or "General content creator", + clip_style=clip_style, + ) + if transcript_text.strip(): + prompt += f"\n\nTranscript for this segment:\n\"{transcript_text.strip()}\"" + + content.append({"type": "text", "text": prompt}) + + response = client.chat.completions.create( + model=VLLM_MODEL, + messages=[{"role": "user", "content": content}], + max_tokens=300, + temperature=0.1, + ) + + raw = response.choices[0].message.content.strip() + # Strip markdown code fences if present + if raw.startswith("```"): + parts = raw.split("```") + raw = parts[1] if len(parts) > 1 else raw + if raw.startswith("json"): + raw = raw[4:] + + analysis = json.loads(raw.strip()) + logger.debug( + f"Scene [{scene['start']:.1f}s-{scene['end']:.1f}s]: " + f"excitement={analysis.get('excitement_score', 0):.2f} " + f"tiktok={analysis.get('tiktok_potential', 0):.2f} | " + f"{analysis.get('highlight_reason', '')[:60]}" + ) + try: + from src.gpu.vllm_manager import vllm_touch + vllm_touch() + except Exception: + pass + return analysis + + except Exception as e: + logger.warning(f"Vision analysis failed at {scene.get('start', 0):.1f}s: {e}") + return _default_analysis() + + +async def analyze_scenes_batch_async( + scenes_with_frames: list[dict], + transcript_segments: list[dict], + channel_description: str = "", + clip_style: str = "entertaining", +) -> list[dict]: + """Analyze all scenes concurrently. 
+ + Sends all vLLM requests in parallel — the server queues and batches them + internally, giving full GPU utilization on AMD MI300X. + Each result includes 'vision_analysis' and 'transcript_text' for scoring. + """ + loop = asyncio.get_running_loop() + + async def _analyze_one(scene: dict) -> dict: + scene_text = " ".join( + seg["text"] for seg in transcript_segments + if seg["start"] < scene["end"] and seg["end"] > scene["start"] + ) + analysis = await loop.run_in_executor( + None, + lambda s=scene, t=scene_text: analyze_scene(s, t, channel_description, clip_style), + ) + return {**scene, "vision_analysis": analysis, "transcript_text": scene_text} + + results = await asyncio.gather(*[_analyze_one(s) for s in scenes_with_frames]) + logger.info(f"Vision analysis complete: {len(results)} scenes") + return list(results) + + +def _default_analysis() -> dict: + """Fallback analysis when vLLM is unavailable (keeps pipeline running).""" + return { + "excitement_score": 0.5, + "humor_level": 0.3, + "emotion": "neutral", + "action_type": "talking", + "has_face": False, + "face_bbox": None, + "highlight_reason": "Vision model unavailable — using audio+text signals only", + "tiktok_potential": 0.4, + } + + +HRE_SEGMENT_PROMPT = """Analyze this video frame for high-retention TikTok editing decisions. + +Segment {seg_idx} of {n_total}. 
Transcript: "{context}" + +Respond ONLY with valid JSON — no markdown: +{{ + "zoom_direction": "", + "zoom_speed": "", + "face_detected": , + "face_cx": <0.0-1.0>, + "face_cy": <0.0-1.0>, + "subtitle_position": "", + "subtitle_color": "", + "energy_level": "", + "moment_type": "" +}} + +Rules: +- seg_idx==0: always zoom_direction=in, zoom_speed=fast (hook the viewer) +- zoom IN fast: punchlines, reactions, peak energy +- zoom IN slow: context, buildup, moderate energy +- zoom OUT: reveals, breathing room after intensity +- HOLD: stable content, text-heavy moments +- subtitle TOP: face is in bottom half → put text at top +- subtitle BOTTOM: face is in top half → text at bottom +- face_cx/face_cy: face center as 0.0-1.0 fraction of frame +""" + + +def analyze_frame_for_hre( + frame_path: "Path", + context: str = "", + seg_idx: int = 0, + n_total: int = 1, +) -> dict: + """Per-segment HRE: zoom direction, subtitle position+color for this moment.""" + try: + from openai import OpenAI + + client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY) + if not Path(frame_path).exists(): + return _default_hre_analysis(seg_idx, n_total) + + b64 = _encode_image(str(frame_path)) + prompt = HRE_SEGMENT_PROMPT.format( + seg_idx=seg_idx, n_total=n_total, context=context[:200] + ) + response = client.chat.completions.create( + model=VLLM_MODEL, + messages=[{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}, + {"type": "text", "text": prompt}, + ], + }], + max_tokens=200, + temperature=0.1, + ) + raw = response.choices[0].message.content.strip() + if raw.startswith("```"): + parts = raw.split("```") + raw = parts[1] if len(parts) > 1 else raw + if raw.startswith("json"): + raw = raw[4:] + + analysis = json.loads(raw.strip()) + logger.debug( + f"HRE seg {seg_idx}/{n_total}: " + f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) " + 
def get_emoji_for_scene(scene_text: str, emotion: str, action_type: str) -> str:
    """Pick one emoji for a scene, asking the configured model first.

    The model behind ``VLLM_BASE_URL`` is sent a text-only prompt. Its reply
    is used only when it plausibly is an emoji: non-empty, at most 4 code
    points, and not plain ASCII (so an empty reply or words like "ok" are
    rejected). Any failure while calling the model is deliberately ignored
    and the static lookup table below is used instead.

    Args:
        scene_text: Transcript text of the scene (first 200 chars are sent).
        emotion: Emotion label; first key tried in the fallback table.
        action_type: Action label; second key tried in the fallback table.

    Returns:
        A short emoji string; "⚡" when neither label is in the table.
    """
    try:
        from openai import OpenAI
        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)

        response = client.chat.completions.create(
            model=VLLM_MODEL,
            messages=[{"role": "user", "content": (
                f"Select ONE emoji for this TikTok moment.\n"
                f"Emotion: {emotion}\nAction: {action_type}\n"
                f"Text: \"{scene_text[:200]}\"\n"
                f"Reply with ONLY the emoji character, nothing else."
            )}],
            max_tokens=5,
            temperature=0.3,
        )
        emoji = (response.choices[0].message.content or "").strip()
        # Reject empty replies and short ASCII text; both passed the old length-only check.
        if emoji and len(emoji) <= 4 and not emoji.isascii():
            return emoji
    except Exception:
        # Best effort: the model is optional here, fall through to the static table.
        pass

    emoji_map = {
        "happy": "😄", "excited": "🔥", "funny": "😂",
        "surprised": "😲", "angry": "😤", "sad": "😢",
        "neutral": "💡", "gaming": "🎮", "tutorial": "📚",
        "entertainment": "✨", "reaction": "😱",
    }
    return emoji_map.get(emotion) or emoji_map.get(action_type, "⚡")
def get_optimal_batch_size(model_type: str = "whisper") -> int:
    """Map the VRAM reported by ``get_vram_gb()`` to a batch size.

    Tiers are checked from the largest VRAM threshold downwards; the first
    threshold that the reported VRAM meets decides the batch size. Unknown
    model types always get a batch size of 1.
    """
    available_gb = get_vram_gb()

    # (minimum VRAM in GB, batch size), largest tier first, plus the floor value.
    if model_type == "whisper":
        tiers, floor = ((48, 32), (24, 16), (16, 8)), 4
    elif model_type == "vision":
        tiers, floor = ((80, 8), (48, 4)), 1
    else:
        tiers, floor = (), 1

    for min_gb, batch in tiers:
        if available_gb >= min_gb:
            return batch
    return floor
+""" +import os +import subprocess +import threading +import time + +import requests +from loguru import logger + +VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct") +VLLM_PORT = int(os.getenv("VLLM_PORT", "8000")) +IDLE_TIMEOUT = int(os.getenv("VLLM_IDLE_TIMEOUT", "300")) # 5 min default +ON_DEMAND = os.getenv("VLLM_ON_DEMAND", "true").lower() == "true" +DOCKER_CONTAINER = os.getenv("VLLM_DOCKER_CONTAINER", "rocm") # container that has vllm installed + + +class _VLLMManager: + def __init__(self): + self._proc: subprocess.Popen | None = None + self._lock = threading.Lock() + self._last_used = 0.0 + threading.Thread(target=self._watchdog, daemon=True, name="vllm-watchdog").start() + + # ── Public ──────────────────────────────────────────────────────────── + + def is_running(self) -> bool: + if not ON_DEMAND or DOCKER_CONTAINER: + # Docker mode or external vLLM: rely solely on health endpoint + return self._check_health() + with self._lock: + if self._proc is None or self._proc.poll() is not None: + return False + return self._check_health() + + def ensure_running(self, progress_cb=None) -> None: + """Start vLLM if not running. 
Blocks until healthy (max 3 min).""" + if not ON_DEMAND: + return + with self._lock: + if self._check_health(): + self._last_used = time.time() + return + self._start(progress_cb) + + def stop(self) -> None: + if not ON_DEMAND: + return + with self._lock: + self._stop_locked() + + def touch(self) -> None: + """Reset idle timer — call after each successful vLLM API call.""" + self._last_used = time.time() + + def status(self) -> dict: + running = self.is_running() + idle = round(time.time() - self._last_used, 1) if self._last_used else None + return { + "running": running, + "on_demand": ON_DEMAND, + "idle_seconds": idle, + "idle_timeout": IDLE_TIMEOUT, + "model": VLLM_MODEL, + } + + # ── Internal ────────────────────────────────────────────────────────── + + def _health_url(self) -> str: + return f"http://localhost:{VLLM_PORT}/health" + + def _check_health(self) -> bool: + try: + return requests.get(self._health_url(), timeout=2).status_code == 200 + except Exception: + return False + + def _start(self, progress_cb=None) -> None: + logger.info("vLLM: starting on demand…") + if progress_cb: + progress_cb("Starting AI model (Qwen2.5-VL)… ~2 min first time") + + # Try Docker container first (vLLM may only be installed inside a container) + if DOCKER_CONTAINER: + self._start_via_docker(progress_cb) + else: + self._start_via_subprocess(progress_cb) + + def _start_via_docker(self, progress_cb=None) -> None: + """Start vLLM inside an existing Docker container via docker exec.""" + cmd = ( + f"vllm serve {VLLM_MODEL} " + f"--host 0.0.0.0 --port {VLLM_PORT} " + f"--gpu-memory-utilization 0.85 --max-model-len 4096 " + f"> /tmp/vllm_server.log 2>&1" + ) + subprocess.Popen( + ["docker", "exec", "-d", DOCKER_CONTAINER, "bash", "-c", cmd], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + self._proc = None # process lives inside container, tracked by health check + + deadline = time.time() + 200 + tick = 0 + while time.time() < deadline: + time.sleep(5) + tick += 1 + 
if self._check_health(): + self._last_used = time.time() + logger.info(f"vLLM (docker) ready after {tick * 5}s") + return + if progress_cb and tick % 6 == 0: + progress_cb(f"AI model loading… {tick * 5}s") + + raise RuntimeError("vLLM did not start within 200s") + + def _start_via_subprocess(self, progress_cb=None) -> None: + """Start vLLM as a direct subprocess (vllm must be in current Python env).""" + import sys + self._proc = subprocess.Popen( + [ + sys.executable, "-m", "vllm.entrypoints.openai.api_server", + "--model", VLLM_MODEL, + "--device", "rocm", + "--port", str(VLLM_PORT), + "--gpu-memory-utilization", "0.85", + "--max-model-len", "4096", + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + + deadline = time.time() + 200 + tick = 0 + while time.time() < deadline: + time.sleep(5) + tick += 1 + if self._proc.poll() is not None: + err = self._proc.stderr.read().decode()[-600:] + raise RuntimeError(f"vLLM exited during startup: {err}") + if self._check_health(): + self._last_used = time.time() + logger.info(f"vLLM ready after {tick * 5}s") + return + if progress_cb and tick % 6 == 0: + progress_cb(f"AI model loading… {tick * 5}s") + + raise RuntimeError("vLLM did not start within 200s") + + def _stop_locked(self) -> None: + if DOCKER_CONTAINER: + subprocess.run( + ["docker", "exec", DOCKER_CONTAINER, "pkill", "-f", "vllm"], + capture_output=True, + ) + self._proc = None + elif self._proc and self._proc.poll() is None: + self._proc.terminate() + try: + self._proc.wait(timeout=10) + except subprocess.TimeoutExpired: + self._proc.kill() + self._proc = None + logger.info("vLLM stopped") + + def _watchdog(self) -> None: + while True: + time.sleep(60) + if not ON_DEMAND or IDLE_TIMEOUT <= 0: + continue + with self._lock: + if (self._proc + and self._proc.poll() is None + and self._last_used > 0 + and time.time() - self._last_used > IDLE_TIMEOUT): + logger.info( + f"vLLM idle {IDLE_TIMEOUT}s → shutting down to save GPU credits" + ) + 
self._stop_locked() + + +_manager = _VLLMManager() + + +# ── Module-level helpers ────────────────────────────────────────────────────── + +def ensure_vllm_running(progress_cb=None) -> None: + _manager.ensure_running(progress_cb) + + +def vllm_touch() -> None: + _manager.touch() + + +def vllm_stop() -> None: + _manager.stop() + + +def vllm_is_running() -> bool: + return _manager.is_running() + + +def vllm_status() -> dict: + return _manager.status() diff --git a/backend/src/ingestion/__init__.py b/backend/src/ingestion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/backend/src/ingestion/uploader.py b/backend/src/ingestion/uploader.py new file mode 100644 index 0000000000000000000000000000000000000000..52c9502f09aed6e7ff01b2722b503108021079d2 --- /dev/null +++ b/backend/src/ingestion/uploader.py @@ -0,0 +1,34 @@ +"""Handle file upload from user.""" +import shutil +from pathlib import Path +from fastapi import UploadFile +from loguru import logger + +ALLOWED_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v"} +MAX_SIZE_BYTES = 2 * 1024 * 1024 * 1024 # 2 GB + + +async def save_upload( + file: UploadFile, + output_dir: Path, + session_id: str, +) -> Path: + """Save uploaded video file to disk.""" + suffix = Path(file.filename or "video.mp4").suffix.lower() + if suffix not in ALLOWED_EXTENSIONS: + raise ValueError(f"Unsupported file type: {suffix}. 
Allowed: {ALLOWED_EXTENSIONS}") + + output_dir.mkdir(parents=True, exist_ok=True) + dest = output_dir / f"{session_id}_input{suffix}" + + size = 0 + with open(dest, "wb") as f: + while chunk := await file.read(1024 * 1024): # 1MB chunks + size += len(chunk) + if size > MAX_SIZE_BYTES: + dest.unlink(missing_ok=True) + raise ValueError("File too large (max 2 GB)") + f.write(chunk) + + logger.info(f"Saved upload: {dest} ({size / 1024 / 1024:.1f} MB)") + return dest diff --git a/backend/src/ingestion/youtube.py b/backend/src/ingestion/youtube.py new file mode 100644 index 0000000000000000000000000000000000000000..b46b693b89c0745cd1e41ea0456e9ef721ba3833 --- /dev/null +++ b/backend/src/ingestion/youtube.py @@ -0,0 +1,147 @@ +"""YouTube video downloader using yt-dlp.""" +import asyncio +import subprocess +from pathlib import Path +from typing import Optional, Callable +import yt_dlp +from loguru import logger + + +def _progress_hook(callback: Optional[Callable] = None): + def hook(d: dict): + if d["status"] == "downloading" and callback: + pct = d.get("_percent_str", "0%").strip().replace("%", "") + try: + callback(float(pct)) + except ValueError: + pass + return hook + + +def download_video( + url: str, + output_dir: Path, + session_id: str, + progress_callback: Optional[Callable] = None, + max_height: int = 1080, +) -> Path: + """Download video from YouTube (or any yt-dlp-supported site). + + Returns path to downloaded MP4 file. 
+ """ + output_dir.mkdir(parents=True, exist_ok=True) + output_template = str(output_dir / f"{session_id}_input.%(ext)s") + + ydl_opts = { + "format": ( + f"bestvideo[vcodec^=avc1][height<={max_height}]+bestaudio/" + f"bestvideo[vcodec^=avc][height<={max_height}]+bestaudio/" + f"bestvideo[vcodec!^=av01][height<={max_height}]+bestaudio/" + f"best[height<={max_height}]/best" + ), + "format_sort": ["vcodec:h264"], + "outtmpl": output_template, + "merge_output_format": "mp4", + "quiet": True, + "no_warnings": True, + "progress_hooks": [_progress_hook(progress_callback)], + "postprocessors": [{ + "key": "FFmpegVideoConvertor", + "preferedformat": "mp4", + }], + # Use iOS/Android clients to bypass datacenter IP bot-detection + "extractor_args": { + "youtube": { + "player_client": ["ios", "android", "tv_embedded"], + } + }, + } + _inject_cookies(ydl_opts) + + logger.info(f"Downloading: {url}") + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(url, download=True) + title = info.get("title", "video") + duration = info.get("duration", 0) + logger.info(f"Downloaded: '{title}' ({duration}s)") + + output_path = output_dir / f"{session_id}_input.mp4" + if not output_path.exists(): + for f in output_dir.glob(f"{session_id}_input.*"): + output_path = f + break + + # Safety: transcode AV1 → h264 if yt-dlp still picked it + output_path = _ensure_h264(output_path) + return output_path + + +def _ensure_h264(video_path: Path) -> Path: + """Transcode to h264 if video codec is AV1 (not supported by PySceneDetect on this server).""" + probe = subprocess.run( + ["ffprobe", "-v", "error", "-select_streams", "v:0", + "-show_entries", "stream=codec_name", "-of", "csv=p=0", str(video_path)], + capture_output=True, text=True, + ) + codec = probe.stdout.strip().lower() + if codec not in ("av1", "av01"): + return video_path + + logger.warning(f"AV1 detected ({video_path.name}), transcoding to h264...") + out = video_path.with_name(video_path.stem + "_h264.mp4") + result = 
def get_video_info(url: str) -> dict:
    """Return metadata for ``url`` without downloading the media.

    Fields the extractor reports as missing or ``None`` are replaced by an
    empty string / zero, so the result always has the same value types.
    The description is truncated to 500 characters.

    Returns:
        Dict with ``title``, ``duration``, ``thumbnail``, ``channel``,
        ``view_count`` and ``description``.
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "extractor_args": {
            "youtube": {"player_client": ["ios", "android", "tv_embedded"]}
        },
    }
    _inject_cookies(ydl_opts)
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False) or {}
    # `dict.get(key, default)` still returns None when the key exists with a
    # None value, which made `[:500]` raise TypeError; `or` covers both cases.
    return {
        "title": info.get("title") or "",
        "duration": info.get("duration") or 0,
        "thumbnail": info.get("thumbnail") or "",
        "channel": info.get("channel") or "",
        "view_count": info.get("view_count") or 0,
        "description": (info.get("description") or "")[:500],
    }
def extract_clip(
    video_path: Path,
    start: float,
    end: float,
    output_path: Path,
    use_hw_encode: bool = True,
    vertical: bool = True,
    face_bbox: list = None,
    **kwargs,
) -> Path:
    """Cut ``start``..``end`` from the video and optionally convert it to 9:16.

    Args:
        video_path: Source video.
        start: Clip start in seconds.
        end: Clip end in seconds.
        output_path: Destination file; parent directories are created.
        use_hw_encode: Try the ``h264_amf`` encoder before ``libx264``.
        vertical: When true, produce a 1080x1920 frame (crop or letterbox).
        face_bbox: ``[x1, y1, x2, y2]`` from the vision model; its horizontal
            centre is used to position the crop. When it is missing, has the
            wrong length, or holds non-numeric values, a centre crop is used.
        **kwargs: ``aspect_mode`` — ``"crop"`` (default) or ``"letterbox"``.

    Returns:
        ``output_path`` once one of the encoders succeeded.

    Raises:
        RuntimeError: If every encoder failed; the message carries the tail
            of the last ffmpeg stderr output.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    encoders = ["h264_amf", "libx264"] if use_hw_encode else ["libx264"]

    # 9:16 vertical conversion filter
    vf_filters = []
    if vertical:
        aspect_mode = kwargs.get("aspect_mode", "crop")
        if aspect_mode == "letterbox":
            # Fit entire 16:9 frame into 9:16, black bars top+bottom
            vf_filters.append(
                "scale=1080:1920:force_original_aspect_ratio=decrease,"
                "pad=1080:1920:(ow-iw)/2:(oh-ih)/2:black"
            )
        else:
            # Crop: scale to 1920 height first, then crop to 1080 wide
            crop = "scale=-1:1920,crop=1080:1920:(iw-1080)/2:0"
            if face_bbox and len(face_bbox) == 4:
                try:
                    face_cx = int((float(face_bbox[0]) + float(face_bbox[2])) / 2)
                except (TypeError, ValueError, OverflowError):
                    # The bbox comes from model output; bad values must not abort the clip.
                    logger.debug(f"Ignoring malformed face_bbox {face_bbox!r}, using center crop")
                else:
                    # NOTE(review): per the ffmpeg crop filter docs `iw` and `in_w`
                    # are aliases, so `*iw/in_w` is a no-op and face_cx is applied as
                    # a coordinate of the *scaled* frame — confirm which coordinate
                    # space face_bbox is expressed in and rescale if needed.
                    crop = f"scale=-1:1920,crop=1080:1920:max(0\\,min(iw-1080\\,{face_cx}*iw/in_w-540)):0"
            vf_filters.append(crop)

    last_error = ""
    for encoder in encoders:
        cmd = ["ffmpeg", "-y", "-ss", str(start), "-to", str(end), "-i", str(video_path)]
        if vf_filters:
            cmd += ["-vf", ",".join(vf_filters)]
        cmd += ["-c:v", encoder, "-c:a", "aac", "-b:a", "128k", "-movflags", "+faststart", str(output_path)]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            if encoder == "h264_amf":
                logger.info(f"Encoded 9:16 with AMD AMF: {output_path.name}")
            return output_path
        last_error = (result.stderr or "")[-500:]
        if encoder == "h264_amf":
            logger.debug("AMD AMF not available, falling back to libx264")

    raise RuntimeError(f"All encoders failed for clip {output_path.name}\n{last_error}")
async def extract_all_clips_async(
    video_path: Path,
    selected_clips: list[dict],
    output_dir: Path,
    session_id: str,
    aspect_mode: str = "crop",
) -> list[dict]:
    """Run ``extract_all_clips`` in the default executor without blocking the loop.

    Arguments and return value are passed through to / from
    ``extract_all_clips`` unchanged.
    """
    # Inside a coroutine the running loop is the one to use; get_event_loop()
    # is the legacy spelling and may create a loop when called elsewhere.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        lambda: extract_all_clips(video_path, selected_clips, output_dir, session_id, aspect_mode)
    )
"-c:a", "copy", + str(output_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0 and output_path.exists(): + return output_path + logger.warning(f"Emoji overlay failed: {result.stderr[-200:]}") + return video_path # fallback to original diff --git a/backend/src/processing/high_retention.py b/backend/src/processing/high_retention.py new file mode 100644 index 0000000000000000000000000000000000000000..f28559be5f63c05d19dfc2184e86f0eb0f9a10b3 --- /dev/null +++ b/backend/src/processing/high_retention.py @@ -0,0 +1,491 @@ +"""High-Retention Editing pipeline — per-segment AI decisions. + +Each 3-5s segment gets its own zoom direction, subtitle position, +and caption color driven by Qwen2.5-VL analyzing one frame per segment. + +Pipeline per clip: + 1. Segment clip at speech pauses (3-5s chunks) + 2. Extract midpoint frame from each segment + 3. Qwen2.5-VL analyzes each frame → zoom + subtitle decisions + 4. ffmpeg filter_complex: per-segment zoompan + concat + 5. 
ASS subtitles with per-segment alignment/color/size override tags +""" +import subprocess +import tempfile +from pathlib import Path +from loguru import logger + + +# ─── Video metadata ──────────────────────────────────────────────────────────── + +def _probe_dimensions(video_path: Path) -> tuple[int, int]: + probe = subprocess.run( + ["ffprobe", "-v", "error", "-select_streams", "v:0", + "-show_entries", "stream=width,height", "-of", "csv=p=0", + str(video_path)], + capture_output=True, text=True, + ) + try: + w, h = map(int, probe.stdout.strip().split(",")) + return w, h + except Exception: + return 1080, 1920 + + +def _probe_duration(video_path: Path) -> float: + probe = subprocess.run( + ["ffprobe", "-v", "error", "-show_entries", "format=duration", + "-of", "csv=p=0", str(video_path)], + capture_output=True, text=True, + ) + try: + return float(probe.stdout.strip()) + except Exception: + return 0.0 + + +def _has_audio_stream(video_path: Path) -> bool: + probe = subprocess.run( + ["ffprobe", "-v", "error", "-select_streams", "a", + "-show_entries", "stream=codec_type", "-of", "csv=p=0", + str(video_path)], + capture_output=True, text=True, + ) + return bool(probe.stdout.strip()) + + +# ─── Segmentation ───────────────────────────────────────────────────────────── + +def _segment_clip( + duration: float, + transcript: dict, + clip_start: float, + max_seg: float = 4.5, +) -> list[dict]: + """Divide clip into segments at speech pauses, max_seg seconds each.""" + words: list[dict] = [] + for seg in transcript.get("segments", []): + words.extend(seg.get("words", [])) + + if clip_start > 0: + words = [ + {**w, "start": max(0.0, w["start"] - clip_start), + "end": max(0.0, w["end"] - clip_start)} + for w in words + ] + words = [w for w in words if w["end"] > 0 and w["start"] < duration] + + # Collect pause midpoints as candidate cut times + cuts = [0.0] + for i in range(len(words) - 1): + gap = words[i + 1]["start"] - words[i]["end"] + if gap > 0.2: + 
cuts.append((words[i]["end"] + words[i + 1]["start"]) / 2.0) + cuts.append(duration) + cuts = sorted(set(cuts)) + + # Merge short intervals, split long ones + segs: list[dict] = [] + start = 0.0 + for cut in cuts[1:]: + seg_len = cut - start + if seg_len < 1.5 and cut < duration: + continue # too short — extend to next cut + if seg_len > max_seg: + t = start + while t + max_seg < cut: + segs.append({"start": t, "end": t + max_seg}) + t += max_seg + if cut - t > 0.5: + segs.append({"start": t, "end": cut}) + start = cut + else: + segs.append({"start": start, "end": cut}) + start = cut + + # Fallback: split evenly if not enough segments + if len(segs) < 2: + n = max(2, round(duration / 4.0)) + d = duration / n + segs = [{"start": i * d, "end": min((i + 1) * d, duration)} for i in range(n)] + + return segs + + +# ─── Frame extraction ───────────────────────────────────────────────────────── + +def _extract_frame(video_path: Path, t: float, out_path: Path) -> bool: + cmd = [ + "ffmpeg", "-y", "-ss", f"{t:.3f}", "-i", str(video_path), + "-vframes", "1", "-q:v", "3", str(out_path), + ] + result = subprocess.run(cmd, capture_output=True, timeout=30) + return result.returncode == 0 and out_path.exists() + + +# ─── Per-segment AI analysis ────────────────────────────────────────────────── + +def _analyze_segment( + video_path: Path, + seg: dict, + seg_idx: int, + n_total: int, + transcript: dict, + clip_start: float, + tmp_dir: Path, +) -> dict: + from src.analysis.vision import analyze_frame_for_hre, _default_hre_analysis + + mid_t = (seg["start"] + seg["end"]) / 2.0 + frame_path = tmp_dir / f"seg_{seg_idx:03d}.jpg" + + if not _extract_frame(video_path, mid_t, frame_path): + return _default_hre_analysis(seg_idx, n_total) + + words_all: list[dict] = [] + for s in transcript.get("segments", []): + words_all.extend(s.get("words", [])) + + abs_start = seg["start"] + clip_start + abs_end = seg["end"] + clip_start + context = " ".join( + w.get("word", w.get("text", "")) + for w 
in words_all + if w.get("start", 0) < abs_end and w.get("end", 0) > abs_start + ).strip() + + return analyze_frame_for_hre(frame_path, context, seg_idx, n_total) + + +# ─── Zoom expression builders ───────────────────────────────────────────────── + +def _build_zoom_exprs( + analysis: dict, + w: int, + h: int, +) -> tuple[str, str, str]: + """Return (z_expr, x_expr, y_expr) for ffmpeg zoompan from HRE analysis. + Note: \\, escapes comma inside ffmpeg filter expressions. + """ + direction = analysis.get("zoom_direction", "in") + speed = analysis.get("zoom_speed", "slow") + face_detected = bool(analysis.get("face_detected", False)) + face_cx = float(analysis.get("face_cx") or 0.5) + face_cy = float(analysis.get("face_cy") or 0.38) + + if direction == "in": + if speed == "fast": + z_expr, max_zoom = "min(1.2+n*0.0014\\,1.6)", 1.6 + else: + z_expr, max_zoom = "min(1.05+n*0.0006\\,1.35)", 1.35 + elif direction == "out": + if speed == "fast": + z_expr, max_zoom = "max(1.6-n*0.0016\\,1.0)", 1.6 + else: + z_expr, max_zoom = "max(1.4-n*0.0010\\,1.0)", 1.4 + else: # hold + z_expr, max_zoom = "1.1", 1.1 + + if face_detected and direction == "in" and max_zoom > 1.05: + raw_cx = int(face_cx * w - w / (max_zoom * 2)) + raw_cy = int(face_cy * h - h / (max_zoom * 2)) + safe_cx = max(0, min(w - int(w / max_zoom), raw_cx)) + safe_cy = max(0, min(h - int(h / max_zoom), raw_cy)) + ctr_x = w / 2 - w / (max_zoom * 2) + ctr_y = h / 2 - h / (max_zoom * 2) + x_expr = ( + f"(iw/2-(iw/zoom/2))+({safe_cx}-{ctr_x:.1f})*(zoom-1)/({max_zoom}-1)" + ) + y_expr = ( + f"(ih/2-(ih/zoom/2))+({safe_cy}-{ctr_y:.1f})*(zoom-1)/({max_zoom}-1)" + ) + else: + x_expr = "iw/2-(iw/zoom/2)" + if direction == "in": + y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38 + y_expr = f"ih*{y_bias:.2f}-(ih/zoom/2)" + else: + y_expr = "ih/2-(ih/zoom/2)" + + return z_expr, x_expr, y_expr + + +# ─── Per-segment zoom via filter_complex ────────────────────────────────────── + +def _apply_per_segment_zoom( + input_path: 
Path, + segments: list[dict], + analyses: list[dict], + w: int, + h: int, + output_path: Path, + has_audio: bool = True, +) -> Path: + """Apply different zoompan to each segment, concat into single stream.""" + filter_parts: list[str] = [] + v_labels: list[str] = [] + a_labels: list[str] = [] + + for i, (seg, analysis) in enumerate(zip(segments, analyses)): + s = f"{seg['start']:.3f}" + e = f"{seg['end']:.3f}" + z, x, y = _build_zoom_exprs(analysis, w, h) + zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30" + filter_parts.append(f"[0:v]trim={s}:{e},setpts=PTS-STARTPTS,{zp}[v{i}]") + v_labels.append(f"[v{i}]") + if has_audio: + filter_parts.append(f"[0:a]atrim={s}:{e},asetpts=PTS-STARTPTS[a{i}]") + a_labels.append(f"[a{i}]") + + n = len(segments) + filter_parts.append("".join(v_labels) + f"concat=n={n}:v=1:a=0[vout]") + if has_audio: + filter_parts.append("".join(a_labels) + f"concat=n={n}:v=0:a=1[aout]") + + cmd = [ + "ffmpeg", "-y", "-i", str(input_path), + "-filter_complex", ";".join(filter_parts), + "-map", "[vout]", + ] + if has_audio: + cmd += ["-map", "[aout]", "-c:a", "aac"] + cmd += ["-c:v", "libx264", "-movflags", "+faststart", str(output_path)] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + if result.returncode == 0 and output_path.exists(): + logger.info(f"Per-segment zoom: {n} segments, {w}x{h}") + return output_path + logger.warning(f"Per-segment zoom failed: {result.stderr[-800:]}") + return input_path + + +# ─── Per-segment ASS subtitles ──────────────────────────────────────────────── + +_ASS_COLORS = { + "white": "&H00FFFFFF", + "yellow": "&H0000FFFF", + "cyan": "&H00FFFF00", + "orange": "&H000066FF", + "green": "&H0000FF00", + "red": "&H000000FF", +} + + +def _ts(t: float) -> str: + h = int(t // 3600) + m = int((t % 3600) // 60) + s = t % 60 + return f"{h}:{m:02d}:{s:06.3f}" + + +def _generate_per_segment_subtitles( + transcript: dict, + ass_path: Path, + clip_start: float, + segments: list[dict], + 
analyses: list[dict], +) -> None: + """Write ASS with per-segment alignment, color, and font-size overrides.""" + events: list[dict] = [] + + # Word-level events + for seg in transcript.get("segments", []): + for w in seg.get("words", []): + t0 = max(0.0, float(w.get("start", 0)) - clip_start) + t1 = max(0.0, float(w.get("end", 0)) - clip_start) + text = w.get("word", w.get("text", "")).strip() + if text and t1 > 0: + events.append({"start": t0, "end": max(t1, t0 + 0.08), "text": text}) + + # Sentence-level fallback (split into 3-word chunks) + if not events: + for seg in transcript.get("segments", []): + t0 = max(0.0, float(seg.get("start", 0)) - clip_start) + t1 = max(0.0, float(seg.get("end", 0)) - clip_start) + text = seg.get("text", "").strip() + if not text or t1 <= 0: + continue + wlist = text.split() + chunk = 3 + n_ch = max(1, (len(wlist) + chunk - 1) // chunk) + dur = (t1 - t0) / n_ch + for j in range(n_ch): + events.append({ + "start": t0 + j * dur, + "end": t0 + (j + 1) * dur, + "text": " ".join(wlist[j * chunk:(j + 1) * chunk]), + }) + + def get_an(t: float) -> dict: + for seg, an in zip(segments, analyses): + if seg["start"] <= t < seg["end"]: + return an + return analyses[-1] if analyses else {} + + lines = [ + "[Script Info]", + "ScriptType: v4.00+", + "PlayResX: 1080", + "PlayResY: 1920", + "ScaledBorderAndShadow: yes", + "", + "[V4+ Styles]", + "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, " + "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, " + "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, " + "Alignment, MarginL, MarginR, MarginV, Encoding", + "Style: Default,Impact,90,&H00FFFFFF,&H0000FFFF,&H00000000,&H80000000," + "-1,0,0,0,100,100,0,0,1,4,0,2,40,40,200,1", + "", + "[Events]", + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text", + ] + + for ev in events: + an = get_an(ev["start"]) + color = _ASS_COLORS.get(an.get("subtitle_color", "white"), "&H00FFFFFF") + pos 
= an.get("subtitle_position", "bottom") + energy = an.get("energy_level", "medium") + moment = an.get("moment_type", "context") + + alignment = 8 if pos == "top" else 2 + margin_v = 120 if pos == "top" else 200 + fs = (108 if energy == "high" or moment in ("hook", "punchline") + else 80 if energy == "low" else 92) + + # Pop animation: start 130% scale, shrink to 100% in 120ms + pop = "{\\fscx130\\fscy130\\t(0,120,\\fscx100\\fscy100)}" + tag = f"{{\\an{alignment}\\1c{color}&\\fs{fs}\\b1}}{pop}" + + lines.append( + f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])}," + f"Default,,0,0,{margin_v},,{tag}{ev['text'].upper()}" + ) + + ass_path.write_text("\n".join(lines), encoding="utf-8") + logger.debug(f"ASS: {len(events)} events across {len(segments)} segments") + + +# ─── Emoji ───────────────────────────────────────────────────────────────────── + +def _get_emoji(clip_data: dict, analyses: list[dict] | None = None) -> str: + if analyses: + energy_rank = {"high": 3, "medium": 2, "low": 1} + best = max(analyses, key=lambda a: energy_rank.get(a.get("energy_level", "low"), 1)) + moment_emoji = { + "hook": "🔥", "punchline": "😂", "reaction": "😲", + "context": "💡", "transition": "✨", + } + if emoji := moment_emoji.get(best.get("moment_type", "")): + return emoji + + a = clip_data.get("vision_analysis", {}) + emotion = a.get("emotion", "excited") + action = a.get("action_type", "entertainment") + transcript_text = clip_data.get("transcript_text", "") + if transcript_text: + try: + from src.analysis.vision import get_emoji_for_scene + return get_emoji_for_scene(transcript_text, emotion, action) + except Exception: + pass + + fb = {"happy": "😄", "excited": "🔥", "funny": "😂", "surprised": "😲", + "gaming": "🎮", "tutorial": "📚", "angry": "😤", "sad": "😢"} + return fb.get(emotion, fb.get(action, "⚡")) + + +# ─── Final render ───────────────────────────────────────────────────────────── + +def _render_final( + video_path: Path, + ass_path: Path, + emoji: str, + output_path: Path, +) 
-> None: + ass_str = str(ass_path).replace("\\", "/").replace(":", "\\:") + emoji_filter = ( + f"drawtext=text='{emoji}':fontsize=80:x=w-100:y=50" + f":enable='between(t\\,0\\,3)'" + ) + vf = f"ass='{ass_str}',{emoji_filter}" + + cmd = [ + "ffmpeg", "-y", "-i", str(video_path), + "-vf", vf, "-c:v", "libx264", "-c:a", "copy", + "-movflags", "+faststart", str(output_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + if result.returncode != 0: + cmd2 = [ + "ffmpeg", "-y", "-i", str(video_path), + "-vf", f"ass='{ass_str}'", + "-c:v", "libx264", "-c:a", "copy", str(output_path), + ] + result2 = subprocess.run(cmd2, capture_output=True, text=True, timeout=300) + if result2.returncode != 0: + logger.error(f"HRE render failed: {result2.stderr[-300:]}") + return + logger.info(f"HRE render complete → {output_path.name}") + + +# ─── Main pipeline ──────────────────────────────────────────────────────────── + +def apply_hre( + clip_path: Path, + clip_data: dict, + transcript: dict, + output_path: Path, +) -> Path: + """Apply per-segment AI-driven HRE: each 3-5s chunk gets its own zoom + subtitle style.""" + output_path.parent.mkdir(parents=True, exist_ok=True) + clip_start = clip_data.get("start", 0.0) + + with tempfile.TemporaryDirectory() as _tmp: + tmp_dir = Path(_tmp) + tmp_zoomed = tmp_dir / "zoomed.mp4" + + w, h = _probe_dimensions(clip_path) + duration = _probe_duration(clip_path) + if duration <= 0: + duration = float(clip_data.get("end", clip_start + 30)) - clip_start + has_audio = _has_audio_stream(clip_path) + + # 1. Segment at speech pauses + segments = _segment_clip(duration, transcript, clip_start) + n = len(segments) + logger.info( + f"HRE clip {clip_data.get('index', '?')}: " + f"{duration:.1f}s → {n} segments (AI analyzing each)" + ) + + # 2. 
Qwen2.5-VL analyzes each segment + analyses = [ + _analyze_segment(clip_path, seg, i, n, transcript, clip_start, tmp_dir) + for i, seg in enumerate(segments) + ] + + for i, (seg, an) in enumerate(zip(segments, analyses)): + logger.info( + f" [{seg['start']:.1f}s-{seg['end']:.1f}s] " + f"zoom={an.get('zoom_direction')}({an.get('zoom_speed')}) " + f"sub={an.get('subtitle_position')}/{an.get('subtitle_color')} " + f"type={an.get('moment_type')} energy={an.get('energy_level')}" + ) + + # 3. Per-segment zoom via filter_complex + zoomed = _apply_per_segment_zoom( + clip_path, segments, analyses, w, h, tmp_zoomed, has_audio=has_audio + ) + + # 4. Per-segment ASS subtitles + ass_path = output_path.with_suffix(".ass") + _generate_per_segment_subtitles(transcript, ass_path, clip_start, segments, analyses) + + # 5. Emoji from highest-energy segment + emoji = _get_emoji(clip_data, analyses) + + # 6. Render + _render_final(zoomed, ass_path, emoji, output_path) + + return output_path diff --git a/backend/src/processing/subtitle.py b/backend/src/processing/subtitle.py new file mode 100644 index 0000000000000000000000000000000000000000..6e5aac52d82ccb16b4efaecd4e651a8aca28e252 --- /dev/null +++ b/backend/src/processing/subtitle.py @@ -0,0 +1,291 @@ +"""Generate ASS subtitles using pysubs2. + +Supports: word-by-word, sentence, karaoke, fade, pop, typewriter animations. +Full ASS spec: font, size, 4-color layers, border, shadow, position, alignment. +Handles Thai/Chinese character-level splitting. 
+""" +from pathlib import Path +from typing import Optional +import pysubs2 +from pysubs2 import SSAFile, SSAEvent, SSAStyle +from loguru import logger + +# Languages that split by character rather than word +CHAR_LEVEL_LANGUAGES = {"th", "zh", "ja", "km", "lo"} + +# Default font per language +DEFAULT_FONTS = { + "th": "Noto Sans Thai", + "zh": "Noto Sans SC", + "zh-tw": "Noto Sans TC", + "ja": "Noto Sans JP", + "ko": "Noto Sans KR", + "en": "Montserrat", + "default": "Noto Sans", +} + +# Animation presets (ASS override tags) +def _fade_tags(fade_in_ms: int = 200, fade_out_ms: int = 200) -> str: + return f"{{\\fade({fade_in_ms},{fade_out_ms})}}" + +def _pop_tags() -> str: + return "{\\t(0,100,\\fscx120\\fscy120)\\t(100,200,\\fscx100\\fscy100)}" + +def _typewriter_per_char(char: str, delay_ms: int) -> str: + return f"{{\\alpha&HFF&\\t({delay_ms},{delay_ms+80},\\alpha&H00&)}}{char}" + +def _bounce_tags() -> str: + return "{\\t(0,150,\\frz-5)\\t(150,300,\\frz5)\\t(300,400,\\frz0)}" + + +def _color_to_ass(hex_color: str, alpha: int = 0) -> str: + """Convert #RRGGBB hex to ASS &HAABBGGRR format.""" + hex_color = hex_color.lstrip("#") + if len(hex_color) == 6: + r, g, b = hex_color[0:2], hex_color[2:4], hex_color[4:6] + else: + r, g, b = "FF", "FF", "FF" + aa = f"{alpha:02X}" + return f"&H{aa}{b}{g}{r}" + + +def build_style( + font_family: str = "Noto Sans", + font_size: int = 72, + primary_color: str = "#FFFFFF", + secondary_color: str = "#FFFF00", + outline_color: str = "#000000", + shadow_color: str = "#000000", + primary_alpha: int = 0, + outline_alpha: int = 0, + shadow_alpha: int = 80, + bold: bool = True, + italic: bool = False, + underline: bool = False, + outline_size: float = 4.0, + shadow_size: float = 2.0, + alignment: int = 2, # 2=bottom-center, 8=top-center + margin_l: int = 40, + margin_r: int = 40, + margin_v: int = 250, + scale_x: int = 100, + scale_y: int = 100, + spacing: float = 0.0, + angle: float = 0.0, +) -> SSAStyle: + style = SSAStyle() + 
style.fontname = font_family + style.fontsize = font_size + style.primarycolor = pysubs2.Color(*_hex_to_rgba(primary_color, primary_alpha)) + style.secondarycolor = pysubs2.Color(*_hex_to_rgba(secondary_color, 0)) + style.outlinecolor = pysubs2.Color(*_hex_to_rgba(outline_color, outline_alpha)) + style.backcolor = pysubs2.Color(*_hex_to_rgba(shadow_color, shadow_alpha)) + style.bold = bold + style.italic = italic + style.underline = underline + style.outline = outline_size + style.shadow = shadow_size + style.alignment = alignment + style.marginl = margin_l + style.marginr = margin_r + style.marginv = margin_v + style.scalex = scale_x + style.scaley = scale_y + style.spacing = spacing + style.angle = angle + style.borderstyle = 1 # outline + shadow + return style + + +def _hex_to_rgba(hex_color: str, alpha_0_255: int = 0): + """Convert #RRGGBB to (R, G, B, A) where A=0 is opaque.""" + hex_color = hex_color.lstrip("#") + if len(hex_color) == 6: + r = int(hex_color[0:2], 16) + g = int(hex_color[2:4], 16) + b = int(hex_color[4:6], 16) + else: + r, g, b = 255, 255, 255 + return r, g, b, alpha_0_255 + + +def generate_subtitles( + transcript: dict, + output_path: Path, + style_config: dict, + clip_start_offset: float = 0.0, +) -> Path: + """Generate .ass subtitle file from transcript. 
+ + Args: + transcript: Output from whisper.py + output_path: Where to save the .ass file + style_config: Dict with font/color/animation settings from frontend + clip_start_offset: Shift all timestamps (for sub-clips from longer video) + """ + subs = SSAFile() + subs.info["PlayResX"] = "1080" + subs.info["PlayResY"] = "1920" + subs.info["ScaledBorderAndShadow"] = "yes" + subs.info["WrapStyle"] = "0" + + display_mode = style_config.get("display_mode", "word") # "word" or "sentence" + animation = style_config.get("animation", "none") # none|fade|karaoke|pop|typewriter|bounce + subtitle_lang = style_config.get("subtitle_language", "en") + char_level = transcript.get("char_level", False) or subtitle_lang in CHAR_LEVEL_LANGUAGES + + font_family = style_config.get("font_family") or DEFAULT_FONTS.get(subtitle_lang, DEFAULT_FONTS["default"]) + + style = build_style( + font_family=font_family, + font_size=style_config.get("font_size", 72), + primary_color=style_config.get("primary_color", "#FFFFFF"), + secondary_color=style_config.get("secondary_color", "#FFFF00"), + outline_color=style_config.get("outline_color", "#000000"), + shadow_color=style_config.get("shadow_color", "#000000"), + primary_alpha=style_config.get("primary_alpha", 0), + outline_alpha=style_config.get("outline_alpha", 0), + shadow_alpha=style_config.get("shadow_alpha", 80), + bold=style_config.get("bold", True), + italic=style_config.get("italic", False), + underline=style_config.get("underline", False), + outline_size=style_config.get("outline_size", 4.0), + shadow_size=style_config.get("shadow_size", 2.0), + alignment=style_config.get("alignment", 2), + margin_l=style_config.get("margin_l", 40), + margin_r=style_config.get("margin_r", 40), + margin_v=style_config.get("margin_v", 250), + scale_x=style_config.get("scale_x", 100), + scale_y=style_config.get("scale_y", 100), + spacing=style_config.get("spacing", 0.0), + angle=style_config.get("angle", 0.0), + ) + subs.styles["Default"] = style + + segments 
= transcript.get("segments", []) + + for seg in segments: + words = seg.get("words", []) + seg_end = seg["end"] - clip_start_offset + if seg_end <= 0: + continue # segment ends before clip starts — skip entirely + + seg_start = max(0.0, seg["start"] - clip_start_offset) + + if display_mode == "sentence" or not words: + _add_sentence_event(subs, seg["text"], seg_start, seg_end, animation, style_config) + else: + if animation == "karaoke": + _add_karaoke_line(subs, words, seg_start, seg_end, clip_start_offset, char_level) + else: + _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_start_offset) + + output_path.parent.mkdir(parents=True, exist_ok=True) + subs.save(str(output_path), encoding="utf-8") + logger.info(f"Generated {len(subs)} subtitle events → {output_path.name}") + return output_path + + +def _add_sentence_event(subs, text, start, end, animation, style_config): + tags = "" + if animation == "fade": + fi = style_config.get("fade_in_ms", 200) + fo = style_config.get("fade_out_ms", 200) + tags = _fade_tags(fi, fo) + elif animation == "pop": + tags = _pop_tags() + elif animation == "bounce": + tags = _bounce_tags() + + event = SSAEvent( + start=pysubs2.make_time(s=start), + end=pysubs2.make_time(s=end), + text=tags + text.strip(), + ) + subs.append(event) + + +def _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_offset=0.0): + """Add one SSAEvent per word (word-by-word mode).""" + unit_list = [] + for w in words: + if char_level: + for ch in w["word"]: + unit_list.append({"word": ch, "start": w["start"], "end": w["end"]}) + else: + unit_list.append(w) + + for i, unit in enumerate(unit_list): + start = unit["start"] - clip_offset + end = (unit["end"] - clip_offset) if unit["end"] > unit["start"] else start + 0.3 + if start < 0: + continue + + tags = "" + if animation == "fade": + fi = style_config.get("fade_in_ms", 150) + fo = style_config.get("fade_out_ms", 100) + tags = 
_fade_tags(fi, fo) + elif animation == "pop": + tags = _pop_tags() + elif animation == "typewriter": + delay = int((start - seg_start) * 1000) + tags = _typewriter_per_char("", delay) + + event = SSAEvent( + start=pysubs2.make_time(s=start), + end=pysubs2.make_time(s=end), + text=tags + unit["word"].strip(), + ) + subs.append(event) + + +def _add_karaoke_line(subs, words, seg_start, seg_end, clip_offset, char_level): + """Add karaoke-style line: full line visible, words highlight in sequence.""" + karaoke_text = "" + for w in words: + duration_cs = int((w["end"] - w["start"]) * 100) + word_text = w["word"].strip() + if char_level: + for ch in word_text: + karaoke_text += f"{{\\kf{duration_cs // max(len(word_text), 1)}}}{ch}" + else: + karaoke_text += f"{{\\kf{duration_cs}}}{word_text} " + + event = SSAEvent( + start=pysubs2.make_time(s=seg_start), + end=pysubs2.make_time(s=seg_end), + text=karaoke_text.strip(), + ) + subs.append(event) + + +def update_subtitle_event( + ass_path: Path, + event_index: int, + updates: dict, +) -> Path: + """Update a single subtitle event (for editor patches).""" + subs = SSAFile.load(str(ass_path)) + if event_index >= len(subs): + raise IndexError(f"Event index {event_index} out of range") + + evt = subs[event_index] + if "text" in updates: + evt.text = updates["text"] + if "start" in updates: + evt.start = pysubs2.make_time(s=updates["start"]) + if "end" in updates: + evt.end = pysubs2.make_time(s=updates["end"]) + + subs.save(str(ass_path), encoding="utf-8") + return ass_path + + +def apply_global_style_override(ass_path: Path, style_config: dict) -> Path: + """Re-apply global style overrides to all events (for live preview).""" + subs = SSAFile.load(str(ass_path)) + new_style = build_style(**{k: v for k, v in style_config.items() if k in build_style.__code__.co_varnames}) + subs.styles["Default"] = new_style + subs.save(str(ass_path), encoding="utf-8") + return ass_path diff --git a/backend/src/transcription/__init__.py 
b/backend/src/transcription/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/backend/src/transcription/whisper.py b/backend/src/transcription/whisper.py new file mode 100644 index 0000000000000000000000000000000000000000..f9b3d2424d0149598b94195f13e88104d3571547 --- /dev/null +++ b/backend/src/transcription/whisper.py @@ -0,0 +1,234 @@ +"""Speech-to-text using insanely-fast-whisper with ROCm support. + +Word-level timestamps for subtitle generation. +Supports transcription (same language) and translation (→ English then to target). +""" +import asyncio +import subprocess +import json +import os +from pathlib import Path +from typing import Optional +from loguru import logger + +# Language codes supported by Whisper +WHISPER_LANGUAGES = { + "thai": "th", + "english": "en", + "chinese": "zh", + "japanese": "ja", + "korean": "ko", + "french": "fr", + "german": "de", + "spanish": "es", + "portuguese": "pt", + "russian": "ru", + "arabic": "ar", + "hindi": "hi", + "vietnamese": "vi", + "indonesian": "id", + "malay": "ms", +} + +# Languages that need character-level splitting (no word spaces) +CHAR_LEVEL_LANGUAGES = {"th", "zh", "ja", "km", "lo", "my"} + + +def extract_audio(video_path: Path, audio_path: Path) -> Path: + """Extract mono 16kHz audio from video using ffmpeg.""" + cmd = [ + "ffmpeg", "-y", "-i", str(video_path), + "-ac", "1", "-ar", "16000", + "-vn", "-f", "wav", str(audio_path) + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"ffmpeg audio extraction failed: {result.stderr}") + return audio_path + + +def transcribe( + audio_path: Path, + clip_language: str = "auto", + subtitle_language: str = "en", + model_size: str = "large-v3", + device: str = "cuda", + batch_size: int = 16, +) -> dict: + """Transcribe audio and return word-level timestamps. 
+ + Returns: + { + "text": str, + "segments": [{"start": float, "end": float, "text": str, "words": [...]}], + "language": str, + "char_level": bool, + } + """ + clip_lang_code = WHISPER_LANGUAGES.get(clip_language.lower(), None) + sub_lang_code = WHISPER_LANGUAGES.get(subtitle_language.lower(), "en") + + # Determine whisper task + task = "transcribe" + if clip_lang_code and sub_lang_code and clip_lang_code != sub_lang_code: + if sub_lang_code == "en": + task = "translate" # Whisper built-in translate → English + else: + task = "transcribe" # Non-English targets keep transcription in the selected language. + + logger.info(f"Whisper: task={task}, clip_lang={clip_lang_code}, sub_lang={sub_lang_code}, model={model_size}") + + try: + from transformers import pipeline + import torch + + # AMD ROCm: float16 triggers HIPBLAS_STATUS_INTERNAL_ERROR on some models. + # Use float32 for stability; bfloat16 as middle ground if available. + if device == "cuda": + try: + name = torch.cuda.get_device_name(0).lower() + is_amd = any(k in name for k in ("amd", "radeon", "instinct", "mi")) + except Exception: + is_amd = True # default safe + dtype = torch.bfloat16 if is_amd else torch.float16 + else: + dtype = torch.float32 + + def _run_on_cpu(gk): + logger.warning("Whisper: running on CPU (GPU unavailable or OOM)") + pipe_cpu = pipeline( + "automatic-speech-recognition", + model=f"openai/whisper-{model_size}", + torch_dtype=torch.float32, + device="cpu", + ) + return pipe_cpu(str(audio_path), batch_size=1, + return_timestamps="word", generate_kwargs=gk) + + generate_kwargs = {"task": task} + if clip_lang_code: + generate_kwargs["language"] = clip_lang_code + + # Check free VRAM — if GPU is nearly full, go straight to CPU + use_gpu = device == "cuda" + if use_gpu: + try: + free_bytes = torch.cuda.mem_get_info(0)[0] + if free_bytes < 8 * 1024 ** 3: # < 8 GB free + logger.warning(f"Whisper: only {free_bytes/1024**3:.1f} GB free — using CPU") + use_gpu = False + except Exception: + pass 
+ + pipe = None + try: + if not use_gpu: + result = _run_on_cpu(generate_kwargs) + else: + pipe = pipeline( + "automatic-speech-recognition", + model=f"openai/whisper-{model_size}", + torch_dtype=dtype, + device=device, + model_kwargs={"attn_implementation": "sdpa"}, + ) + result = pipe(str(audio_path), batch_size=batch_size, + return_timestamps="word", generate_kwargs=generate_kwargs) + except (RuntimeError, Exception) as e: + err = str(e) + if any(k in err for k in ("HIPBLAS", "HIP", "out of memory", "OutOfMemory", "CUDA")): + logger.warning(f"GPU error in Whisper ({err[:120]}), retrying on CPU") + result = _run_on_cpu(generate_kwargs) + else: + raise + finally: + if pipe is not None: + del pipe + try: + torch.cuda.empty_cache() + except Exception: + pass + + segments = _build_segments(result, sub_lang_code) + char_level = sub_lang_code in CHAR_LEVEL_LANGUAGES + + return { + "text": result.get("text", ""), + "segments": segments, + "language": clip_lang_code or "auto", + "char_level": char_level, + "task": task, + } + + except ImportError: + logger.warning("transformers not available, using stub transcription") + return _stub_transcription(str(audio_path)) + + +def _build_segments(whisper_result: dict, target_lang: str) -> list: + """Convert Whisper output to segment list with word timestamps.""" + segments = [] + chunks = whisper_result.get("chunks", []) + + if not chunks: + # Fallback: single segment + return [{"start": 0.0, "end": 30.0, "text": whisper_result.get("text", ""), "words": []}] + + current_seg = {"start": None, "end": None, "text": "", "words": []} + SEGMENT_GAP = 1.5 # seconds gap to split into new segment + + for chunk in chunks: + ts = chunk.get("timestamp", [0, 0]) + start, end = (ts[0] or 0.0), (ts[1] or ts[0] or 0.0) + text = chunk.get("text", "").strip() + + if not text: + continue + + if current_seg["start"] is None: + current_seg["start"] = start + + if current_seg["words"] and start - current_seg["end"] > SEGMENT_GAP: + 
segments.append(current_seg) + current_seg = {"start": start, "end": end, "text": text, "words": []} + else: + current_seg["text"] += (" " if current_seg["text"] else "") + text + + current_seg["words"].append({"word": text, "start": start, "end": end}) + current_seg["end"] = end + + if current_seg["start"] is not None: + segments.append(current_seg) + + return segments + + +def _stub_transcription(audio_path: str) -> dict: + """Return minimal stub when Whisper is unavailable (dev/CPU mode).""" + return { + "text": "[Transcription not available — Whisper model not loaded]", + "segments": [{"start": 0.0, "end": 5.0, "text": "Sample subtitle", "words": [ + {"word": "Sample", "start": 0.0, "end": 0.5}, + {"word": "subtitle", "start": 0.6, "end": 1.0}, + ]}], + "language": "en", + "char_level": False, + "task": "transcribe", + } + + +async def transcribe_async( + audio_path: Path, + clip_language: str = "auto", + subtitle_language: str = "en", + model_size: str = "large-v3", + device: str = "cuda", +) -> dict: + """Async wrapper for transcribe.""" + loop = asyncio.get_event_loop() + from src.gpu.rocm_utils import get_optimal_batch_size + batch_size = get_optimal_batch_size("whisper") + return await loop.run_in_executor( + None, + lambda: transcribe(audio_path, clip_language, subtitle_language, model_size, device, batch_size) + ) diff --git a/deploy/setup_droplet.sh b/deploy/setup_droplet.sh new file mode 100644 index 0000000000000000000000000000000000000000..4b554491a98b7333d7f543b5b2a52dc30574ae35 --- /dev/null +++ b/deploy/setup_droplet.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# ElevenClip AI — Full AMD Droplet Setup Script +# Run once after fresh boot: bash /root/setup_droplet.sh +set -e + +LOG=/tmp/elevnclip_setup.log +exec > >(tee -a "$LOG") 2>&1 + +echo "=== ElevenClip AI Droplet Setup $(date) ===" + +# ── 1. Update repo ──────────────────────────────────────────────────────────── +echo "[1/5] Pulling latest code..." 
+cd /root/ElevenClip-AI +git pull origin master + +# ── 2. Python venv + pip install ───────────────────────────────────────────── +echo "[2/5] Installing Python dependencies..." +if [ ! -f /root/venv/bin/activate ]; then + python3 -m venv /root/venv +fi +source /root/venv/bin/activate +pip install --upgrade pip -q +pip install -r backend/requirements.txt -q +echo "PACKAGES_DONE" + +# ── 3. Start vLLM inside Docker container ──────────────────────────────────── +echo "[3/5] Starting vLLM with Qwen2.5-VL-7B-Instruct..." +docker start rocm 2>/dev/null || true +sleep 3 + +# Kill any stale vllm process +docker exec rocm bash -c "pkill -f 'vllm serve' 2>/dev/null || true" +sleep 2 + +# Start vLLM detached +docker exec -d rocm bash -c ' + vllm serve Qwen/Qwen2.5-VL-7B-Instruct \ + --port 8000 \ + --dtype float16 \ + --trust-remote-code \ + --max-model-len 4096 \ + --gpu-memory-utilization 0.7 \ + --limit-mm-per-prompt "image=3" \ + > /tmp/vllm.log 2>&1 +' +echo "vLLM started in background (downloading model, may take 5-15 min)" + +# ── 4. Start FastAPI backend on port 8080 ──────────────────────────────────── +echo "[4/5] Starting FastAPI backend on :8080..." +pkill -f "uvicorn backend.main" 2>/dev/null || true +sleep 1 + +cd /root/ElevenClip-AI +VLLM_BASE_URL=http://localhost:8000/v1 \ +VLLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct \ +WORK_DIR=/tmp/elevnclip \ +NEXT_PUBLIC_API_URL=http://localhost:8080 \ +nohup /root/venv/bin/uvicorn backend.main:app \ + --host 0.0.0.0 \ + --port 8080 \ + --workers 1 \ + --log-level info \ + > /tmp/fastapi.log 2>&1 & + +echo "FastAPI PID: $!" +echo "FASTAPI_STARTED" + +# ── 5. Poll vLLM health ─────────────────────────────────────────────────────── +echo "[5/5] Waiting for vLLM to load model..." +for i in $(seq 1 180); do + if curl -sf http://localhost:8000/health > /dev/null 2>&1; then + echo "vLLM READY after $((i * 5))s!" + echo "VLLM_READY" + break + fi + if [ $((i % 12)) -eq 0 ]; then + echo " Still loading... 
$((i * 5))s elapsed" + docker exec rocm bash -c "tail -3 /tmp/vllm.log 2>/dev/null" + fi + sleep 5 +done + +echo "" +echo "=== Setup complete! ===" +echo " FastAPI: http://129.212.178.101:8080" +echo " vLLM API: http://129.212.178.101:8000/v1" +echo " Logs: /tmp/fastapi.log | docker exec rocm cat /tmp/vllm.log" diff --git a/deploy/start_fastapi.sh b/deploy/start_fastapi.sh new file mode 100644 index 0000000000000000000000000000000000000000..e36c10f687e4f94f33d892d1a70b60c1a3911664 --- /dev/null +++ b/deploy/start_fastapi.sh @@ -0,0 +1,10 @@ +#!/bin/bash +pkill -f uvicorn 2>/dev/null +sleep 1 +cd /root/ElevenClip-AI/backend +export VLLM_BASE_URL=http://localhost:8000/v1 +export VLLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct +export WORK_DIR=/tmp/elevnclip +mkdir -p /tmp/elevnclip +nohup /root/venv/bin/uvicorn main:app --host 0.0.0.0 --port 8080 --workers 1 > /tmp/fastapi.log 2>&1 & +echo "FastAPI PID: $!" diff --git a/docker-compose.yml b/docker-compose.yml index 204e62d3addb6c4529e1c24a90033264c879730c..7ea3ae6fe7670a5daa0ce162bcaf28eff4bff5de 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,40 +1,34 @@ -services: - redis: - image: redis:7-alpine - ports: - - "6379:6379" +version: "3.9" +services: backend: build: - context: ./backend - args: - ROCM_PYTORCH_IMAGE: ${ROCM_PYTORCH_IMAGE:-rocm/pytorch:latest} - env_file: - - .env.example + context: . 
+ dockerfile: Dockerfile + ports: + - "7860:7860" + - "8001:8000" environment: - STORAGE_DIR: /app/data - REDIS_URL: redis://redis:6379/0 - FRONTEND_ORIGIN: http://localhost:5173 + - WORK_DIR=/tmp/elevnclip + - VLLM_BASE_URL=http://localhost:8000/v1 + - VLLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct + - VLLM_PORT=8000 + - VLLM_DOCKER_CONTAINER= + - NEXT_PUBLIC_API_URL=http://localhost:7860 volumes: - - ./backend/data:/app/data - ports: - - "8000:8000" - depends_on: - - redis + - /tmp/elevnclip:/tmp/elevnclip + - huggingface_cache:/root/.cache/huggingface + # AMD GPU passthrough for ROCm devices: - /dev/kfd - /dev/dri group_add: - video - ipc: host - shm_size: 16gb + security_opt: + - seccomp:unconfined + cap_add: + - SYS_PTRACE + restart: unless-stopped - frontend: - build: - context: ./frontend - environment: - VITE_API_BASE_URL: http://localhost:8000 - ports: - - "5173:5173" - depends_on: - - backend +volumes: + huggingface_cache: diff --git a/docs/AMD_CREDIT_RUNBOOK.md b/docs/AMD_CREDIT_RUNBOOK.md deleted file mode 100644 index 229b81009612972ba7f0408b0609ae0e1126c3c0..0000000000000000000000000000000000000000 --- a/docs/AMD_CREDIT_RUNBOOK.md +++ /dev/null @@ -1,115 +0,0 @@ -# AMD Credit Runbook - -Use this checklist as soon as AMD Developer Cloud credits are approved. - -## 1. Create Instance - -Target: - -- AMD Developer Cloud -- AMD Instinct MI300X -- ROCm 6.x image if available -- Enough disk for videos, model cache, and rendered clips - -## 2. Clone Repository - -```bash -git clone https://github.com/JakgritB/ElevenClip.AI.git -cd ElevenClip.AI -``` - -## 3. Configure Environment - -```bash -cp .env.example .env -``` - -Edit `.env`: - -```bash -DEMO_MODE=false -HF_TOKEN= -WHISPER_MODEL_ID=openai/whisper-large-v3 -QWEN_TEXT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct -QWEN_VL_MODEL_ID=Qwen/Qwen2-VL-7B-Instruct -FFMPEG_VIDEO_CODEC=h264_amf -``` - -## 4. 
Verify ROCm - -```bash -rocminfo | head -rocm-smi -``` - -Verify PyTorch: - -```bash -python - <<'PY' -import torch -print("cuda available:", torch.cuda.is_available()) -print("device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "none") -print("hip:", torch.version.hip) -PY -``` - -On ROCm, PyTorch still exposes AMD GPUs through the `torch.cuda` API. - -## 5. Start Backend And Frontend - -Docker path: - -```bash -docker compose build --build-arg INSTALL_EXTRAS=.[ai,rocm-inference] backend -docker compose up -``` - -Manual backend path: - -```bash -cd backend -python -m venv .venv -source .venv/bin/activate -pip install -e ".[ai,rocm-inference]" -uvicorn app.main:app --host 0.0.0.0 --port 8000 -``` - -Manual frontend path: - -```bash -cd frontend -npm install -npm run dev -- --host 0.0.0.0 -``` - -## 6. Run Benchmark - -CPU baseline: - -```bash -DEMO_MODE=false HIP_VISIBLE_DEVICES= python scripts/benchmark.py --youtube-url "" --language Thai --style informative --niche education --clip-length 60 -``` - -AMD GPU: - -```bash -DEMO_MODE=false python scripts/benchmark.py --youtube-url "" --language Thai --style informative --niche education --clip-length 60 -``` - -Save the JSON outputs into: - -```text -data/benchmarks/cpu.json -data/benchmarks/mi300x.json -``` - -## 7. Update Submission Materials - -After the benchmark: - -- Update `README.md`. -- Update `docs/SUBMISSION.md`. -- Update `docs/PITCH_DECK.md`. -- Update Hugging Face Space. -- Record the final demo video. - diff --git a/docs/BUILD_IN_PUBLIC.md b/docs/BUILD_IN_PUBLIC.md deleted file mode 100644 index 509e852a381fc77af2d5df778027e7dccd3cd6f2..0000000000000000000000000000000000000000 --- a/docs/BUILD_IN_PUBLIC.md +++ /dev/null @@ -1,69 +0,0 @@ -# Build In Public Drafts - -The hackathon has an optional Build in Public challenge. Use these as draft posts and adjust the tone before publishing. 
- -## Post 1 - Project Start - -### X / Twitter - -Building ElevenClip.AI for the AMD Developer Hackathon. - -It turns long-form videos into short-form clips with a human-AI editing loop: - -- Whisper Large V3 for transcription -- Qwen2.5 for highlight scoring -- Qwen2-VL for visual signals -- ROCm + AMD MI300X target deployment - -Creators should not need to watch a 2-hour video just to find 10 good clips. - -@lablab @AIatAMD - -### LinkedIn - -I am building ElevenClip.AI for the AMD Developer Hackathon. - -The project is an AI clip studio that helps creators turn long-form videos into short-form clips for TikTok, YouTube Shorts, and Instagram Reels. The core workflow combines Whisper transcription, Qwen highlight detection, optional Qwen2-VL visual understanding, ffmpeg rendering, and a human-in-the-loop editor. - -The production target is AMD Developer Cloud with ROCm and AMD Instinct MI300X, because long-form video processing needs high-throughput model inference and fast rendering. - -## Post 2 - Technical Update - -### X / Twitter - -Technical update on ElevenClip.AI: - -The local MVP now has: - -- FastAPI backend -- React clip editor -- Channel profile inputs -- Upload/YouTube pipeline -- Mock transcript/highlight path for demo mode -- Clip cards with trim, subtitle edit, regenerate, approve, download -- Hugging Face Space live under the hackathon org - -Next: run the real Whisper + Qwen pipeline on AMD Developer Cloud with ROCm and benchmark CPU vs MI300X. - -@lablab @AIatAMD - -### LinkedIn - -Technical update for ElevenClip.AI: - -The MVP now has a FastAPI backend, React editor, channel profile setup, upload/YouTube input, transcript and highlight output, clip generation, and a human review interface. I also published a Hugging Face Space under the AMD Developer Hackathon organization. 
- -The next milestone is the real AMD cloud run: Whisper Large V3 on ROCm PyTorch, Qwen2.5 through a ROCm-compatible serving path, and benchmark logs comparing CPU and AMD Instinct MI300X performance. - -## AMD Feedback Notes - -Fill this after using AMD Developer Cloud: - -- What was easy: -- What was confusing: -- ROCm setup notes: -- PyTorch/Transformers compatibility notes: -- vLLM ROCm notes: -- Benchmark result: -- Suggestion for AMD Developer Cloud documentation: - diff --git a/docs/DEMO_SCRIPT.md b/docs/DEMO_SCRIPT.md deleted file mode 100644 index 9d976cf2e8766c9bc0988be06a028d35fe79339d..0000000000000000000000000000000000000000 --- a/docs/DEMO_SCRIPT.md +++ /dev/null @@ -1,104 +0,0 @@ -# Demo Recording Script - -The final submission should use the AMD GPU version if credits arrive in time. Record the draft now so the story, screen flow, and timing are ready. - -## Draft Demo Before AMD Credits - -Target length: 2-3 minutes. - -### Scene 1 - Problem - -Show the title slide or Hugging Face Space. - -Narration: - -"Long-form creators need short clips for TikTok, Shorts, and Reels, but finding the best moments, trimming them, and adding subtitles can take hours." - -### Scene 2 - Channel Profile - -Open the local web app at `http://localhost:5173`. - -Set: - -- Niche: `education`, `gaming`, or `podcast` -- Clip style: `funny` or `informative` -- Clip length: `60` -- Language: `Thai` -- Platform: `tiktok` - -Narration: - -"ElevenClip.AI starts with a reusable channel profile, so highlight selection matches the creator's niche, language, style, and target platform." - -### Scene 3 - Video Input - -Upload a short MP4 file or paste a YouTube URL. - -Narration: - -"The app accepts uploaded files or YouTube URLs. In production, the backend downloads the source, transcribes it, ranks highlights, and renders clips." - -### Scene 4 - AI Pipeline - -Click Start Pipeline and show progress. 
- -Narration: - -"The AMD version runs Whisper Large V3 for transcription, Qwen2.5 for profile-aware highlight scoring, optional Qwen2-VL for visual signals, and ffmpeg for clip generation." - -### Scene 5 - Human-AI Editor - -Show generated clips. - -Edit a subtitle line, change start/end, approve a clip, and click download. - -Narration: - -"AI creates the first pass, but the creator stays in control. They can trim clips, edit subtitles, delete weak clips, regenerate a specific clip, approve, and download." - -### Scene 6 - AMD GPU Plan - -Show README or benchmark placeholder. - -Narration: - -"Once AMD Developer Cloud credits are active, the same pipeline runs on ROCm and AMD Instinct MI300X. We will benchmark CPU versus AMD GPU and target ten subtitled clips from a two-hour video in under ten minutes." - -## Final Demo After AMD Credits - -Target length: 3-4 minutes. - -Add these shots to the draft demo: - -1. AMD Developer Cloud instance page. -2. Terminal showing ROCm/GPU visibility: - -```bash -rocminfo | head -python - <<'PY' -import torch -print(torch.cuda.is_available()) -print(torch.cuda.get_device_name(0)) -print(torch.version.hip) -PY -``` - -3. Backend running with `DEMO_MODE=false`. -4. Benchmark command: - -```bash -python scripts/benchmark.py --youtube-url "" --language Thai --style informative --niche education --clip-length 60 -``` - -5. Timing JSON showing `input`, `transcription`, `highlight_detection`, `clip_generation`, and `total`. -6. Final clips shown in the editor. - -## Recording Checklist - -- Browser zoom at 100%. -- Use a short clean video for draft demo. -- Hide private tokens and email inboxes. -- Keep terminal text large enough to read. -- Record in 1080p or higher. -- End with GitHub URL, Hugging Face Space URL, and AMD/ROCm benchmark result. 
- diff --git a/docs/PITCH_DECK.md b/docs/PITCH_DECK.md deleted file mode 100644 index 56376cd1d9c0a561767f1186fef5371d2bd3d8ff..0000000000000000000000000000000000000000 --- a/docs/PITCH_DECK.md +++ /dev/null @@ -1,111 +0,0 @@ -# Pitch Deck Outline - -Use this as the slide plan for the required presentation deck. - -## Slide 1 - Title - -ElevenClip.AI - -AI clip studio for turning long-form videos into personalized short-form clips. - -Include: - -- AMD Developer Hackathon -- Track 3 - Vision & Multimodal AI -- GitHub URL -- Hugging Face Space URL - -## Slide 2 - Problem - -Long-form creators need short-form distribution, but editing clips manually is slow. - -Key points: - -- Two-hour videos can take hours to review. -- Good clips depend on audience, niche, tone, and platform. -- Subtitles and vertical export add repetitive work. - -## Slide 3 - Solution - -ElevenClip.AI automates the first editing pass. - -Workflow: - -Video input -> Whisper transcript -> Qwen highlight scoring -> ffmpeg clip rendering -> human review/editor -> downloads - -## Slide 4 - Product Demo - -Show screenshots or short GIFs of: - -- Channel profile -- Pipeline progress -- Transcript/highlights -- Clip editor -- Approved/downloaded clips - -## Slide 5 - AI Architecture - -Model roles: - -- Whisper Large V3: multilingual transcription, including Thai. -- Qwen2.5-7B-Instruct: profile-aware highlight detection. -- Qwen2-VL-7B-Instruct: visual reactions, scene changes, and on-screen text. -- ffmpeg: subtitle burn-in and platform export. - -## Slide 6 - AMD + ROCm - -Why AMD matters: - -- Long videos need high-throughput inference. -- MI300X memory helps with large models and long transcripts. -- ROCm + PyTorch enables Whisper inference. -- vLLM ROCm enables faster Qwen serving. - -## Slide 7 - Benchmark - -Replace placeholders after cloud credits arrive. 
- -| Run | Hardware | Total Time | Clips | -| --- | --- | ---: | ---: | -| CPU baseline | CPU | TBD | 10 | -| AMD GPU | MI300X + ROCm | TBD | 10 | - -Goal: 2-hour video -> 10 subtitled clips in under 10 minutes on MI300X. - -## Slide 8 - Business Value - -Target users: - -- YouTubers -- Podcasters -- Educators -- Streamers -- Agencies -- Brand marketing teams - -Value: - -- Save editing time. -- Increase short-form output. -- Keep creator control. -- Support multilingual creators. - -## Slide 9 - What We Built - -Current MVP: - -- FastAPI backend -- React editor -- YouTube/upload input -- Demo pipeline -- Clip rendering and subtitles -- Hugging Face Space -- AMD deployment plan - -Next: - -- Real Whisper + Qwen on MI300X -- Qwen2-VL frame analysis -- Benchmark table -- Better subtitle styling presets - diff --git a/docs/SUBMISSION.md b/docs/SUBMISSION.md deleted file mode 100644 index ba6200d92176d62a77249680acc0dba2cd8ff3f5..0000000000000000000000000000000000000000 --- a/docs/SUBMISSION.md +++ /dev/null @@ -1,97 +0,0 @@ -# ElevenClip.AI Hackathon Submission Pack - -Use this as the source text for the lablab.ai project submission. - -## Basic Information - -### Project Title - -ElevenClip.AI - -### Short Description - -AI clip studio that turns long-form videos into personalized short-form clips with Whisper, Qwen, ROCm, and AMD MI300X. - -### Track - -Track 3 - Vision & Multimodal AI - -### Technology Tags - -AMD Developer Cloud, AMD Instinct MI300X, ROCm, Hugging Face, Qwen, Whisper, FastAPI, React, ffmpeg, multimodal AI, video AI, Thai language, creator tools - -## Long Description - -ElevenClip.AI helps creators convert long-form videos into short-form clips for TikTok, YouTube Shorts, and Instagram Reels. 
The app takes a YouTube URL or uploaded video, transcribes it with Whisper Large V3, uses Qwen2.5 to score the best highlight moments based on the creator's channel profile, optionally adds visual signals with Qwen2-VL, and renders vertical clips with subtitles through ffmpeg. - -The core idea is human-AI collaborative editing. AI creates the first pass quickly, but creators still control the final result. After the pipeline generates clips, the user can trim start and end times, edit subtitle text, delete weak clips, regenerate specific clips, approve final clips, and download the outputs. - -The project is built for AMD Developer Hackathon Track 3 because it combines audio transcription, transcript reasoning, video understanding, and rendered media outputs. The production target is AMD Developer Cloud with ROCm and AMD Instinct MI300X. MI300X acceleration is especially relevant because the workflow needs to process long videos, run large multilingual models, and generate multiple clips fast enough for creator workflows. - -## Problem - -Creators often publish long-form content but still need short clips for discovery platforms. Manually finding the best moments in a two-hour video, trimming clips, writing subtitles, reframing to vertical, and exporting multiple MP4 files can take hours. - -Generic clipping tools miss the creator's style. A funny gaming channel and an educational podcast do not choose highlights the same way. ElevenClip.AI uses a reusable channel profile so highlight detection can adapt to niche, style, language, target platform, and preferred clip length. - -## Solution - -1. Set a channel profile once. -2. Paste a YouTube URL or upload a video. -3. Transcribe speech with Whisper Large V3. -4. Use Qwen2.5 to score transcript segments for engagement potential. -5. Optionally use Qwen2-VL for visual highlights such as reactions, scene changes, and on-screen text. -6. Render short-form clips with subtitles using ffmpeg. -7. 
Let the human editor trim, edit subtitles, regenerate, approve, and download. - -## AMD + ROCm Usage - -The intended production pipeline runs on AMD Developer Cloud: - -- AMD Instinct MI300X for high-throughput inference. -- ROCm 6.x as the GPU compute stack. -- PyTorch ROCm for Whisper Large V3 transcription. -- vLLM ROCm backend for Qwen2.5 highlight analysis. -- Hugging Face model hub for Whisper and Qwen models. -- ffmpeg hardware acceleration hooks where available. - -The repo includes a local demo mode so the interface and API can be tested before cloud credits arrive. Once AMD credits are active, `DEMO_MODE=false` enables the real model path and benchmark collection. - -## Hugging Face Space - -Public Space: - -https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI - -The Space is published under the event organization and acts as the public demo/landing page. The full app source code is available on GitHub. - -## GitHub Repository - -https://github.com/JakgritB/ElevenClip.AI - -## Demo Video Plan - -The final demo video should be recorded twice: - -1. Draft demo before AMD credits: show the local app, end-to-end UI, clip editor, and project concept. -2. Final demo after AMD credits: show the same flow plus AMD Developer Cloud, ROCm/MI300X GPU detection, real model inference, and benchmark results. - -## Benchmark Placeholder - -Replace this table after the AMD run. 
- -| Run | Video Length | Clips | Hardware | Total Time | Notes | -| --- | --- | ---: | --- | ---: | --- | -| Draft demo | 2-3 min | 5 | Local CPU demo mode | TBD | UI and workflow validation | -| CPU baseline | 2 hr | 10 | CPU | TBD | Real model stack, GPU hidden | -| AMD GPU | 2 hr | 10 | AMD Instinct MI300X + ROCm | TBD | Target: under 10 minutes | - -## Judging Criteria Mapping - -| Criterion | How ElevenClip.AI Addresses It | -| --- | --- | -| Application of Technology | Integrates Whisper, Qwen2.5, optional Qwen2-VL, Hugging Face, ROCm, and AMD MI300X target deployment. | -| Business Value | Solves a real creator workflow: turning long videos into platform-ready clips with subtitles and human editing. | -| Originality | Uses creator profile personalization and multilingual support rather than generic highlight detection. | -| Presentation | Demo shows before/after clips, editing controls, and CPU vs AMD GPU benchmark logs. | - diff --git a/docs/assets/cover.svg b/docs/assets/cover.svg deleted file mode 100644 index 04bb390007022deb62c90b5ca9224a849d940b76..0000000000000000000000000000000000000000 --- a/docs/assets/cover.svg +++ /dev/null @@ -1,54 +0,0 @@ - - ElevenClip.AI cover image - Hackathon cover artwork for an AI video clipping studio powered by AMD ROCm and Qwen. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ElevenClip.AI - AI clip studio for long-form video creators - - - - Whisper transcription - Qwen highlight detection - ROCm + AMD Instinct MI300X target - - - - Track 3 Multimodal AI - - Hugging Face - - - diff --git a/frontend/.gitignore b/frontend/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5ef6a520780202a1d6addd833d800ccb1ecac0bb --- /dev/null +++ b/frontend/.gitignore @@ -0,0 +1,41 @@ +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
+ +# dependencies +/node_modules +/.pnp +.pnp.* +.yarn/* +!.yarn/patches +!.yarn/plugins +!.yarn/releases +!.yarn/versions + +# testing +/coverage + +# next.js +/.next/ +/out/ + +# production +/build + +# misc +.DS_Store +*.pem + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# env files (can opt-in for committing if needed) +.env* + +# vercel +.vercel + +# typescript +*.tsbuildinfo +next-env.d.ts diff --git a/frontend/AGENTS.md b/frontend/AGENTS.md new file mode 100644 index 0000000000000000000000000000000000000000..8bd0e39085d5260e7f8faffcad2fdc45e10aef33 --- /dev/null +++ b/frontend/AGENTS.md @@ -0,0 +1,5 @@ + +# This is NOT the Next.js you know + +This version has breaking changes — APIs, conventions, and file structure may all differ from your training data. Read the relevant guide in `node_modules/next/dist/docs/` before writing any code. Heed deprecation notices. + diff --git a/frontend/CLAUDE.md b/frontend/CLAUDE.md new file mode 100644 index 0000000000000000000000000000000000000000..43c994c2d3617f947bcb5adf1933e21dabe46bb5 --- /dev/null +++ b/frontend/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/frontend/Dockerfile b/frontend/Dockerfile deleted file mode 100644 index f5b490e8c24876ce51e9a7bda2ab0b15599c8414..0000000000000000000000000000000000000000 --- a/frontend/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM node:22-alpine - -WORKDIR /app -COPY package.json package-lock.json* ./ -RUN npm install -COPY . . - -EXPOSE 5173 -CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0"] diff --git a/frontend/README.md b/frontend/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e215bc4ccf138bbc38ad58ad57e92135484b3c0f --- /dev/null +++ b/frontend/README.md @@ -0,0 +1,36 @@ +This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app). 
+ +## Getting Started + +First, run the development server: + +```bash +npm run dev +# or +yarn dev +# or +pnpm dev +# or +bun dev +``` + +Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. + +You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file. + +This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel. + +## Learn More + +To learn more about Next.js, take a look at the following resources: + +- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. +- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. + +You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome! + +## Deploy on Vercel + +The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js. + +Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details. 
diff --git a/frontend/app/editor/page.tsx b/frontend/app/editor/page.tsx new file mode 100644 index 0000000000000000000000000000000000000000..50d9131bbd3822bcefec9961af48c2c7a6b16d25 --- /dev/null +++ b/frontend/app/editor/page.tsx @@ -0,0 +1,1120 @@ +"use client"; +import { useEffect, useState, Suspense } from "react"; +import { useSearchParams, useRouter } from "next/navigation"; +import { HexColorPicker } from "react-colorful"; +import { getClips, patchSubtitle, patchGlobalStyle, renderClip, type ClipResult, type StyleConfig } from "@/lib/api"; +import { + Download, RotateCcw, ChevronLeft, Loader2, Bot, Scissors, CheckCircle2, + Copy, Check, FileText, Palette, Scissors as ScissorsIcon, Plus, X, + Heart, MessageCircle, Share2, Music, +} from "lucide-react"; + +const BBB = "https://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"; + +// ─── Types ──────────────────────────────────────────────────────────────────── +type SubEvent = { index: number; text: string; start: number; end: number }; +type SubLineStyle = { + font_family?: string; font_size?: number; + primary_color?: string; secondary_color?: string; + outline_color?: string; shadow_color?: string; + outline_size?: number; shadow_size?: number; + bold?: boolean; italic?: boolean; animation?: string; + alignment?: number; margin_v?: number; +}; +type CutRegion = { id: number; from: number; to: number }; +type Lang = "en" | "th" | "zh"; + +// ─── Translations ───────────────────────────────────────────────────────────── +const L = { + en: { + back: "Back", clips: "CLIPS", clip: "Clip", + duration: "Duration", aiScore: "AI Score", + render: "Render", download: "Download", rendering: "Rendering…", reRender: "Re-render", + whyPicked: "Why AI picked this clip", + caption: "Suggested caption", copy: "Copy", copied: "Copied", + trimTab: "Trim & Cut", subsTab: "Subtitles", + adjustBounds: "Adjust boundaries (±10s)", startOff: "Start offset", endOff: "End offset", + cutMiddle: "Cut from 
middle", addCut: "+ Add cut", + noCuts: "No cuts yet — click \"Add cut\" to remove a section from the middle", + totalRemoved: "Total removed", finalDuration: "Final duration", + subStyle: "Subtitle Style", + global: "🌐 Global", perLine: "✨ Per-line", + styleBtn: "Style ▾", styledBtn: "Styled ▾", + font: "Font", size: "Size", outline: "Outline", shadow: "Shadow", + colorText: "Text", colorKaraoke: "Karaoke", colorOutline: "Outline", colorShadow: "Shadow", + animation: "Animation", position: "Position", marginV: "Margin V", + wordMode: "Word", sentenceMode: "Sentence", + resetGlobal: "Reset to global style", + noSubsHre: "No subtitles in HRE mode — AI handles styling automatically.", + cut: "Cut", + demoSession: "Demo session", waitRender: "Waiting for render", close: "Close", + preview: "PREVIEW", + }, + th: { + back: "กลับ", clips: "คลิป", clip: "คลิป", + duration: "ความยาว", aiScore: "คะแนน AI", + render: "เรนเดอร์", download: "ดาวน์โหลด", rendering: "กำลังเรนเดอร์…", reRender: "เรนเดอร์ใหม่", + whyPicked: "เหตุผลที่ AI เลือกคลิปนี้", + caption: "คำบรรยายแนะนำ", copy: "คัดลอก", copied: "คัดลอกแล้ว", + trimTab: "ตัด & ตัดกลาง", subsTab: "ซับไตเติ้ล", + adjustBounds: "ปรับขอบเขต (±10s)", startOff: "ออฟเซ็ตต้น", endOff: "ออฟเซ็ตท้าย", + cutMiddle: "ตัดกลางคลิป", addCut: "+ เพิ่มจุดตัด", + noCuts: "ยังไม่มีจุดตัด — กด \"เพิ่มจุดตัด\" เพื่อลบส่วนกลางคลิป", + totalRemoved: "ตัดออกรวม", finalDuration: "ความยาวสุดท้าย", + subStyle: "สไตล์ซับ", + global: "🌐 ทั้งหมด", perLine: "✨ รายบรรทัด", + styleBtn: "ปรับ ▾", styledBtn: "ปรับแล้ว ▾", + font: "ฟ้อนต์", size: "ขนาด", outline: "ขอบ", shadow: "เงา", + colorText: "ตัวอักษร", colorKaraoke: "คาราโอเกะ", colorOutline: "ขอบ", colorShadow: "เงา", + animation: "อนิเมชั่น", position: "ตำแหน่ง", marginV: "ระยะแนวตั้ง", + wordMode: "คำต่อคำ", sentenceMode: "ประโยค", + resetGlobal: "รีเซ็ตเป็นสไตล์ทั่วไป", + noSubsHre: "HRE mode ไม่มีซับ — AI จัดการสไตล์ให้อัตโนมัติ", + cut: "จุดตัด", + demoSession: "Demo session", waitRender: 
"รอการเรนเดอร์", close: "ปิด", + preview: "PREVIEW", + }, + zh: { + back: "返回", clips: "片段", clip: "片段", + duration: "时长", aiScore: "AI分数", + render: "渲染", download: "下载", rendering: "渲染中…", reRender: "重新渲染", + whyPicked: "AI选择此片段的原因", + caption: "建议说明", copy: "复制", copied: "已复制", + trimTab: "剪辑 & 切割", subsTab: "字幕", + adjustBounds: "调整边界 (±10s)", startOff: "开始偏移", endOff: "结束偏移", + cutMiddle: "从中间切割", addCut: "+ 添加切割", + noCuts: "暂无切割 — 点击\"添加切割\"删除中间部分", + totalRemoved: "总计删除", finalDuration: "最终时长", + subStyle: "字幕样式", + global: "🌐 全局", perLine: "✨ 逐行", + styleBtn: "样式 ▾", styledBtn: "已设置 ▾", + font: "字体", size: "大小", outline: "描边", shadow: "阴影", + colorText: "文字", colorKaraoke: "卡拉OK", colorOutline: "描边", colorShadow: "阴影", + animation: "动画", position: "位置", marginV: "垂直边距", + wordMode: "逐词", sentenceMode: "句子", + resetGlobal: "重置为全局样式", + noSubsHre: "HRE模式无字幕 — AI自动处理样式", + cut: "切割", + demoSession: "演示会话", waitRender: "等待渲染", close: "关闭", + preview: "预览", + }, +} as const; +type Lbl = typeof L[Lang]; + +// ─── Mock data ───────────────────────────────────────────────────────────────── +const MOCK_CLIPS: (ClipResult & { suggested_caption: string })[] = [ + { + index: 0, start: 12.5, end: 72.5, duration: 60.0, score: 0.92, + download_url: `${BBB}#t=12,73`, raw_url: BBB, ass_path: "demo_0.ass", + highlight_reason: "High energy moment — face detected, audio peak +8.4 dB, Qwen2.5-VL excitement score 0.94", + suggested_caption: "This is why I keep coming back to this stream 🔥 Wait for the ending… #gaming #highlight #viral #amd", + }, + { + index: 1, start: 145.0, end: 205.0, duration: 60.0, score: 0.87, + download_url: `${BBB}#t=145,205`, raw_url: BBB, ass_path: "demo_1.ass", + highlight_reason: "Exciting chase sequence — Qwen2.5-VL excitement score 0.91, crowd reaction detected at 147.2s", + suggested_caption: "POV: You witness the most insane moment of the year 🎮 Nobody was ready for this #fyp #gaming #moments", + }, + { + index: 2, start: 320.0, end: 380.0, 
duration: 60.0, score: 0.81, + download_url: `${BBB}#t=320,380`, raw_url: BBB, ass_path: "demo_2.ass", + highlight_reason: "Funny reaction moment — humor level 0.88, peak audio energy at 320.4s, chat sentiment positive", + suggested_caption: "When the stream becomes a movie 😂 Chat was NOT ready #funny #reaction #viral #clip", + }, +]; + +const MOCK_SUBS: Record = { + 0: [ + { index: 0, text: "Let's GO! First blood!", start: 0.0, end: 2.8 }, + { index: 1, text: "That was INSANE", start: 3.1, end: 5.5 }, + { index: 2, text: "No way he survived that", start: 6.0, end: 8.4 }, + { index: 3, text: "Triple kill right there", start: 9.0, end: 11.2 }, + { index: 4, text: "Chat is going crazy right now", start: 12.0, end: 14.8 }, + { index: 5, text: "This is the best clip of the stream", start: 55.0, end: 58.5 }, + ], + 1: [ + { index: 0, text: "Oh my god, this part...", start: 0.0, end: 3.0 }, + { index: 1, text: "I've been waiting for this moment", start: 3.4, end: 6.0 }, + { index: 2, text: "HERE WE GO", start: 7.0, end: 9.0 }, + { index: 3, text: "The crowd is going wild", start: 28.0, end: 31.0 }, + { index: 4, text: "YESSS! FINALLY!", start: 54.0, end: 57.0 }, + { index: 5, text: "We are SO back", start: 57.5, end: 60.0 }, + ], + 2: [ + { index: 0, text: "Chat predicted this would happen", start: 0.0, end: 2.5 }, + { index: 1, text: "I cannot believe this is real", start: 3.0, end: 6.2 }, + { index: 2, text: "This is why I stream every day", start: 7.0, end: 9.5 }, + { index: 3, text: "No script. This is 100% real.", start: 10.0, end: 13.0 }, + { index: 4, text: "Clip it. 
Someone clip this.", start: 50.0, end: 53.0 }, + ], +}; + +const LANG_OPTIONS = [ + { code: "en" as Lang, label: "English" }, + { code: "th" as Lang, label: "ไทย" }, + { code: "zh" as Lang, label: "中文" }, +]; + +const FONTS = ["Noto Sans Thai", "Noto Sans SC", "Noto Sans", "Impact", "Montserrat", "Oswald", "Anton"]; +const ANIMATIONS = ["none", "fade", "karaoke", "pop", "typewriter", "bounce"]; +const ALIGNMENTS = [ + { val: 7, label: "↖" }, { val: 8, label: "↑" }, { val: 9, label: "↗" }, + { val: 4, label: "←" }, { val: 5, label: "·" }, { val: 6, label: "→" }, + { val: 1, label: "↙" }, { val: 2, label: "↓" }, { val: 3, label: "↘" }, +]; + +const ANIM_KEYFRAMES = ` + @keyframes elevn-fade { 0%,100%{opacity:0} 30%,70%{opacity:1} } + @keyframes elevn-pop { 0%,100%{transform:scale(0.5);opacity:0} 40%{transform:scale(1.12)} 50%,80%{transform:scale(1);opacity:1} } + @keyframes elevn-bounce { 0%,100%{transform:translateY(0)} 50%{transform:translateY(-9px)} } + @keyframes elevn-type { from{clip-path:inset(0 100% 0 0)} to{clip-path:inset(0 0% 0 0)} } +`; + +// ─── TikTok phone mock preview (180×320) ────────────────────────────────────── +type PreviewStyle = { + font_family?: string; font_size?: number; + primary_color?: string; secondary_color?: string; + outline_color?: string; outline_size?: number; + bold?: boolean; italic?: boolean; animation?: string; + alignment?: number; margin_v?: number; +}; + +function TikTokMiniPreview({ style, text, lbl }: { style: PreviewStyle; text: string; lbl: Lbl }) { + const alignment = style.alignment ?? 2; + const isBottom = alignment <= 3; + const isTop = alignment >= 7; + const isLeft = [1, 4, 7].includes(alignment); + const isRight = [3, 6, 9].includes(alignment); + + const s = 0.34; + const fontSize = Math.max(10, (style.font_size ?? 52) * s); + const outlineW = (style.outline_size ?? 2.5) * s; + const textShadow = outlineW > 0 + ? `${outlineW}px ${outlineW}px 0 ${style.outline_color ?? 
"#000"}, + -${outlineW}px ${outlineW}px 0 ${style.outline_color ?? "#000"}, + ${outlineW}px -${outlineW}px 0 ${style.outline_color ?? "#000"}, + -${outlineW}px -${outlineW}px 0 ${style.outline_color ?? "#000"}` + : "none"; + + const animMap: Record = { + fade: "elevn-fade 2s ease-in-out infinite", + pop: "elevn-pop 1.8s ease-in-out infinite", + bounce: "elevn-bounce 1s ease-in-out infinite", + typewriter: "elevn-type 2.5s steps(20) infinite", + }; + + const posStyle: React.CSSProperties = { + position: "absolute", zIndex: 10, maxWidth: "76%", wordBreak: "break-word", + textAlign: isLeft ? "left" : isRight ? "right" : "center", + }; + if (isBottom) { + posStyle.bottom = "18%"; + posStyle.left = isLeft ? "4%" : isRight ? undefined : "12%"; + posStyle.right = isRight ? "14%" : isLeft ? undefined : "12%"; + } else if (isTop) { + posStyle.top = "12%"; + posStyle.left = isLeft ? "4%" : isRight ? undefined : "12%"; + posStyle.right = isRight ? "14%" : isLeft ? undefined : "12%"; + } else { + posStyle.top = "50%"; posStyle.transform = "translateY(-50%)"; + posStyle.left = isLeft ? "4%" : isRight ? undefined : "12%"; + posStyle.right = isRight ? "14%" : isLeft ? undefined : "12%"; + } + + const textStyle: React.CSSProperties = { + fontFamily: `"${style.font_family ?? "Noto Sans"}", sans-serif`, + fontSize: `${fontSize}px`, + fontWeight: style.bold ?? true ? "bold" : "normal", + fontStyle: style.italic ? "italic" : "normal", + color: style.primary_color ?? "#FFFFFF", + textShadow, lineHeight: 1.25, + animation: animMap[style.animation ?? ""] ?? "none", + display: "inline-block", + }; + + return ( +
+ +
+ {/* Grid lines */} +
+
+
+
+ {/* Notch */} +
+ {/* Side action buttons */} +
+ {[{ Icon: Heart, count: "12.4K" }, { Icon: MessageCircle, count: "892" }, { Icon: Share2, count: "Share" }].map(({ Icon, count }, i) => ( +
+
+ +
+ {count} +
+ ))} +
+
+ +
+
+
+ {/* Bottom info */} +
+
@elevnclip_ai
+
#highlight #tiktok #ai #amd
+
+ {/* Progress bar */} +
+
+
+ {/* Subtitle text */} +
{text || "Sample subtitle"}
+ {/* Preview badge */} +
+ {lbl.preview} +
+
+ ); +} + +// ─── Main content ───────────────────────────────────────────────────────────── +function EditorContent() { + const params = useSearchParams(); + const router = useRouter(); + const sessionId = params.get("session") ?? ""; + const isDemo = sessionId === "demo"; + + const [uiLang, setUiLang] = useState("en"); + const lbl = L[uiLang]; + + useEffect(() => { + const saved = localStorage.getItem("elevnclip-lang") as Lang | null; + if (saved && (["en", "th", "zh"] as string[]).includes(saved)) setUiLang(saved); + }, []); + + const handleLangChange = (lang: Lang) => { + setUiLang(lang); + localStorage.setItem("elevnclip-lang", lang); + }; + + const [clips, setClips] = useState<(ClipResult & { suggested_caption?: string })[]>([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(""); + const [activeClip, setActiveClip] = useState(0); + const [rendering, setRendering] = useState>({}); + const [renderDone, setRenderDone] = useState>({}); + const [downloadUrls, setDownloadUrls] = useState>({}); + + const [trimStart, setTrimStart] = useState>({}); + const [trimEnd, setTrimEnd] = useState>({}); + const [cutRegions, setCutRegions] = useState>({}); + + const [globalStyle, setGlobalStyle] = useState({ + font_family: "Noto Sans", font_size: 64, + primary_color: "#FFFFFF", secondary_color: "#FFFF00", + outline_color: "#000000", shadow_color: "#000000", + bold: true, italic: false, underline: false, + outline_size: 3.0, shadow_size: 1.5, + alignment: 2, margin_v: 250, + display_mode: "word", animation: "pop", + }); + + const [openColorPicker, setOpenColorPicker] = useState(null); + const [activeTab, setActiveTab] = useState<"trim" | "timeline">("trim"); + const [subEvents, setSubEvents] = useState([]); + const [subStyles, setSubStyles] = useState>>({}); + const [expandedSubLine, setExpandedSubLine] = useState(null); + const [styleMode, setStyleMode] = useState<"global" | "per-line">("global"); + const [copied, setCopied] = 
useState(false); + + useEffect(() => { + if (!sessionId) { router.push("/"); return; } + loadClips(); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [sessionId]); + + useEffect(() => { + if (isDemo) setSubEvents(MOCK_SUBS[activeClip] ?? []); + setExpandedSubLine(null); + }, [activeClip, isDemo]); + + const loadClips = async () => { + if (isDemo) { + setClips(MOCK_CLIPS); + const urls: Record = {}; + MOCK_CLIPS.forEach((c) => { urls[c.index] = c.download_url; }); + setDownloadUrls(urls); + setSubEvents(MOCK_SUBS[0] ?? []); + setLoading(false); + return; + } + try { + const result = await getClips(sessionId); + if (result.status === "error") { setError(result.error ?? "Failed to load clips"); return; } + setClips(result.clips); + const urls: Record = {}; + result.clips.forEach((c) => { urls[c.index] = c.download_url; }); + setDownloadUrls(urls); + } catch (e: unknown) { + setError(e instanceof Error ? e.message : "Failed to load clips"); + } finally { + setLoading(false); + } + }; + + const handleStyleChange = async (updates: Partial) => { + const newStyle = { ...globalStyle, ...updates }; + setGlobalStyle(newStyle); + if (isDemo) return; + const clip = clips[activeClip]; + if (clip?.ass_path) await patchGlobalStyle(sessionId, clip.index, newStyle).catch(() => {}); + }; + + const handleRender = async (clipIndex: number) => { + setRendering((r) => ({ ...r, [clipIndex]: true })); + setRenderDone((d) => ({ ...d, [clipIndex]: false })); + if (isDemo) { + await new Promise((r) => setTimeout(r, 1400)); + setRendering((r) => ({ ...r, [clipIndex]: false })); + setRenderDone((d) => ({ ...d, [clipIndex]: true })); + return; + } + try { + const url = await renderClip(sessionId, clipIndex); + setDownloadUrls((d) => ({ ...d, [clipIndex]: url })); + setRenderDone((d) => ({ ...d, [clipIndex]: true })); + } catch (e) { + console.error("Render failed", e); + } finally { + setRendering((r) => ({ ...r, [clipIndex]: false })); + } + }; + + const handleSubEdit = async 
(clipIndex: number, eventIdx: number, text: string) => { + setSubEvents((evts) => evts.map((e) => e.index === eventIdx ? { ...e, text } : e)); + if (isDemo) return; + await patchSubtitle(sessionId, clipIndex, eventIdx, { text }).catch(() => {}); + }; + + const handleSubTiming = async (clipIndex: number, eventIdx: number, start: number, end: number) => { + setSubEvents((evts) => evts.map((e) => e.index === eventIdx ? { ...e, start, end } : e)); + if (isDemo) return; + await patchSubtitle(sessionId, clipIndex, eventIdx, { start, end }).catch(() => {}); + }; + + const handleSubLineStyle = (clipIndex: number, eventIdx: number, updates: Partial) => { + const key = `${clipIndex}-${eventIdx}`; + setSubStyles((prev) => ({ ...prev, [key]: { ...prev[key], ...updates } })); + }; + + const handleResetSubLine = (clipIndex: number, eventIdx: number) => { + const key = `${clipIndex}-${eventIdx}`; + setSubStyles((prev) => { const next = { ...prev }; delete next[key]; return next; }); + }; + + const addCut = (clipIndex: number, duration: number) => { + const existing = cutRegions[clipIndex] ?? []; + setCutRegions((prev) => ({ + ...prev, + [clipIndex]: [...existing, { id: Date.now(), from: parseFloat((duration * 0.3).toFixed(1)), to: parseFloat((duration * 0.5).toFixed(1)) }], + })); + }; + const removeCut = (clipIndex: number, cutId: number) => { + setCutRegions((prev) => ({ ...prev, [clipIndex]: (prev[clipIndex] ?? []).filter((c) => c.id !== cutId) })); + }; + const updateCut = (clipIndex: number, cutId: number, from: number, to: number) => { + setCutRegions((prev) => ({ + ...prev, + [clipIndex]: (prev[clipIndex] ?? []).map((c) => c.id === cutId ? { ...c, from, to } : c), + })); + }; + + const handleCopyCaption = (text: string) => { + navigator.clipboard.writeText(text).then(() => { + setCopied(true); setTimeout(() => setCopied(false), 2000); + }); + }; + + if (loading) return ( +
+
+ +

Loading clips...

+
+
+ ); + + if (error) return ( +
+
+

{error}

+ +
+
+ ); + + const clip = clips[activeClip]; + const apiBase = process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000"; + const rawUrl = clip ? (downloadUrls[clip.index] ?? clip.download_url) : ""; + const clipDownloadUrl = rawUrl.startsWith("http") ? rawUrl : rawUrl ? `${apiBase}${rawUrl}` : ""; + const suggestedCaption = (clip as (typeof clip & { suggested_caption?: string }))?.suggested_caption; + const currentCuts = clip ? (cutRegions[clip.index] ?? []) : []; + const clipDuration = clip ? clip.duration + (trimEnd[clip.index] ?? 0) - (trimStart[clip.index] ?? 0) : 60; + + // TikTok preview style: in per-line mode, use expanded line's effective style + const expandedEvent = subEvents.find((e) => e.index === expandedSubLine); + const expandedOverride = (expandedSubLine !== null && clip) ? subStyles[`${clip.index}-${expandedSubLine}`] ?? {} : {}; + const previewStyle: PreviewStyle = styleMode === "per-line" && expandedSubLine !== null + ? { ...globalStyle, ...expandedOverride } + : globalStyle; + const previewText = styleMode === "per-line" && expandedEvent ? expandedEvent.text : "Sample subtitle"; + + return ( +
+ {/* ── Navbar ── */} + + + {/* ── Main layout ── */} +
+ + {/* ── Clip list sidebar ── */} +
+
{lbl.clips}
+ {clips.map((c, i) => ( + + ))} +
+ + {/* ── Video — fixed width, left-aligned ── */} +
+
+ {clipDownloadUrl ? ( +
+ {/* Stats */} +
+ {lbl.duration} {clip?.duration.toFixed(1)}s + {((clip?.score ?? 0) * 100).toFixed(0)}% +
+
+ + {/* ── Right panel — fills remaining space ── */} +
+ + {/* Render + Download */} +
+ {clip?.ass_path && ( + + )} + + {lbl.download} + +
+ + {/* AI reason */} + {clip?.highlight_reason && ( +
+ +
+
{lbl.whyPicked}
+ {clip.highlight_reason} +
+
+ )} + + {/* Suggested caption */} + {suggestedCaption && ( +
+
+
+ {lbl.caption} +
+ +
+

{suggestedCaption}

+
+ )} + + {/* Tabs */} +
+ {(["trim", "timeline"] as const).map((t) => ( + + ))} +
+ + {/* ── Trim & Cut tab ── */} + {activeTab === "trim" && clip && ( +
+ {/* Start/End offset */} +
+

{lbl.adjustBounds}

+ {[ + { key: "start", label: lbl.startOff, value: trimStart[clip.index] ?? 0, setter: setTrimStart }, + { key: "end", label: lbl.endOff, value: trimEnd[clip.index] ?? 0, setter: setTrimEnd }, + ].map(({ key, label, value, setter }) => ( +
+ + setter((t) => ({ ...t, [clip.index]: +e.target.value }))} + className="w-full mt-1" /> +
-10s0+10s
+
+ ))} +
+ + {/* Cut regions */} +
+
+

+ {lbl.cutMiddle} +

+ +
+ + {clipDuration > 0 && ( +
+
+ {clipDuration.toFixed(1)}s +
+ {(trimStart[clip.index] ?? 0) > 0 && ( +
+ )} + {(trimEnd[clip.index] ?? 0) < 0 && ( +
+ )} + {currentCuts.map((cut) => ( +
+ ))} +
+ )} + + {currentCuts.length === 0 && ( +

{lbl.noCuts}

+ )} + + {currentCuts.map((cut, ci) => ( +
+ {(lbl as typeof L["en"]).cut ?? "Cut"} {ci + 1} +
+ updateCut(clip.index, cut.id, +e.target.value, cut.to)} + className="flex-1 bg-white/5 border border-white/10 rounded px-2 py-1 text-xs text-white font-mono focus:outline-none focus:border-violet-500" /> + s → + updateCut(clip.index, cut.id, cut.from, +e.target.value)} + className="flex-1 bg-white/5 border border-white/10 rounded px-2 py-1 text-xs text-white font-mono focus:outline-none focus:border-violet-500" /> + s +
+ ({(cut.to - cut.from).toFixed(1)}s) + +
+ ))} + + {currentCuts.length > 0 && ( +

+ {lbl.totalRemoved}: {currentCuts.reduce((sum, c) => sum + Math.max(0, c.to - c.from), 0).toFixed(1)}s · + {" "}{lbl.finalDuration}: {(clipDuration - currentCuts.reduce((sum, c) => sum + Math.max(0, c.to - c.from), 0)).toFixed(1)}s +

+ )} +
+
+ )} + + {/* ── Subtitles tab — TikTok preview on left ── */} + {activeTab === "timeline" && clip?.ass_path && ( +
+ {/* Left: TikTok mock preview — sticky while scrolling event list */} +
+ +
+ {/* Right: controls */} +
+ +
+
+ )} + + {activeTab === "timeline" && !clip?.ass_path && ( +
+ {lbl.noSubsHre} +
+ )} +
+
+
+ ); +} + +// ─── Subtitle Timeline Panel ────────────────────────────────────────────────── +function SubtitleTimelinePanel({ + clipIndex, events, globalStyle, subStyles, expandedLine, styleMode, + openColorPicker, lbl, onExpand, onEdit, onTiming, onLineStyle, onResetLine, + onGlobalStyle, onStyleMode, onColorPicker, +}: { + clipIndex: number; events: SubEvent[]; globalStyle: StyleConfig; + subStyles: Record>; + expandedLine: number | null; styleMode: "global" | "per-line"; + openColorPicker: string | null; lbl: Lbl; + onExpand: (i: number | null) => void; + onEdit: (ci: number, ei: number, text: string) => void; + onTiming: (ci: number, ei: number, start: number, end: number) => void; + onLineStyle: (ci: number, ei: number, updates: Partial) => void; + onResetLine: (ci: number, ei: number) => void; + onGlobalStyle: (updates: Partial) => void; + onStyleMode: (m: "global" | "per-line") => void; + onColorPicker: (k: string | null) => void; +}) { + return ( +
+ {/* Style mode toggle + global controls */} +
+
+ {lbl.subStyle} +
+ + +
+
+ {styleMode === "global" && ( + + )} +
+ + {/* Events list */} +
+ {events.map((evt) => { + const styleKey = `${clipIndex}-${evt.index}`; + const lineStyle = subStyles[styleKey] ?? {}; + const hasOverride = Object.keys(lineStyle).length > 0; + const isExpanded = expandedLine === evt.index; + + return ( +
+
+
+ {evt.index + 1} + onEdit(clipIndex, evt.index, e.target.value)} + className="flex-1 bg-transparent text-sm text-white focus:outline-none border-b border-white/15 focus:border-violet-500 pb-0.5 transition" /> + {styleMode === "per-line" && ( + + )} +
+
+
+ {lbl.font.slice(0,2) === "ฟ้" ? "เข้า" : "In"} + onTiming(clipIndex, evt.index, +e.target.value, evt.end)} + className="flex-1 bg-white/5 border border-white/10 rounded px-1.5 py-0.5 text-[11px] text-white font-mono focus:outline-none focus:border-violet-500" /> +
+ +
+ {lbl.font.slice(0,2) === "ฟ้" ? "ออก" : "Out"} + onTiming(clipIndex, evt.index, evt.start, +e.target.value)} + className="flex-1 bg-white/5 border border-white/10 rounded px-1.5 py-0.5 text-[11px] text-white font-mono focus:outline-none focus:border-violet-500" /> +
+ {(evt.end - evt.start).toFixed(1)}s +
+
+ + {styleMode === "per-line" && isExpanded && ( +
+ onLineStyle(clipIndex, evt.index, updates)} + onReset={() => onResetLine(clipIndex, evt.index)} + /> +
+ )} +
+ ); + })} +
+
+ ); +} + +// ─── Global style controls ──────────────────────────────────────────────────── +function GlobalStyleControls({ style, lbl, onChange, openColorPicker, setOpenColorPicker }: { + style: StyleConfig; lbl: Lbl; + onChange: (s: Partial) => void; + openColorPicker: string | null; + setOpenColorPicker: (k: string | null) => void; +}) { + const colorFields = [ + { key: "primary_color", label: lbl.colorText }, + { key: "secondary_color", label: lbl.colorKaraoke }, + { key: "outline_color", label: lbl.colorOutline }, + { key: "shadow_color", label: lbl.colorShadow }, + ]; + + return ( +
+ {/* Font + Size */} +
+
+ + +
+
+ + onChange({ font_size: +e.target.value })} className="w-full mt-2" /> +
+
+ + onChange({ outline_size: +e.target.value })} className="w-full mt-2" /> +
+
+ + onChange({ shadow_size: +e.target.value })} className="w-full mt-2" /> +
+
+ + {/* B/I/U + display mode */} +
+ {[{ k: "bold", l: "B" }, { k: "italic", l: "I" }, { k: "underline", l: "U" }].map(({ k, l }) => ( + + ))} +
+ {[{ id: "word", label: lbl.wordMode }, { id: "sentence", label: lbl.sentenceMode }].map(({ id, label }) => ( + + ))} +
+ + {/* Colors */} +
+ {colorFields.map(({ key, label }) => ( +
+ + {openColorPicker === key && ( +
+
+ )[key] ?? "#fff"} + onChange={(c) => onChange({ [key]: c })} /> + +
+
+ )} +
+ ))} +
+ + {/* Animation */} +
+ +
+ {ANIMATIONS.map((a) => ( + + ))} +
+
+ + {/* Position */} +
+
+ +
+ {ALIGNMENTS.map((a) => ( + + ))} +
+
+
+ + onChange({ margin_v: +e.target.value })} className="w-full mt-1" /> +
+
+
+ ); +} + +// ─── Per-line style controls (full, same as global) ─────────────────────────── +function PerLineStyleControls({ lineStyle, globalStyle, text, hasOverride, lbl, onChange, onReset }: { + lineStyle: Partial; globalStyle: StyleConfig; + text: string; hasOverride: boolean; lbl: Lbl; + onChange: (updates: Partial) => void; + onReset: () => void; +}) { + const [localColorPicker, setLocalColorPicker] = useState(null); + + const ef = (key: keyof SubLineStyle, fallback: unknown) => (key in lineStyle ? lineStyle[key] : fallback) as never; + + const colorFields: { key: keyof SubLineStyle; label: string; fallback: string }[] = [ + { key: "primary_color", label: lbl.colorText, fallback: globalStyle.primary_color ?? "#fff" }, + { key: "secondary_color", label: lbl.colorKaraoke, fallback: globalStyle.secondary_color ?? "#ff0" }, + { key: "outline_color", label: lbl.colorOutline, fallback: globalStyle.outline_color ?? "#000" }, + { key: "shadow_color", label: lbl.colorShadow, fallback: globalStyle.shadow_color ?? "#000" }, + ]; + + const effectiveFont = ef("font_family", globalStyle.font_family ?? "Noto Sans") as string; + const effectiveSize = ef("font_size", globalStyle.font_size ?? 52) as number; + const effectiveOutline = ef("outline_size", globalStyle.outline_size ?? 2.5) as number; + const effectiveShadow = ef("shadow_size", globalStyle.shadow_size ?? 1.5) as number; + const effectiveBold = ef("bold", globalStyle.bold ?? true) as boolean; + const effectiveItalic = ef("italic", globalStyle.italic ?? false) as boolean; + const effectiveAnim = ef("animation", globalStyle.animation ?? "none") as string; + const effectiveAlign = ef("alignment", globalStyle.alignment ?? 2) as number; + const effectiveMarginV = ef("margin_v", globalStyle.margin_v ?? 40) as number; + + return ( +
+ {/* Font + Size */} +
+
+ + +
+
+ + onChange({ font_size: +e.target.value })} className="w-full mt-2" /> +
+
+ + onChange({ outline_size: +e.target.value })} className="w-full mt-2" /> +
+
+ + onChange({ shadow_size: +e.target.value })} className="w-full mt-2" /> +
+
+ + {/* B/I */} +
+ {[ + { key: "bold" as const, label: "B", active: effectiveBold }, + { key: "italic" as const, label: "I", active: effectiveItalic }, + ].map(({ key, label, active }) => ( + + ))} +
+ + {/* Colors */} +
+ {colorFields.map(({ key, label, fallback }) => { + const color = (lineStyle[key] as string | undefined) ?? fallback; + const pickerKey = `line-${key}`; + return ( +
+ + {localColorPicker === pickerKey && ( +
+
+ onChange({ [key]: c })} /> + +
+
+ )} +
+ ); + })} +
+ + {/* Animation */} +
+ +
+ {ANIMATIONS.map((a) => ( + + ))} +
+
+ + {/* Position + Margin V */} +
+
+ +
+ {ALIGNMENTS.map((a) => ( + + ))} +
+
+
+ + onChange({ margin_v: +e.target.value })} className="w-full mt-1" /> +
+
+ + {/* Reset */} + {hasOverride && ( + + )} +
+ ); +} + +// ─── Page entry point ───────────────────────────────────────────────────────── +export default function EditorPage() { + return ( + + +
+ }> + + + ); +} diff --git a/frontend/app/favicon.ico b/frontend/app/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..718d6fea4835ec2d246af9800eddb7ffb276240c Binary files /dev/null and b/frontend/app/favicon.ico differ diff --git a/frontend/app/globals.css b/frontend/app/globals.css new file mode 100644 index 0000000000000000000000000000000000000000..6df43887c245f0a9319dd70d54b1573a5704514e --- /dev/null +++ b/frontend/app/globals.css @@ -0,0 +1,37 @@ +@import "tailwindcss"; + +:root { + --font-inter: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; +} + +* { box-sizing: border-box; } + +body { + font-family: var(--font-inter), "Noto Sans Thai", sans-serif; + background: #0a0a1a; + color: white; +} + +::-webkit-scrollbar { width: 6px; height: 6px; } +::-webkit-scrollbar-track { background: rgba(255,255,255,0.05); border-radius: 3px; } +::-webkit-scrollbar-thumb { background: rgba(139,92,246,0.5); border-radius: 3px; } +::-webkit-scrollbar-thumb:hover { background: rgba(139,92,246,0.8); } + +input[type="range"] { + -webkit-appearance: none; + appearance: none; + height: 4px; + border-radius: 2px; + background: rgba(255,255,255,0.15); + outline: none; +} +input[type="range"]::-webkit-slider-thumb { + -webkit-appearance: none; + width: 16px; + height: 16px; + border-radius: 50%; + background: #8b5cf6; + cursor: pointer; +} + +select option { background: #1a1a2e; } diff --git a/frontend/app/layout.tsx b/frontend/app/layout.tsx new file mode 100644 index 0000000000000000000000000000000000000000..50ff266c7ffbd1a379a3448752ff42cc952fec89 --- /dev/null +++ b/frontend/app/layout.tsx @@ -0,0 +1,15 @@ +import type { Metadata } from "next"; +import "./globals.css"; + +export const metadata: Metadata = { + title: "ElevenClip AI — TikTok Highlight Clipper", + description: "AI-powered livestream highlight extraction for TikTok. 
Powered by AMD ROCm + Qwen2.5-VL.", +}; + +export default function RootLayout({ children }: { children: React.ReactNode }) { + return ( + + {children} + + ); +} diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx new file mode 100644 index 0000000000000000000000000000000000000000..18041f5e9a5be542a5426fa7e24973e342c5b090 --- /dev/null +++ b/frontend/app/page.tsx @@ -0,0 +1,443 @@ +"use client"; +import { useState, useEffect, useRef } from "react"; +import { useRouter } from "next/navigation"; +import VideoUpload from "@/components/VideoUpload"; +import ClipSettings from "@/components/ClipSettings"; +import SubtitleDesigner from "@/components/SubtitleDesigner"; +import GenerationProgress from "@/components/GenerationProgress"; +import { startProcessing, connectProgressWS, type StyleConfig, type ProcessSettings } from "@/lib/api"; +import { Scissors, Check, ChevronLeft, ArrowRight, Zap, Sparkles, PlayCircle } from "lucide-react"; + +const LANGS = [ + { code: "en", label: "English" }, + { code: "th", label: "ไทย" }, + { code: "zh", label: "中文" }, +] as const; + +const T = { + en: { + heroTitle: "AI Highlight Clipper", + heroSub: "Livestream to TikTok · AMD MI300X · Vision + Audio + Text Multimodal", + steps: ["Video", "Settings", "Subtitles"] as [string, string, string], + addVideo: "Add Video", + clipSettings: "Clip Settings", + subtitleDesign: "Design Subtitles", + back: "Back", + next: "Next", + generate: "Generate Clips!", + generateHRE: "Generate with HRE", + tryDemo: "Try Demo", + demoHint: "Demo mode — works without backend", + accessRequired: "Enter the demo access code to run real GPU generation.", + hackathon: "AMD Developer Hackathon 2026 · Track 3: Vision & Multimodal AI", + }, + th: { + heroTitle: "ตัดคลิปไฮไลท์ด้วย AI", + heroSub: "จาก livestream สู่ TikTok · AMD MI300X · Vision + Audio + Text Multimodal", + steps: ["วิดีโอ", "ตั้งค่า", "ซับ"] as [string, string, string], + addVideo: "เพิ่มวิดีโอ", + clipSettings: "ตั้งค่าคลิป", + subtitleDesign: 
"ออกแบบซับไตเติ้ล", + back: "ย้อนกลับ", + next: "ถัดไป", + generate: "สร้างคลิปเลย!", + generateHRE: "สร้างด้วย HRE", + tryDemo: "ลองดูตัวอย่าง", + demoHint: "โหมด Demo — ไม่ต้องเชื่อม backend", + accessRequired: "ใส่รหัสเดโมก่อน เพื่อรันของจริงบน GPU", + hackathon: "AMD Developer Hackathon 2026 · Track 3: Vision & Multimodal AI", + }, + zh: { + heroTitle: "AI 精彩片段剪辑", + heroSub: "直播到 TikTok · AMD MI300X · 视觉 + 音频 + 文本多模态", + steps: ["视频", "设置", "字幕"] as [string, string, string], + addVideo: "添加视频", + clipSettings: "片段设置", + subtitleDesign: "字幕设计", + back: "返回", + next: "下一步", + generate: "生成片段!", + generateHRE: "HRE 生成", + tryDemo: "试用演示", + demoHint: "演示模式 — 无需后端连接", + accessRequired: "请输入演示访问码以运行真实 GPU 生成。", + hackathon: "AMD Developer Hackathon 2026 · Track 3: Vision & Multimodal AI", + }, +} as const; + +type Lang = keyof typeof T; +type Step = 1 | 2 | 3 | "generating"; + +// Set NEXT_PUBLIC_DEMO_ENABLED=false in production to hide the demo button +const DEMO_ENABLED = process.env.NEXT_PUBLIC_DEMO_ENABLED !== "false"; +const DEMO_ONLY_PUBLIC = process.env.NEXT_PUBLIC_DEMO_ONLY === "true"; + +const FONT_MAP: Record = { + thai: "Noto Sans Thai", + chinese: "Noto Sans SC", + japanese: "Noto Sans JP", + korean: "Noto Sans KR", + english: "Montserrat", +}; + +const DEMO_STAGES = [ + { stage: "download", pct: 10, message: "Fetching sample video (yt-dlp)..." }, + { stage: "audio", pct: 22, message: "Extracting audio track..." }, + { stage: "scenes", pct: 35, message: "PySceneDetect — 24 scenes found" }, + { stage: "transcribe",pct: 50, message: "Whisper ROCm — transcribing 3m 42s..." }, + { stage: "vision", pct: 65, message: "Qwen2.5-VL analyzing frames + transcript..." }, + { stage: "scoring", pct: 80, message: "score = 0.4×vision + 0.35×audio + 0.25×text" }, + { stage: "cutting", pct: 90, message: "Cutting 3 highlight clips via ffmpeg-amf..." }, + { stage: "subtitles", pct: 96, message: "Generating ASS subtitles (pysubs2)..." 
}, + { stage: "done", pct: 100, message: "" }, +]; + +export default function HomePage() { + const router = useRouter(); + const [uiLang, setUiLang] = useState("en"); + const [accessCode, setAccessCode] = useState(""); + const t = T[uiLang]; + + useEffect(() => { + const saved = localStorage.getItem("elevnclip-lang") as Lang | null; + if (saved && (["en", "th", "zh"] as string[]).includes(saved)) setUiLang(saved); + setAccessCode(localStorage.getItem("elevnclip_access_code") ?? ""); + }, []); + + const handleLangChange = (lang: Lang) => { + setUiLang(lang); + localStorage.setItem("elevnclip-lang", lang); + }; + + const handleAccessCodeChange = (code: string) => { + setAccessCode(code); + localStorage.setItem("elevnclip_access_code", code); + }; + + const [step, setStep] = useState(1); + const [videoFile, setVideoFile] = useState(null); + const [channelDesc, setChannelDesc] = useState(""); + + const [clipSettings, setClipSettings] = useState({ + clip_style: "funny", + target_duration: 60, + clip_count: 3, + clip_language: "auto", + subtitle_language: "english", + mode: "normal" as "normal" | "hre", + }); + + const [styleConfig, setStyleConfig] = useState({ + font_family: "Montserrat", + font_size: 64, + primary_color: "#FFFFFF", + secondary_color: "#FFFF00", + outline_color: "#000000", + shadow_color: "#000000", + bold: true, + italic: false, + underline: false, + outline_size: 3.0, + shadow_size: 1.5, + alignment: 2, + margin_l: 20, + margin_r: 20, + margin_v: 250, + display_mode: "word", + animation: "pop", + fade_in_ms: 200, + fade_out_ms: 150, + }); + + const [progress, setProgress] = useState({ stage: "download", pct: 0, message: "" }); + const wsRef = useRef(null); + const demoTimerRef = useRef | null>(null); + + const canProceedStep1 = !!videoFile; + + useEffect(() => { + const font = FONT_MAP[clipSettings.subtitle_language] ?? 
"Noto Sans"; + setStyleConfig((c) => ({ ...c, font_family: font })); + }, [clipSettings.subtitle_language]); + + useEffect(() => () => { + wsRef.current?.close(); + if (demoTimerRef.current) clearInterval(demoTimerRef.current); + }, []); + + const runMockDemo = () => { + let i = 0; + if (demoTimerRef.current) clearInterval(demoTimerRef.current); + demoTimerRef.current = setInterval(() => { + if (i < DEMO_STAGES.length) { + setProgress(DEMO_STAGES[i]); + if (DEMO_STAGES[i].stage === "done") { + clearInterval(demoTimerRef.current!); + setTimeout(() => router.push("/editor?session=demo"), 600); + } + i++; + } + }, 900); + }; + + const handleGenerate = async () => { + if (!canProceedStep1) return; + setStep("generating"); + if (DEMO_ONLY_PUBLIC && !accessCode.trim()) { + setProgress({ stage: "error", pct: 0, message: t.accessRequired }); + return; + } + try { + const settings: ProcessSettings = { + channel_description: channelDesc, + ...clipSettings, + style_config: clipSettings.mode === "hre" ? {} : { ...styleConfig, subtitle_language: clipSettings.subtitle_language }, + }; + const sessionId = await startProcessing(settings, videoFile ?? 
undefined, accessCode); + localStorage.setItem("elevnclip_session", sessionId); + + let wsAlive = false; + const ws = connectProgressWS(sessionId, (data) => { + wsAlive = true; + setProgress(data); + if (data.stage === "done") { ws.close(); router.push(`/editor?session=${sessionId}`); } + if (data.stage === "error") ws.close(); + }); + wsRef.current = ws; + + // HTTP polling fallback — kicks in if WS doesn't deliver messages + const API_BASE = process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000"; + const poll = setInterval(async () => { + try { + const res = await fetch(`${API_BASE}/api/clips/${sessionId}`); + const data = await res.json(); + const lp = data.last_progress; + if (!wsAlive && lp) setProgress(lp); + if (data.status === "done") { + clearInterval(poll); + if (!wsAlive) router.push(`/editor?session=${sessionId}`); + } + if (data.status === "error") clearInterval(poll); + } catch { /* ignore */ } + }, 3000); + // Stop polling after 10 min + setTimeout(() => clearInterval(poll), 600_000); + } catch (e: unknown) { + setProgress({ stage: "error", pct: 0, message: e instanceof Error ? 
e.message : "Error" }); + } + }; + + const handleDemo = async () => { + setStep("generating"); + if (DEMO_ONLY_PUBLIC && !accessCode.trim()) { + runMockDemo(); + return; + } + try { + const settings: ProcessSettings = { + use_demo_video: true, + channel_description: "Gaming and reaction channel with funny moments", + clip_style: "funny", + target_duration: 60, + clip_count: 3, + clip_language: "auto", + subtitle_language: "english", + mode: "hre", + style_config: {}, + }; + const sessionId = await startProcessing(settings, undefined, accessCode); + localStorage.setItem("elevnclip_session", sessionId); + + const API_BASE = process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000"; + const ws = connectProgressWS(sessionId, (data) => { + setProgress(data); + if (data.stage === "done") { ws.close(); router.push(`/editor?session=${sessionId}`); } + if (data.stage === "error") ws.close(); + }); + wsRef.current = ws; + + const poll = setInterval(async () => { + try { + const res = await fetch(`${API_BASE}/api/clips/${sessionId}`); + const data = await res.json(); + if (data.last_progress) setProgress(data.last_progress); + if (data.status === "done") { clearInterval(poll); router.push(`/editor?session=${sessionId}`); } + if (data.status === "error") clearInterval(poll); + } catch { /* ignore */ } + }, 3000); + setTimeout(() => clearInterval(poll), 600_000); + } catch { + runMockDemo(); + } + }; + + return ( +
+ {/* Navbar */} + + + {/* Hero */} +
+

+ {t.heroTitle} +

+

{t.heroSub}

+
+ + {/* Main card */} +
+
+ {/* Step indicator */} + {step !== "generating" && ( +
+ {([1, 2, 3] as const).map((s) => ( +
+
s + ? "bg-green-500 text-white" + : "bg-white/10 text-white/30" + }`}> + {(step as number) > s ? : s} +
+ + {t.steps[s - 1]} + + {s < 3 &&
} +
+ ))} +
+ )} + +
+ {step === 1 && ( + <> +

{t.addVideo}

+ setVideoFile(f)} + onChannelDesc={setChannelDesc} + channelDesc={channelDesc} + accessCode={accessCode} + onAccessCode={handleAccessCodeChange} + uiLang={uiLang} + /> + + )} + + {step === 2 && ( + <> +

{t.clipSettings}

+ setClipSettings((c) => ({ ...c, ...s }))} + uiLang={uiLang} + /> + + )} + + {step === 3 && clipSettings.mode === "normal" && ( + <> +

{t.subtitleDesign}

+ setStyleConfig((p) => ({ ...p, ...c }))} + subtitleLanguage={clipSettings.subtitle_language} + uiLang={uiLang} + /> + + )} + + {step === "generating" && } + + {/* Navigation */} + {step !== "generating" && ( +
+ {(step as number) > 1 && ( + + )} + + {step === 1 && ( +
+ {DEMO_ENABLED && ( + + )} + +
+ )} + + {step === 2 && ( + + )} + + {step === 3 && ( + + )} +
+ )} +
+ + {step === 1 && DEMO_ENABLED && ( +

{t.demoHint}

+ )} + +
+ {t.hackathon} +
+
+
+
+ ); +} diff --git a/frontend/components/ClipSettings.tsx b/frontend/components/ClipSettings.tsx new file mode 100644 index 0000000000000000000000000000000000000000..7cef3e2cf32c25eaa57c6a9de4c8dd8bf6b4f04b --- /dev/null +++ b/frontend/components/ClipSettings.tsx @@ -0,0 +1,228 @@ +"use client"; +import { Laugh, Target, BookOpen, Gamepad2, Tv, Zap, Bot, Film } from "lucide-react"; + +const STYLES = [ + { id: "funny", Icon: Laugh, en: "Funny", th: "ตลก", zh: "搞笑" }, + { id: "serious", Icon: Target, en: "Serious", th: "จริงจัง", zh: "严肃" }, + { id: "educational", Icon: BookOpen, en: "Educational", th: "ให้ความรู้", zh: "教育" }, + { id: "gaming", Icon: Gamepad2, en: "Gaming", th: "เกมมิ่ง", zh: "游戏" }, + { id: "entertainment", Icon: Tv, en: "Entertainment", th: "บันเทิง", zh: "娱乐" }, +]; + +const DURATIONS = [15, 30, 45, 60, 90]; + +const LANGUAGES = [ + { code: "auto", label: "Auto-detect" }, + { code: "thai", label: "ภาษาไทย" }, + { code: "english", label: "English" }, + { code: "chinese", label: "中文 (简体)" }, + { code: "japanese", label: "日本語" }, + { code: "korean", label: "한국어" }, + { code: "french", label: "Français" }, + { code: "german", label: "Deutsch" }, + { code: "spanish", label: "Español" }, + { code: "portuguese", label: "Português" }, + { code: "russian", label: "Русский" }, + { code: "arabic", label: "العربية" }, + { code: "hindi", label: "हिंदी" }, + { code: "vietnamese", label: "Tiếng Việt" }, + { code: "indonesian", label: "Bahasa Indonesia" }, +]; + +const L = { + en: { + style: "Clip Style", + duration: "Duration (seconds)", + count: "Clip Count", + videoLang: "Video Language", + subLang: "Subtitle Language", + mode: "Editing Mode", + normalTitle: "Normal Subtitles", + normalDesc: "Customize font, colors, animations", + hreTitle: "High-Retention", + hreDesc: "AI picks everything + auto-zoom + jump cuts", + hreInfo: "AI will auto-select font/colors/animation, remove silence, zoom on faces, and add emoji overlays.", + }, + th: { + style: "สไตล์คลิป", 
+ duration: "ความยาว (วินาที)", + count: "จำนวนคลิป", + videoLang: "ภาษาของวิดีโอ", + subLang: "ภาษาของซับ", + mode: "โหมดการตัด", + normalTitle: "ซับปกติ", + normalDesc: "เลือกรูปแบบซับได้เอง", + hreTitle: "High-Retention", + hreDesc: "AI เลือกทุกอย่างให้ + auto-zoom + jump cuts", + hreInfo: "AI จะเลือก font/สี/animation + ตัด silence + zoom หน้าคน + ใส่ emoji ให้อัตโนมัติ", + }, + zh: { + style: "片段风格", + duration: "时长(秒)", + count: "片段数量", + videoLang: "视频语言", + subLang: "字幕语言", + mode: "剪辑模式", + normalTitle: "普通字幕", + normalDesc: "自定义字体、颜色、动画", + hreTitle: "高留存", + hreDesc: "AI 自动处理 + 自动缩放 + 跳切", + hreInfo: "AI 将自动选择字体/颜色/动画,去除静音段,放大人脸,并添加表情覆盖。", + }, +} as const; + +type Lang = keyof typeof L; + +interface Settings { + clip_style: string; + target_duration: number; + clip_count: number; + clip_language: string; + subtitle_language: string; + mode: "normal" | "hre"; +} + +interface Props { + settings: Settings; + onChange: (s: Partial) => void; + uiLang?: Lang; +} + +export default function ClipSettings({ settings, onChange, uiLang = "en" }: Props) { + const lbl = L[uiLang]; + + return ( +
+ {/* Style — horizontal buttons (icon + label in one line) */} +
+ +
+ {STYLES.map(({ id, Icon, en, th, zh }) => { + const label = uiLang === "th" ? th : uiLang === "zh" ? zh : en; + return ( + + ); + })} +
+
+ + {/* Duration + Count */} +
+
+ +
+ {DURATIONS.map((d) => ( + + ))} +
+
+ +
+ +
+ + {settings.clip_count} + +
+
+
+ + {/* Languages */} +
+
+ + +
+ +
+ + +
+
+ + {/* Mode */} +
+ +
+ + + +
+ + {settings.mode === "hre" && ( +
+ + {lbl.hreInfo} +
+ )} +
+
+ ); +} diff --git a/frontend/components/GenerationProgress.tsx b/frontend/components/GenerationProgress.tsx new file mode 100644 index 0000000000000000000000000000000000000000..6aef1b21f9d8f081eeed4b35b66293e12225f065 --- /dev/null +++ b/frontend/components/GenerationProgress.tsx @@ -0,0 +1,142 @@ +"use client"; +import { useState, useEffect } from "react"; +import { + Download, Music, Film, Mic, Eye, Star, Scissors, Type, + CheckCircle2, AlertCircle, Zap, Brain, LucideProps, +} from "lucide-react"; + +type LucideIcon = React.ComponentType; + +const STAGES: Record = { + download: { en: "Download Video", th: "ดาวน์โหลดวิดีโอ", zh: "下载视频", Icon: Download }, + audio: { en: "Extract Audio", th: "แยกเสียง", zh: "提取音频", Icon: Music }, + scenes: { en: "Scene Detection", th: "ตรวจจับฉาก", zh: "场景检测", Icon: Film }, + transcribe:{ en: "Transcribe", th: "ถอดเสียง (Whisper)", zh: "语音转录", Icon: Mic }, + vision: { en: "Vision Analysis", th: "วิเคราะห์ด้วย AI", zh: "视觉分析", Icon: Eye }, + scoring: { en: "Highlight Scoring", th: "คัดเลือกไฮไลท์", zh: "精彩评分", Icon: Star }, + cutting: { en: "Cut Clips", th: "ตัดคลิป", zh: "剪切片段", Icon: Scissors }, + subtitles: { en: "Generate Subtitles",th: "สร้างซับไตเติ้ล", zh: "生成字幕", Icon: Type }, + done: { en: "Done!", th: "เสร็จสิ้น!", zh: "完成!", Icon: CheckCircle2 }, + error: { en: "Error", th: "เกิดข้อผิดพลาด", zh: "错误", Icon: AlertCircle }, +}; + +type Lang = "en" | "th" | "zh"; + +interface Props { + stage: string; + pct: number; + message: string; + uiLang?: Lang; +} + +export default function GenerationProgress({ stage, pct, message, uiLang = "en" }: Props) { + const info = STAGES[stage] ?? { en: stage, th: stage, zh: stage, Icon: Film }; + const Icon = info.Icon; + const label = uiLang === "th" ? info.th : uiLang === "zh" ? 
info.zh : info.en; + const isError = stage === "error"; + + // Fake progress: slowly animate toward 4% when real pct is 0 (pipeline just started) + const [fakePct, setFakePct] = useState(0); + useEffect(() => { + if (pct > 0 || isError || stage === "done") { setFakePct(0); return; } + const id = setInterval(() => setFakePct((p) => Math.min(p + 0.3, 4)), 400); + return () => clearInterval(id); + }, [pct, isError, stage]); + const displayPct = Math.max(pct, Math.round(fakePct * 10) / 10); + + const progressLabel = uiLang === "th" ? "ความคืบหน้า" : uiLang === "zh" ? "处理进度" : "Progress"; + + return ( +
+ {/* AMD badge */} +
+ + AMD ROCm GPU Processing + +
+ + {/* Stage icon + label */} +
+
+
+ +
+
+

{label}

+ {message &&

{message}

} +
+ + {/* Progress bar */} +
+
+ {progressLabel} + {displayPct}% +
+
+
= 100 ? "bg-green-500" + : "bg-gradient-to-r from-violet-500 via-fuchsia-500 to-pink-500" + }`} + style={{ width: `${displayPct}%` }} + /> +
+
+ + {/* Stage grid */} +
+ {Object.entries(STAGES) + .filter(([k]) => k !== "error") + .map(([key, val], i, arr) => { + const stageKeys = arr.map(([k]) => k); + const currentIdx = stageKeys.indexOf(stage); + const done = i < currentIdx; + const active = i === currentIdx; + const StageIcon = val.Icon; + const stageLabel = uiLang === "th" ? val.th : uiLang === "zh" ? val.zh : val.en; + return ( +
+
+ +
+
{stageLabel.split(" ")[0]}
+
+ ); + })} +
+ + {/* Multimodal info — compact single line */} + {(stage === "vision" || stage === "transcribe") && ( +
+ + + {uiLang === "th" ? "Multimodal AI: Whisper ROCm · Qwen2.5-VL · librosa" + : uiLang === "zh" ? "多模态 AI: Whisper ROCm · Qwen2.5-VL · librosa" + : "Multimodal AI: Whisper ROCm · Qwen2.5-VL · librosa"} + +
+ )} +
+ ); +} diff --git a/frontend/components/SubtitleDesigner.tsx b/frontend/components/SubtitleDesigner.tsx new file mode 100644 index 0000000000000000000000000000000000000000..c1a1cd4d562a82454ad71c68358cc19c433c42d6 --- /dev/null +++ b/frontend/components/SubtitleDesigner.tsx @@ -0,0 +1,385 @@ +"use client"; +import { useState } from "react"; +import { HexColorPicker } from "react-colorful"; +import { Minus, Sun, Mic, Zap, Keyboard, ArrowUpDown, Heart, MessageCircle, Share2, Music } from "lucide-react"; +import type { StyleConfig } from "@/lib/api"; + +const FONTS = [ + "Noto Sans Thai", "Noto Sans SC", "Noto Sans", "Impact", + "Montserrat", "Oswald", "Anton", "Bebas Neue", +]; + +const ANIMATIONS = [ + { id: "none", Icon: Minus, en: "None", th: "ไม่มี", zh: "无" }, + { id: "fade", Icon: Sun, en: "Fade", th: "Fade", zh: "淡入" }, + { id: "karaoke", Icon: Mic, en: "Karaoke", th: "Karaoke", zh: "卡拉OK" }, + { id: "pop", Icon: Zap, en: "Pop", th: "Pop", zh: "弹出" }, + { id: "typewriter", Icon: Keyboard, en: "Typewriter", th: "Typewriter",zh: "打字机" }, + { id: "bounce", Icon: ArrowUpDown, en: "Bounce", th: "Bounce", zh: "弹跳" }, +]; + +const ALIGNMENTS = [ + { val: 7, label: "↖" }, { val: 8, label: "↑" }, { val: 9, label: "↗" }, + { val: 4, label: "←" }, { val: 5, label: "·" }, { val: 6, label: "→" }, + { val: 1, label: "↙" }, { val: 2, label: "↓" }, { val: 3, label: "↘" }, +]; + +const SAMPLE_TEXTS: Record = { + thai: "นี่คือตัวอย่างซับไตเติ้ล", + english: "This is a sample subtitle", + chinese: "这是字幕示例", + japanese: "これはサンプル字幕です", + korean: "이것은 샘플 자막입니다", + default: "Sample subtitle preview", +}; + +const L = { + en: { + font: "Font", size: "Size", outline: "Outline", shadow: "Shadow", + colors: "Colors", primary: "Text", secondary: "Karaoke", outlineC: "Border", shadowC: "Shadow", + word: "Word", sentence: "Sentence", + animation: "Animation", animPreview: "Animation preview", position: "Position", close: "Close", + }, + th: { + font: "ฟ้อนต์", size: "ขนาด", outline: 
"ขอบ", shadow: "เงา", + colors: "สี", primary: "ข้อความ", secondary: "Karaoke", outlineC: "ขอบ", shadowC: "เงา", + word: "คำ", sentence: "ประโยค", + animation: "แอนิเมชัน", animPreview: "ตัวอย่างแอนิเมชัน", position: "ตำแหน่ง", close: "ปิด", + }, + zh: { + font: "字体", size: "大小", outline: "描边", shadow: "阴影", + colors: "颜色", primary: "文字", secondary: "卡拉OK", outlineC: "描边", shadowC: "阴影", + word: "逐字", sentence: "句子", + animation: "动画", animPreview: "动画预览", position: "位置", close: "关闭", + }, +} as const; + +type Lang = keyof typeof L; + +interface Props { + config: StyleConfig; + onChange: (c: Partial) => void; + subtitleLanguage: string; + uiLang?: Lang; +} + +/* ── TikTok phone preview ── */ +function TikTokPreview({ config, subtitleLanguage }: { config: StyleConfig; subtitleLanguage: string }) { + const text = SAMPLE_TEXTS[subtitleLanguage] ?? SAMPLE_TEXTS.default; + const alignment = config.alignment ?? 2; + const isBottom = alignment <= 3; + const isTop = alignment >= 7; + const isLeft = [1, 4, 7].includes(alignment); + const isRight = [3, 6, 9].includes(alignment); + + const posStyle: React.CSSProperties = { + position: "absolute", zIndex: 10, maxWidth: "76%", + wordBreak: "break-word", + textAlign: isLeft ? "left" : isRight ? "right" : "center", + }; + if (isBottom) { + posStyle.bottom = "18%"; + posStyle.left = isLeft ? "4%" : isRight ? undefined : "12%"; + posStyle.right = isRight ? "14%" : isLeft ? undefined : "12%"; + } else if (isTop) { + posStyle.top = "12%"; + posStyle.left = isLeft ? "4%" : isRight ? undefined : "12%"; + posStyle.right = isRight ? "14%" : isLeft ? undefined : "12%"; + } else { + posStyle.top = "50%"; posStyle.transform = "translateY(-50%)"; + posStyle.left = isLeft ? "4%" : isRight ? undefined : "12%"; + posStyle.right = isRight ? "14%" : isLeft ? undefined : "12%"; + } + + const s = 0.34; + const fontSize = Math.max(10, (config.font_size ?? 52) * s); + const outlineW = (config.outline_size ?? 
2.5) * s; + const textShadow = outlineW > 0 + ? `${outlineW}px ${outlineW}px 0 ${config.outline_color ?? "#000"}, + -${outlineW}px ${outlineW}px 0 ${config.outline_color ?? "#000"}, + ${outlineW}px -${outlineW}px 0 ${config.outline_color ?? "#000"}, + -${outlineW}px -${outlineW}px 0 ${config.outline_color ?? "#000"}` + : "none"; + + const textStyle: React.CSSProperties = { + fontFamily: `"${config.font_family ?? "Noto Sans"}", sans-serif`, + fontSize: `${fontSize}px`, fontWeight: config.bold ? "bold" : "normal", + fontStyle: config.italic ? "italic" : "normal", + textDecoration: config.underline ? "underline" : "none", + color: config.primary_color ?? "#FFFFFF", textShadow, lineHeight: 1.25, + }; + + return ( +
+
+
+
+
+
+
+
+
+ {[{ Icon: Heart, count: "12.4K" }, { Icon: MessageCircle, count: "892" }, { Icon: Share2, count: "Share" }].map(({ Icon, count }, i) => ( +
+
+ +
+ {count} +
+ ))} +
+
+ +
+
+
+
+
@elevnclip_ai
+
#highlight #tiktok #ai #amd
+
+
+
+
+
{text}
+
+ PREVIEW +
+
+
+ ); +} + +/* ── Animation CSS keyframes ── */ +const ANIM_CSS = ` + @keyframes elevn-fade { 0%,100%{opacity:0} 30%,70%{opacity:1} } + @keyframes elevn-pop { 0%,100%{transform:scale(0.5);opacity:0} 40%{transform:scale(1.12)} 50%,80%{transform:scale(1);opacity:1} } + @keyframes elevn-bounce { 0%,100%{transform:translateY(0)} 50%{transform:translateY(-9px)} } + @keyframes elevn-type { from{clip-path:inset(0 100% 0 0)} to{clip-path:inset(0 0% 0 0)} } +`; + +function AnimPreview({ animation, config, text }: { animation: string; config: StyleConfig; text: string }) { + const s = 0.38; + const fontSize = Math.max(12, (config.font_size ?? 52) * s); + const outlineW = (config.outline_size ?? 2.5) * s; + const textShadow = outlineW > 0 + ? `${outlineW}px ${outlineW}px 0 ${config.outline_color ?? "#000"}, -${outlineW}px ${outlineW}px 0 ${config.outline_color ?? "#000"}, ${outlineW}px -${outlineW}px 0 ${config.outline_color ?? "#000"}, -${outlineW}px -${outlineW}px 0 ${config.outline_color ?? "#000"}` + : "none"; + + const animMap: Record = { + fade: "elevn-fade 2s ease-in-out infinite", + pop: "elevn-pop 1.8s ease-in-out infinite", + bounce: "elevn-bounce 1s ease-in-out infinite", + typewriter: "elevn-type 2.5s steps(20) infinite", + }; + + const baseStyle: React.CSSProperties = { + fontFamily: `"${config.font_family ?? "Noto Sans"}", sans-serif`, + fontSize: `${fontSize}px`, + fontWeight: config.bold ? "bold" : "normal", + fontStyle: config.italic ? "italic" : "normal", + color: config.primary_color ?? "#fff", + textShadow, + display: "inline-block", + whiteSpace: "nowrap", + animation: animMap[animation] ?? "none", + }; + + return ( + <> + +
+ {animation === "karaoke" ? ( + + {text} + + ) : ( + {text} + )} +
+ + ); +} + +export default function SubtitleDesigner({ config, onChange, subtitleLanguage, uiLang = "en" }: Props) { + const [colorPicker, setColorPicker] = useState(null); + const lbl = L[uiLang]; + const sampleText = SAMPLE_TEXTS[subtitleLanguage] ?? SAMPLE_TEXTS.default; + + const colorFields = [ + { key: "primary_color", label: lbl.primary }, + { key: "secondary_color", label: lbl.secondary }, + { key: "outline_color", label: lbl.outlineC }, + { key: "shadow_color", label: lbl.shadowC }, + ]; + + return ( +
+ + {/* Col 1 — TikTok preview */} +
+ +
+ + {/* Col 2 — Typography controls */} +
+ {/* Font */} +
+ + +
+ {/* Size */} +
+ + onChange({ font_size: +e.target.value })} + className="w-full mt-1 accent-violet-500" /> +
+ {/* Outline */} +
+ + onChange({ outline_size: +e.target.value })} + className="w-full mt-1 accent-violet-500" /> +
+ {/* Shadow */} +
+ + onChange({ shadow_size: +e.target.value })} + className="w-full mt-1 accent-violet-500" /> +
+ {/* B/I/U */} +
+ {[ + { key: "bold", label: "B", cls: "font-bold" }, + { key: "italic", label: "I", cls: "italic" }, + { key: "underline", label: "U", cls: "underline" }, + ].map(({ key, label, cls }) => ( + + ))} +
+ {/* Display mode */} +
+ {[{ id: "word", label: lbl.word }, { id: "sentence", label: lbl.sentence }].map((m) => ( + + ))} +
+
+ + {/* Col 3 — Colors, Animation, Position */} +
+ {/* Colors */} +
+ +
+ {colorFields.map(({ key, label }) => ( +
+ + {colorPicker === key && ( +
+
+ )[key] ?? "#ffffff"} + onChange={(c) => onChange({ [key]: c })} /> + +
+
+ )} +
+ ))} +
+
+ + {/* Animation preview + picker combined */} +
+ + +
+ {ANIMATIONS.map(({ id, Icon, en, th, zh }) => { + const animLabel = uiLang === "th" ? th : uiLang === "zh" ? zh : en; + return ( + + ); + })} +
+
+ + {/* Position */} +
+ +
+
+ {ALIGNMENTS.map((a) => ( + + ))} +
+
+ + onChange({ margin_v: +e.target.value })} + className="w-full accent-violet-500" /> +
+
+
+
+ +
+ ); +} diff --git a/frontend/components/VideoUpload.tsx b/frontend/components/VideoUpload.tsx new file mode 100644 index 0000000000000000000000000000000000000000..76e801085323267d8a46a9718cfb46fded1e22ae --- /dev/null +++ b/frontend/components/VideoUpload.tsx @@ -0,0 +1,133 @@ +"use client"; +import { useState, useCallback } from "react"; +import { useDropzone } from "react-dropzone"; +import { Upload, CheckCircle, Film } from "lucide-react"; + +const L = { + en: { + upload: "Upload File", + replace: "Click or drag a new file to replace", + drag: "Drag video here, or click to select", + channelLabel: "Channel Description", + channelOpt: "(optional — helps AI analyze better)", + channelPlaceholder: "e.g. English gaming channel focused on funny reactions and horror games", + accessLabel: "Demo Access Code", + accessOpt: "(required for GPU generation when enabled)", + accessPlaceholder: "Enter the code shared by the team", + }, + th: { + upload: "อัปโหลดไฟล์", + replace: "คลิกหรือลากไฟล์ใหม่เพื่อเปลี่ยน", + drag: "ลากวิดีโอมาวางที่นี่ หรือคลิกเพื่อเลือก", + channelLabel: "คำอธิบายช่อง", + channelOpt: "(ไม่บังคับ — ช่วย AI วิเคราะห์ได้ดีขึ้น)", + channelPlaceholder: "เช่น: ช่องเกมมิ่งภาษาไทย เน้นตลก reaction และ horror game", + accessLabel: "รหัสเข้าใช้เดโม", + accessOpt: "(จำเป็นเมื่อเปิดการป้องกัน GPU)", + accessPlaceholder: "ใส่รหัสที่ทีมแชร์ให้", + }, + zh: { + upload: "上传文件", + replace: "点击或拖入新文件以替换", + drag: "将视频拖到此处,或点击选择", + channelLabel: "频道描述", + channelOpt: "(可选 — 帮助 AI 更好地分析)", + channelPlaceholder: "例:中文游戏频道,专注搞笑反应和恐怖游戏", + accessLabel: "演示访问码", + accessOpt: "(启用 GPU 保护时需要)", + accessPlaceholder: "输入团队分享的访问码", + }, +} as const; + +type Lang = keyof typeof L; + +interface Props { + onFileSelect: (file: File) => void; + onChannelDesc: (desc: string) => void; + channelDesc: string; + accessCode: string; + onAccessCode: (code: string) => void; + uiLang?: Lang; +} + +export default function VideoUpload({ + onFileSelect, + onChannelDesc, + channelDesc, + 
accessCode, + onAccessCode, + uiLang = "en", +}: Props) { + const lbl = L[uiLang]; + const [fileName, setFileName] = useState(""); + + const onDrop = useCallback((files: File[]) => { + if (files[0]) { + setFileName(files[0].name); + onFileSelect(files[0]); + } + }, [onFileSelect]); + + const { getRootProps, getInputProps, isDragActive } = useDropzone({ + onDrop, + accept: { "video/*": [".mp4", ".mov", ".avi", ".mkv", ".webm"] }, + maxFiles: 1, + }); + + return ( +
+ {/* Drop zone */} +
+ + {fileName ? ( +
+ +

{fileName}

+

{lbl.replace}

+
+ ) : ( +
+ +

{lbl.drag}

+

MP4, MOV, AVI, MKV, WebM

+
+ )} +
+ + {/* Channel description */} +
+ +