JakgritB committed on
Commit
102f4d2
·
1 Parent(s): 89e1dc4

Deploy safe hackathon demo

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.example +0 -18
  2. .github/workflows/sync-to-hf.yml +0 -22
  3. .gitignore +28 -16
  4. Dockerfile +53 -4
  5. LICENSE +1 -1
  6. README.md +190 -433
  7. backend/Dockerfile +0 -17
  8. backend/app/__init__.py +0 -1
  9. backend/app/core/__init__.py +0 -1
  10. backend/app/core/config.py +0 -68
  11. backend/app/core/timing.py +0 -20
  12. backend/app/main.py +0 -240
  13. backend/app/models/__init__.py +0 -1
  14. backend/app/models/schemas.py +0 -127
  15. backend/app/services/__init__.py +0 -1
  16. backend/app/services/clips.py +0 -219
  17. backend/app/services/highlight.py +0 -434
  18. backend/app/services/multimodal.py +0 -200
  19. backend/app/services/pipeline.py +0 -236
  20. backend/app/services/subtitles.py +0 -151
  21. backend/app/services/transcription.py +0 -366
  22. backend/app/services/video_input.py +0 -80
  23. backend/app/storage.py +0 -58
  24. backend/app/utils/__init__.py +0 -1
  25. backend/app/utils/rocm.py +0 -33
  26. backend/app/workers/__init__.py +0 -1
  27. backend/app/workers/celery_app.py +0 -15
  28. backend/main.py +466 -0
  29. backend/pyproject.toml +0 -44
  30. backend/requirements.txt +37 -0
  31. backend/src/__init__.py +0 -0
  32. backend/src/analysis/__init__.py +0 -0
  33. backend/src/analysis/highlight_scorer.py +166 -0
  34. backend/src/analysis/scene_detector.py +111 -0
  35. backend/src/analysis/vision.py +305 -0
  36. backend/src/gpu/__init__.py +0 -0
  37. backend/src/gpu/rocm_utils.py +92 -0
  38. backend/src/gpu/vllm_manager.py +208 -0
  39. backend/src/ingestion/__init__.py +0 -0
  40. backend/src/ingestion/uploader.py +34 -0
  41. backend/src/ingestion/youtube.py +147 -0
  42. backend/src/processing/__init__.py +0 -0
  43. backend/src/processing/clip_extractor.py +131 -0
  44. backend/src/processing/emoji_overlay.py +36 -0
  45. backend/src/processing/high_retention.py +491 -0
  46. backend/src/processing/subtitle.py +291 -0
  47. backend/src/transcription/__init__.py +0 -0
  48. backend/src/transcription/whisper.py +234 -0
  49. deploy/setup_droplet.sh +87 -0
  50. deploy/start_fastapi.sh +10 -0
.env.example DELETED
@@ -1,18 +0,0 @@
1
- DEMO_MODE=true
2
- STORAGE_DIR=backend/data
3
- FRONTEND_ORIGIN=http://localhost:5173
4
-
5
- WHISPER_MODEL_ID=openai/whisper-large-v3
6
- QWEN_TEXT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
7
- QWEN_VL_MODEL_ID=Qwen/Qwen2-VL-7B-Instruct
8
- HF_TOKEN=
9
-
10
- TARGET_CLIP_COUNT=5
11
- MAX_CLIPS=10
12
- FFMPEG_BINARY=ffmpeg
13
- FFPROBE_BINARY=ffprobe
14
- FFMPEG_VIDEO_CODEC=h264_amf
15
- FFMPEG_CPU_CODEC=libx264
16
-
17
- REDIS_URL=redis://redis:6379/0
18
- CELERY_ENABLED=false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.github/workflows/sync-to-hf.yml DELETED
@@ -1,22 +0,0 @@
1
- name: Sync to Hugging Face Space
2
-
3
- on:
4
- push:
5
- branches:
6
- - main
7
-
8
- jobs:
9
- sync:
10
- runs-on: ubuntu-latest
11
- steps:
12
- - uses: actions/checkout@v4
13
- with:
14
- fetch-depth: 0
15
- lfs: true
16
-
17
- - name: Push to Hugging Face Space
18
- env:
19
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
- run: |
21
- git remote add space https://JakgritB:$HF_TOKEN@huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI
22
- git push space main --force
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -1,25 +1,37 @@
1
- .env
2
- .hf-home/
3
- .venv/
4
- .python_packages/
5
  __pycache__/
6
- *.pyc
7
  *.egg-info/
8
- .pytest_cache/
9
- .mypy_cache/
10
- .ruff_cache/
11
-
12
- node_modules/
13
  dist/
14
- .vite/
15
 
16
- data/
17
- backend/data/
18
- tmp/
19
- pip-tmp/
 
 
 
 
 
 
 
20
  *.log
21
 
22
- hf-space-live/
 
 
 
 
 
 
 
 
 
 
23
 
 
24
  .DS_Store
25
  Thumbs.db
 
1
+ # Python
 
 
 
2
  __pycache__/
3
+ *.py[cod]
4
  *.egg-info/
5
+ .venv/
6
+ venv/
 
 
 
7
  dist/
8
+ build/
9
 
10
+ # Node
11
+ frontend/node_modules/
12
+ frontend/.next/
13
+ frontend/out/
14
+
15
+ # Temp files
16
+ graphify-out/
17
+ /tmp/
18
+ *.wav
19
+ *.mp4
20
+ *.ass
21
  *.log
22
 
23
+ # Env
24
+ .env
25
+ .env.local
26
+ .env.*.local
27
+
28
+ # SSH keys (never commit)
29
+ *.pem
30
+ *_key
31
+ *_key.pub
32
+ id_rsa*
33
+ id_ed25519*
34
 
35
+ # OS
36
  .DS_Store
37
  Thumbs.db
Dockerfile CHANGED
@@ -1,9 +1,58 @@
1
- FROM python:3.11-slim
 
 
2
  WORKDIR /app
3
 
4
- RUN pip install --no-cache-dir fastapi uvicorn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- COPY landing.py ./landing.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  EXPOSE 7860
9
- CMD ["uvicorn", "landing:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
1
+ # ElevenClip AI — HuggingFace Spaces (AMD ROCm)
2
+ FROM rocm/pytorch:rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
3
+
4
  WORKDIR /app
5
 
6
+ # System dependencies
7
+ RUN apt-get update && apt-get install -y \
8
+ ffmpeg \
9
+ nginx \
10
+ curl \
11
+ git \
12
+ nodejs \
13
+ npm \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
+ # ─── Backend Python dependencies ───────────────────────────────────────────
17
+ COPY backend/requirements.txt /app/backend/requirements.txt
18
+ RUN pip install --no-cache-dir -r /app/backend/requirements.txt
19
+
20
+ # vLLM with ROCm support
21
+ RUN pip install --no-cache-dir \
22
+ "vllm>=0.6.0" \
23
+ --extra-index-url https://download.pytorch.org/whl/rocm6.2
24
+
25
+ COPY backend/ /app/backend/
26
+
27
+ # ─── Frontend (Next.js standalone build) ──────────────────────────────────
28
+ COPY frontend/package*.json /app/frontend/
29
+ RUN cd /app/frontend && npm ci --production=false
30
+
31
+ COPY frontend/ /app/frontend/
32
 
33
+ # Relative API URL — nginx proxies /api/* and /ws/* to FastAPI :8080
34
+ ENV NEXT_PUBLIC_API_URL=""
35
+ ENV NEXT_PUBLIC_DEMO_ENABLED="true"
36
+ ENV NEXT_PUBLIC_DEMO_ONLY="true"
37
+
38
+ RUN cd /app/frontend && npm run build
39
+
40
+ # ─── nginx config ──────────────────────────────────────────────────────────
41
+ COPY nginx.conf /app/nginx.conf
42
+
43
+ # ─── Runtime directories ───────────────────────────────────────────────────
44
+ RUN mkdir -p /tmp/elevnclip /root/.cache/huggingface /root/ElevenClip-AI/demo_videos
45
+
46
+ # ─── Startup ──────────────────────────────────────────────────────────────
47
+ COPY start.sh /app/start.sh
48
+ RUN chmod +x /app/start.sh
49
 
50
  EXPOSE 7860
51
+
52
+ # vLLM managed on-demand by vllm_manager.py (not started at container startup)
53
+ ENV VLLM_ON_DEMAND="true"
54
+ ENV VLLM_PORT="8000"
55
+ ENV VLLM_IDLE_TIMEOUT="300"
56
+ ENV VLLM_DOCKER_CONTAINER=""
57
+
58
+ CMD ["/app/start.sh"]
LICENSE CHANGED
@@ -1,6 +1,6 @@
1
  MIT License
2
 
3
- Copyright (c) 2026 ElevenClip.AI contributors
4
 
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
  of this software and associated documentation files (the "Software"), to deal
 
1
  MIT License
2
 
3
+ Copyright (c) 2026 ElevenClip AI
4
 
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
  of this software and associated documentation files (the "Software"), to deal
README.md CHANGED
@@ -1,498 +1,255 @@
1
- ---
2
- title: ElevenClip AI
3
- emoji: 🎬
4
- colorFrom: purple
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- # ElevenClip.AI
11
-
12
- ElevenClip.AI is an AI-powered clip studio for turning long-form videos into personalized short-form content for TikTok, YouTube Shorts, and Instagram Reels.
13
-
14
- This project is built for the **AMD Developer Hackathon** on lablab.ai, targeting **Track 3: Vision & Multimodal AI**. The system is designed to run on **AMD Developer Cloud** with **ROCm** and **AMD Instinct MI300X** acceleration, while using **Hugging Face** as the model hub/deployment layer and **Qwen** models for profile-aware highlight reasoning.
15
-
16
- ## One-Sentence Pitch
17
-
18
- ElevenClip.AI helps creators convert long videos into ready-to-edit short clips by combining Whisper transcription, Qwen highlight detection, optional Qwen-VL visual understanding, ffmpeg rendering, and a human-in-the-loop clip editor.
19
-
20
- ## Problem
21
-
22
- Long-form creators, podcasters, educators, streamers, and marketing teams often publish hours of video but still need short clips for modern discovery platforms.
23
-
24
- The manual workflow is painful:
25
-
26
- - Watch the full video.
27
- - Find high-retention moments.
28
- - Trim each clip.
29
- - Rewrite subtitles.
30
- - Reframe to vertical 9:16.
31
- - Export platform-ready MP4 files.
32
-
33
- For a two-hour video, this can take several hours of editing time. The bottleneck is not just cutting video; it is understanding which moments match the creator's audience, channel style, language, and target platform.
34
-
35
- ## Solution
36
-
37
- ElevenClip.AI automates the first pass of short-form production:
38
-
39
- 1. The creator sets up a reusable channel profile.
40
- 2. The creator provides a YouTube URL or uploads a video file.
41
- 3. Whisper Large V3 transcribes the video, including Thai and multilingual speech.
42
- 4. Qwen2.5 analyzes the transcript and scores candidate highlights based on engagement potential and the creator profile.
43
- 5. Optional Qwen2-VL analysis can enrich the scores with visual signals such as reactions, scene changes, and on-screen text.
44
- 6. ffmpeg renders vertical clips with subtitle files and burn-in support.
45
- 7. The React editor lets the human approve, delete, trim, regenerate, and edit subtitles before download.
46
 
47
- The product is intentionally human-AI collaborative: AI finds and prepares the clips quickly, while the creator keeps editorial control.
48
 
49
- ## Hackathon Alignment
50
 
51
- ### Track
 
 
 
52
 
53
- **Track 3: Vision & Multimodal AI**
54
-
55
- ElevenClip.AI processes multiple media types:
56
-
57
- - Audio: speech transcription with Whisper Large V3.
58
- - Text: transcript reasoning and highlight ranking with Qwen2.5.
59
- - Video: frame-aware multimodal analysis with Qwen2-VL as the next pipeline stage.
60
- - Rendered media: ffmpeg exports platform-ready video clips.
61
-
62
- ### AMD Technology
63
-
64
- The production target is AMD Developer Cloud:
65
-
66
- - **AMD Instinct MI300X** for high-throughput model inference.
67
- - **ROCm 6.x** as the GPU software stack.
68
- - **PyTorch with ROCm support** for Whisper inference.
69
- - **vLLM ROCm backend** for fast Qwen2.5 inference.
70
- - **Optimum-AMD** as an optimization path for Hugging Face models on AMD hardware.
71
- - **ffmpeg hardware acceleration hooks** for faster video encoding where available.
72
-
73
- The app has a local `DEMO_MODE=true` path so judges and teammates can inspect the UI/API without downloading large models. On AMD Developer Cloud, set `DEMO_MODE=false` to activate the real model stack.
74
 
75
- ### Hugging Face Integration
76
 
77
- Hugging Face is used as the model hub and deployment layer:
78
 
79
- - `openai/whisper-large-v3` for transcription.
80
- - `Qwen/Qwen2.5-7B-Instruct` for highlight analysis.
81
- - `Qwen/Qwen2-VL-7B-Instruct` for multimodal video understanding.
82
- - Public Hugging Face Space for the hackathon demo page:
83
- `https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI`
84
 
85
- ### Qwen Integration
86
 
87
- Qwen is not used as a generic chatbot. It is part of the core product logic:
88
 
89
- - Reads timestamped transcript segments.
90
- - Considers creator profile settings.
91
- - Scores engagement potential.
92
- - Explains why a segment should become a clip.
93
- - Returns structured JSON with timestamps, titles, scores, reasons, and subtitle text.
 
 
94
 
95
- ## Current MVP Features
96
 
97
- - Channel profile onboarding:
98
- - niche
99
- - preferred clip style
100
- - preferred clip length
101
- - primary language
102
- - target platform
103
- - YouTube URL ingestion through `yt-dlp`.
104
- - Direct video upload endpoint.
105
- - Whisper transcription service boundary.
106
- - Qwen highlight detection service boundary.
107
- - Optional Qwen2-VL multimodal analysis service boundary.
108
- - ffmpeg clip generation with subtitle file creation.
109
- - Vertical 9:16 export path for TikTok, Shorts, and Reels.
110
- - Human-AI review UI:
111
- - trim start/end
112
- - edit subtitles inline
113
- - approve clips
114
- - delete clips
115
- - regenerate a clip
116
- - download MP4 output
117
- - Timing logs for benchmark demos.
118
- - Docker and AMD Cloud deployment notes.
119
-
120
- ## Architecture
121
-
122
- ```mermaid
123
- flowchart LR
124
- A["Creator Profile"] --> D["Qwen2.5 Highlight Scoring"]
125
- B["YouTube URL"] --> C["yt-dlp / Video Input"]
126
- B2["Uploaded Video"] --> C
127
- C --> W["Whisper Large V3 Transcription"]
128
- W --> D
129
- C --> V["Qwen2-VL Visual Analysis (Optional)"]
130
- D --> R["Clip Plan JSON"]
131
- V --> R
132
- R --> F["ffmpeg Clip Rendering + Subtitles"]
133
- F --> E["React Human-AI Editor"]
134
- E --> O["Approved Short-Form Clips"]
135
  ```
 
136
 
137
- ## Repository Structure
138
-
139
- ```text
140
- .
141
- ├── backend/
142
- │ ├── app/
143
- │ │ ├── core/ # configuration and timing instrumentation
144
- │ │ ├── models/ # Pydantic request/response schemas
145
- │ │ ├── services/ # ingest, transcription, Qwen scoring, subtitles, rendering
146
- │ │ ├── utils/ # ROCm / accelerator detection
147
- │ │ ├── workers/ # optional Celery wiring
148
- │ │ ├── main.py # FastAPI application
149
- │ │ └── storage.py # file-backed job storage for MVP
150
- │ ├── Dockerfile
151
- │ └── pyproject.toml
152
- ├── frontend/
153
- │ ├── src/
154
- │ │ ├── App.jsx # creator workflow and clip editor
155
- │ │ ├── main.jsx
156
- │ │ └── styles.css
157
- │ ├── Dockerfile
158
- │ └── package.json
159
- ├── infra/
160
- │ └── amd-cloud.md # AMD Developer Cloud deployment guide
161
- ├── scripts/
162
- │ └── benchmark.py # end-to-end API benchmark helper
163
- ├── docker-compose.yml
164
- └── README.md
165
  ```
166
 
167
- ## Processing Pipeline
168
-
169
- ### 1. Video Input
170
-
171
- The backend accepts:
172
-
173
- - YouTube URL through `POST /api/jobs/youtube`
174
- - Uploaded video file through `POST /api/jobs/upload`
175
-
176
- In production, YouTube videos are downloaded with `yt-dlp`. In demo mode, the app can generate a synthetic ffmpeg test video so the workflow can be tested without external downloads.
177
-
178
- ### 2. Transcription
179
-
180
- The transcription service is implemented in `backend/app/services/transcription.py`.
181
-
182
- Production target:
183
-
184
- - Model: `openai/whisper-large-v3`
185
- - Runtime: Hugging Face Transformers
186
- - Accelerator: PyTorch ROCm on AMD MI300X
187
- - Language goal: Thai and multilingual support
188
-
189
- ### 3. Highlight Detection
190
-
191
- The highlight detector is implemented in `backend/app/services/highlight.py`.
192
-
193
- Production target:
194
-
195
- - Model: `Qwen/Qwen2.5-7B-Instruct`
196
- - Runtime: vLLM with ROCm backend
197
- - Output: strict structured JSON
198
 
199
- Highlight scoring considers:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- - questions
202
- - punchlines
203
- - emotional peaks
204
- - key information
205
- - channel niche
206
- - preferred clip style
207
- - target platform
208
- - target clip length
209
 
210
- ### 4. Multimodal Analysis
211
 
212
- The multimodal service boundary is implemented in `backend/app/services/multimodal.py`.
 
 
 
 
 
 
213
 
214
- Planned production target:
215
 
216
- - Model: `Qwen/Qwen2-VL-7B-Instruct`
217
- - Inputs: sampled video frames, transcript context, and clip candidates
218
- - Visual signals:
219
- - creator or guest reactions
220
- - scene changes
221
- - on-screen text
222
- - high-motion segments
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
- This is isolated as a replaceable pipeline step so it can be enabled when AMD Cloud resources are available.
225
 
226
- ### 5. Clip Generation
227
 
228
- Clip rendering is implemented in `backend/app/services/clips.py`.
 
 
 
 
 
 
229
 
230
- The ffmpeg stage:
231
 
232
- - cuts video by selected timestamps
233
- - exports MP4
234
- - creates `.srt` subtitle files
235
- - supports subtitle burn-in
236
- - reformats to 9:16 vertical output for short-form platforms
237
- - includes AMD hardware encoder configuration hooks
 
 
 
 
 
 
 
 
 
238
 
239
- ### 6. Human-AI Collaborative Editing
240
 
241
- The frontend editor lets creators review AI-generated clips and make final decisions:
242
 
243
- - adjust start and end timestamps
244
- - edit subtitle text
245
- - delete weak clips
246
- - approve good clips
247
- - regenerate a specific clip
248
- - download the result
249
 
250
- ## API Overview
 
 
 
 
251
 
252
- | Method | Endpoint | Description |
253
- | --- | --- | --- |
254
- | `GET` | `/health` | Returns service health and accelerator detection. |
255
- | `POST` | `/api/jobs/youtube` | Creates a processing job from a YouTube URL. |
256
- | `POST` | `/api/jobs/upload` | Creates a processing job from an uploaded video. |
257
- | `GET` | `/api/jobs/{job_id}` | Returns status, transcript, clips, timings, and errors. |
258
- | `PATCH` | `/api/jobs/{job_id}/clips/{clip_id}` | Updates trim times, subtitles, approval, or deletion state. |
259
- | `POST` | `/api/jobs/{job_id}/clips/{clip_id}/regenerate` | Re-renders one clip with updated parameters. |
260
- | `GET` | `/api/jobs/{job_id}/clips/{clip_id}/download` | Downloads an exported clip. |
261
 
262
  ## Local Development
263
 
264
- ### Requirements
265
-
266
- - Python 3.11+
267
- - Node.js 20+
268
- - ffmpeg
269
-
270
- ### Backend
271
 
272
- ```bash
273
- cd backend
274
- python -m venv .venv
275
- . .venv/bin/activate
276
- pip install -e .
277
- uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
278
- ```
279
-
280
- On Windows PowerShell:
281
-
282
- ```powershell
283
- cd backend
284
- python -m venv .venv
285
- .\.venv\Scripts\Activate.ps1
286
- pip install -e .
287
- uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
288
  ```
289
 
290
- ### Frontend
291
-
292
  ```bash
293
  cd frontend
294
  npm install
295
- npm run dev
296
  ```
297
 
298
- Open:
299
-
300
- ```text
301
- http://localhost:5173
302
- ```
303
 
304
- ### Demo Mode
305
-
306
- By default, the project runs in demo mode:
307
-
308
- ```env
309
- DEMO_MODE=true
310
- ```
311
 
312
- Demo mode avoids downloading multi-GB AI models and returns deterministic mock transcript/highlight data while still exercising the API, UI, job state, timing logs, subtitle generation, and ffmpeg rendering path.
313
 
314
- ## AMD Developer Cloud Deployment
315
 
316
- See [infra/amd-cloud.md](infra/amd-cloud.md) for a focused deployment guide.
 
 
 
 
317
 
318
- High-level steps:
319
-
320
- ```bash
321
- git clone https://github.com/JakgritB/ElevenClip.AI.git
322
- cd ElevenClip.AI
323
- cp .env.example .env
324
- ```
325
-
326
- Edit `.env`:
327
 
328
  ```env
329
- DEMO_MODE=false
330
- HF_TOKEN=your_huggingface_token
331
- WHISPER_MODEL_ID=openai/whisper-large-v3
332
- QWEN_TEXT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
333
- QWEN_VL_MODEL_ID=Qwen/Qwen2-VL-7B-Instruct
334
- ```
335
-
336
- Install the AI/ROCm stack on the AMD instance:
337
-
338
- ```bash
339
- cd backend
340
- pip install -e ".[ai,rocm-inference]"
341
- ```
342
-
343
- Start the API:
344
-
345
- ```bash
346
- uvicorn app.main:app --host 0.0.0.0 --port 8000
347
- ```
348
-
349
- Validate accelerator detection:
350
-
351
- ```bash
352
- curl http://localhost:8000/health
353
- ```
354
-
355
- Expected on AMD Cloud:
356
-
357
- - `torch_available: true`
358
- - `cuda_api_available: true`
359
- - `rocm_hip_version` populated
360
- - MI300X visible as the active device
361
-
362
- ## Docker
363
-
364
- ```bash
365
- docker compose up --build
366
  ```
367
 
368
- For AMD Developer Cloud with ROCm extras:
369
 
370
  ```bash
371
- docker compose build --build-arg INSTALL_EXTRAS=.[ai,rocm-inference] backend
372
- docker compose up
 
 
373
  ```
374
 
375
- The compose file mounts AMD GPU devices (`/dev/kfd`, `/dev/dri`) and uses host IPC for large-model inference.
376
-
377
- ## Benchmark Plan
378
-
379
- The hackathon judges care about technology application and real-world performance. ElevenClip.AI includes step-level timing logs so the demo can show why AMD acceleration matters.
380
-
381
- Run a benchmark against a running API:
382
-
383
- ```bash
384
- python scripts/benchmark.py \
385
- --api http://localhost:8000 \
386
- --youtube-url "https://youtube.com/watch?v=..."
387
- ```
388
-
389
- Recommended benchmark comparison:
390
-
391
- | Scenario | Hardware | Expected Purpose |
392
- | --- | --- | --- |
393
- | CPU baseline | CPU-only runtime | Show the pain of long-form video processing without acceleration. |
394
- | AMD GPU run | AMD Instinct MI300X + ROCm | Show high-throughput transcription and Qwen inference. |
395
-
396
- Metrics captured:
397
-
398
- - input/download time
399
- - transcription time
400
- - highlight detection time
401
- - multimodal analysis time
402
- - clip generation time
403
- - total wall-clock time
404
- - number of clips generated
405
 
406
- Demo target:
407
 
408
- - input: two-hour creator video
409
- - output: 10 subtitle-ready clips
410
- - goal: under 10 minutes on MI300X
411
 
412
- ## Submission Assets Checklist
413
-
414
- The lablab.ai submission asks for:
415
-
416
- - Project title: `ElevenClip.AI`
417
- - Short description
418
- - Long description
419
- - Technology and category tags
420
- - Cover image
421
- - Video presentation
422
- - Slide presentation
423
- - Public GitHub repository
424
- - Demo application platform
425
- - Application URL
426
-
427
- Prepared submission docs:
428
-
429
- - `docs/SUBMISSION.md` - copy-ready project text for lablab.ai.
430
- - `docs/DEMO_SCRIPT.md` - draft and final recording script.
431
- - `docs/PITCH_DECK.md` - slide outline for the presentation deck.
432
- - `docs/BUILD_IN_PUBLIC.md` - social post drafts and AMD feedback notes.
433
- - `docs/AMD_CREDIT_RUNBOOK.md` - checklist for the first MI300X run.
434
-
435
- Recommended tags:
436
-
437
- ```text
438
- AMD, ROCm, MI300X, AMD Developer Cloud, Vision AI, Multimodal AI, Video AI, Whisper, Qwen, Qwen-VL, Hugging Face, FastAPI, React
439
- ```
440
-
441
- ## Suggested Short Description
442
-
443
- ```text
444
- ElevenClip.AI turns long-form videos into personalized short-form clips using Whisper, Qwen, Hugging Face, and AMD ROCm on MI300X.
445
- ```
446
-
447
- ## Suggested Long Description
448
-
449
- ```text
450
- ElevenClip.AI is a human-AI collaborative clip studio for creators. It takes a YouTube URL or uploaded long-form video, transcribes it with Whisper Large V3, uses Qwen2.5 to identify high-engagement highlight moments based on a reusable channel profile, optionally enriches candidates with Qwen2-VL visual analysis, and renders short-form MP4 clips with subtitles using ffmpeg. The React editor lets creators trim, edit subtitles, approve, delete, regenerate, and download final clips. The project is designed for AMD Developer Cloud with ROCm and AMD Instinct MI300X acceleration, demonstrating how high-throughput multimodal AI can reduce hours of manual editing into a fast creator workflow.
451
- ```
452
-
453
- ## Judging Criteria Mapping
454
-
455
- ### Application of Technology
456
-
457
- ElevenClip.AI integrates Whisper, Qwen2.5, Qwen2-VL, Hugging Face, ROCm, vLLM, and AMD Developer Cloud into an end-to-end video processing product.
458
-
459
- ### Presentation
460
-
461
- The demo is designed to be visual and easy to understand: input a long video, watch AI create candidates, edit clips, and download platform-ready MP4 files.
462
-
463
- ### Business Value
464
-
465
- The product targets a real creator economy workflow. Creators, agencies, podcasters, educators, and streamers all need short-form repurposing, and manual editing is expensive.
466
-
467
- ### Originality
468
-
469
- The system goes beyond generic clipping by personalizing highlight selection to a creator's niche, style, language, clip length, and platform. It also preserves human editorial control instead of fully automating final publishing.
470
-
471
- ## Build-in-Public Plan
472
-
473
- The hackathon includes a build-in-public challenge. Suggested updates:
474
-
475
- 1. Share the architecture and first local demo.
476
- 2. Share AMD Cloud/ROCm setup notes and benchmark results.
477
- 3. Publish meaningful feedback about ROCm, AMD Developer Cloud, or inference setup.
478
-
479
- Suggested hashtags/topics:
480
 
481
- ```text
482
- #AMDDeveloperHackathon #ROCm #MI300X #HuggingFace #Qwen #VideoAI #MultimodalAI
483
- ```
484
 
485
- ## Roadmap
 
 
 
 
 
 
 
 
 
486
 
487
- - Real Whisper Large V3 run on AMD Developer Cloud.
488
- - Real Qwen2.5 vLLM ROCm inference path.
489
- - Qwen2-VL frame sampling and visual scoring.
490
- - Batch export for 10+ clips.
491
- - Subtitle styling presets per platform.
492
- - Creator profile memory and reusable brand presets.
493
- - Hugging Face Space screenshot and richer project media.
494
- - CPU vs MI300X benchmark report after AMD credits arrive.
495
 
496
  ## License
497
 
498
- MIT. See [LICENSE](LICENSE).
 
1
+ # ElevenClip AI ✂️
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ > **AMD Developer Hackathon 2026 — Track 3: Vision & Multimodal AI**
4
 
5
+ Turn livestream recordings or uploaded videos into TikTok-ready highlight clips using **true multimodal AI** — vision, audio, and text analyzed simultaneously on AMD Instinct MI300X.
6
 
7
+ [![HuggingFace Space](https://img.shields.io/badge/🤗-HuggingFace%20Space-yellow)](https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI)
8
+ [![AMD ROCm](https://img.shields.io/badge/AMD-ROCm%206.3-red)](https://rocm.docs.amd.com/)
9
+ [![Qwen2.5-VL](https://img.shields.io/badge/Qwen2.5--VL-7B%20Instruct-blue)](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)
10
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green)](LICENSE)
11
 
12
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ ## Demo
15
 
16
+ > Try it live: [HuggingFace Space](https://huggingface.co/spaces/lablab-ai-amd-developer-hackathon/ElevenClip-AI)
17
 
18
+ ---
 
 
 
 
19
 
20
+ ## What It Does
21
 
22
+ ElevenClip AI ingests an uploaded video and automatically finds the best moments to clip for TikTok using three AI modalities working together. The backend also keeps optional yt-dlp/YouTube support, but the public demo focuses on uploads because public video platforms can trigger anti-bot restrictions.
23
 
24
+ | Modality | Model | What it detects |
25
+ |---|---|---|
26
+ | **Vision** | Qwen2.5-VL-7B on ROCm | Excitement, faces, action type, humor, TikTok potential |
27
+ | **Audio** | insanely-fast-whisper (ROCm) | Word-level transcript + language detection |
28
+ | **Audio Signal** | librosa | RMS energy — loud/quiet moments |
29
+ | **Vision+Text** | Qwen2.5-VL (multimodal) | Frame + transcript context fused together |
30
+ | **Text** | Python keyword scorer + Qwen2.5-VL text prompt | Style keyword matching, emoji selection |
31
 
32
+ ### Highlight Scoring Formula
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ```
35
+ final_score = 0.40 × vision_score + 0.35 × audio_energy + 0.25 × text_keywords
36
 
37
+ where:
38
+ vision_score = 0.5 × excitement + 0.3 × tiktok_potential + 0.2 × humor_level
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  ```
40
 
41
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ ## AI Pipeline
44
+
45
+ ```
46
+ ┌─ Input ──────────────────────────────────────────────────────────┐
47
+ │ Uploaded video file (YouTube backend support is optional) │
48
+ └──────────────────────────────────────────────────────────────────┘
49
+
50
+
51
+ ┌─ Audio Extraction (ffmpeg) ──────────────────────────────────────┐
52
+ │ 16kHz mono WAV for Whisper │
53
+ └──────────────────────────────────────────────────────────────────┘
54
+
55
+ ┌──────┴──────┐
56
+ │ │ ← PARALLEL on AMD GPU ─────────────────────────
57
+ ▼ ▼
58
+ ┌─ Scene ┌─ Whisper ROCm ────────────────────────────────────┐
59
+ │ Detection │ insanely-fast-whisper (SDPA attention, 4.45×) │
60
+ │ PyScene │ → transcript + word-level timestamps │
61
+ │ Detect │ → auto language detection │
62
+ └─────┬──────┴───────────────────────────────────────────────────┘
63
+ │ │
64
+ ▼ ▼
65
+ ┌─ Frame Sampling ──────────────────────────────────────────────────┐
66
+ │ 3 frames per scene (20%, 50%, 80% of scene) │
67
+ └──────────────────────────────────────────────────────────────────┘
68
+
69
+ ▼ ← CONCURRENT requests to vLLM ──────────────────────
70
+ ┌─ Qwen2.5-VL Multimodal Analysis ───────────────────────────────────┐
71
+ │ Input per scene: [frame1] [frame2] [frame3] + transcript text │
72
+ │ Output: excitement_score, tiktok_potential, face_bbox, │
73
+ │ emotion, action_type, humor_level, highlight_reason │
74
+ │ All scenes sent concurrently — vLLM batches on AMD MI300X │
75
+ └──────────────────────────────────────────────────────────────────┘
76
+
77
+
78
+ ┌─ Multi-Signal Scoring ────────────────────────────────────────────┐
79
+ │ score = 0.40×vision + 0.35×audio_energy + 0.25×text_keywords │
80
+ │ Select top-N non-overlapping clips (min 30s gap) │
81
+ └──────────────────────────────────────────────────────────────────┘
82
+
83
+
84
+ ┌─ Branch ──────────────────────────────────────────────────────────┐
85
+ │ │
86
+ │ Normal Mode HRE (High-Retention Editing) │
87
+ │ ───────────── ────────────────────────────── │
88
+ │ • pysubs2 ASS • Silence removal (ffmpeg) │
89
+ │ • User style config • Auto-zoom to face (zoompan) │
90
+ │ • Font/color/animation • Jump cuts at boundaries │
91
+ │ • Karaoke/pop/fade • Qwen2.5-VL emoji selection │
92
+ │ • AMD AMF encode • Impact bold captions │
93
+ └──────────────────────────────────────────────────────────────────┘
94
+
95
+
96
+ ┌─ Editor (/editor) ────────────────────────────────────────────────┐
97
+ │ • Per-clip subtitle timeline editing │
98
+ │ • Global style override (live preview) │
99
+ │ • Re-render + download MP4 │
100
+ └──────────────────────────────────────────────────────────────────┘
101
+ ```
102
 
103
+ ---
 
 
 
 
 
 
 
104
 
105
+ ## AMD GPU Optimizations
106
 
107
+ - **ROCm 6.3** — all model inference on AMD Instinct MI300X
108
+ - **vLLM** — serves Qwen2.5-VL with continuous batching and PagedAttention
109
+ - **SDPA attention** — PyTorch 2.0 Scaled Dot-Product Attention for Whisper (4.45× faster on ROCm)
110
+ - **float16 inference** — 7B model fits in ~14 GB VRAM, leaves 50+ GB for large videos
111
+ - **h264_amf** — AMD VCE hardware encoder for clip extraction (falls back to libx264)
112
+ - **Parallel pipeline** — scene detection (CPU) + Whisper (GPU) run simultaneously
113
+ - **Concurrent vLLM requests** — all scenes sent to Qwen2.5-VL in parallel; server batches them
114
 
115
+ ---
116
 
117
+ ## Two Output Modes
118
+
119
+ ### Normal Subtitles
120
+ Full creative control over:
121
+ - Font family (Noto Sans Thai, Noto Sans SC, Montserrat, Impact, ...)
122
+ - Font size, bold/italic/underline
123
+ - 4-layer ASS colors: primary, secondary, outline, shadow
124
+ - Display mode: word-by-word or sentence
125
+ - Animation: Fade / Karaoke / Pop / Typewriter / Bounce
126
+ - Alignment (3×3 grid) + margin sliders
127
+ - Per-subtitle-line style overrides in the editor
128
+
129
+ ### High-Retention Editing (HRE)
130
+ AI chooses everything:
131
+ - Silence removal (`ffmpeg silenceremove`)
132
+ - Auto-zoom to face region (`ffmpeg zoompan` using Qwen2.5-VL face_bbox)
133
+ - Jump cuts at scene boundaries
134
+ - Qwen2.5-VL selects contextually-appropriate emoji overlay
135
+ - Impact 64px bold white captions, word-by-word, pop animation
136
 
137
+ ---
138
 
139
+ ## Multilingual Support
140
 
141
+ | Layer | Coverage |
142
+ |---|---|
143
+ | UI language | ไทย · English · 中文 |
144
+ | Video input language | Auto-detect + 15+ languages (Whisper) |
145
+ | Subtitle output language | Thai (Noto Sans Thai) · Chinese (Noto Sans SC) · Japanese (Noto Sans JP) · Korean (Noto Sans KR) · English + more |
146
+ | Cross-lingual | Whisper translate → English when English subtitles are requested; multilingual transcription/subtitle timing uses Whisper language support |
147
+ | Character-level splitting | Thai and Chinese use character-level subtitle timing (no word spaces) |
148
 
149
+ ---
150
 
151
+ ## Tech Stack
152
+
153
+ | Layer | Technology |
154
+ |---|---|
155
+ | Vision AI | **Qwen2.5-VL-7B-Instruct** (Apache 2.0) via vLLM |
156
+ | Speech-to-Text | **insanely-fast-whisper** with PyTorch SDPA on ROCm |
157
+ | Audio Analysis | **librosa** — RMS energy per scene |
158
+ | Scene Detection | **PySceneDetect** — ContentDetector |
159
+ | Video Download | **yt-dlp** |
160
+ | Video Processing | **ffmpeg** (AMD AMF hardware encode) |
161
+ | Subtitle Engine | **pysubs2** — full ASS format with karaoke tags |
162
+ | GPU | **AMD Instinct MI300X** via ROCm 6.3 |
163
+ | Frontend | **Next.js 16.2.4** App Router + Tailwind CSS |
164
+ | Backend | **FastAPI** + WebSocket (real-time progress) |
165
+ | Deployment | HuggingFace Spaces public demo + AMD GPU Cloud backend |
166
 
167
+ ---
168
 
169
+ ## Judge Demo
170
 
171
+ Public visitors can open the HuggingFace Space and click **Try Demo** to see a simulated flow without using AMD GPU credits. Full AMD MI300X generation is protected by an access code shared only in the lablab.ai submission notes for judges.
 
 
 
 
 
172
 
173
+ Recommended judging flow:
174
+ 1. Open the HuggingFace Space.
175
+ 2. Click **Try Demo** for the instant public demo.
176
+ 3. Enter the judge access code from the lablab.ai submission notes to run real generation on AMD GPU Cloud.
177
+ 4. Upload a short MP4 sample for the real run.
178
 
179
+ ---
 
 
 
 
 
 
 
 
180
 
181
  ## Local Development
182
 
183
+ For the real development/demo path, run the frontend locally and point it at the AMD GPU Cloud backend:
 
 
 
 
 
 
184
 
185
+ ```env
186
+ # frontend/.env.local
187
+ NEXT_PUBLIC_API_URL=http://129.212.178.101:8080
188
+ NEXT_PUBLIC_DEMO_ENABLED=true
189
+ NEXT_PUBLIC_DEMO_ONLY=false
 
 
 
 
 
 
 
 
 
 
 
190
  ```
191
 
 
 
192
  ```bash
193
  cd frontend
194
  npm install
195
+ npm run dev # http://localhost:3000
196
  ```
197
 
198
+ The AMD GPU Cloud backend runs FastAPI on `:8080` and vLLM/Qwen2.5-VL on `:8000`. For development without a GPU, the backend can still run with fallback stubs (stubbed Whisper, fallback vision scores).
 
 
 
 
199
 
200
+ ---
 
 
 
 
 
 
201
 
202
+ ## Safe Public Demo Setup
203
 
204
+ ElevenClip AI supports three deployment modes:
205
 
206
+ | Mode | Frontend runs on | Backend/vLLM runs on | Use when |
207
+ |---|---|---|---|
208
+ | Local dev | Your laptop (`localhost:3000`) | AMD GPU Cloud (`129.212.178.101:8080`) | Iterating quickly while using MI300X remotely |
209
+ | HF public shell | HuggingFace Space CPU | AMD GPU Cloud | Public hackathon page, real generation gated by access code |
210
+ | HF self-contained GPU | HuggingFace Space | HuggingFace Space GPU | Only if the Space has suitable ROCm/AMD GPU hardware |
211
 
212
+ For the current CPU Basic HuggingFace Space, use it as the public UI and keep real generation on AMD GPU Cloud:
 
 
 
 
 
 
 
 
213
 
214
  ```env
215
+ # frontend/.env.local for local development
216
+ NEXT_PUBLIC_API_URL=http://129.212.178.101:8080
217
+ NEXT_PUBLIC_DEMO_ENABLED=true
218
+ NEXT_PUBLIC_DEMO_ONLY=false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  ```
220
 
221
+ On the AMD GPU Cloud backend, protect expensive GPU endpoints before exposing the demo:
222
 
223
  ```bash
224
+ export DEMO_ACCESS_CODE="share-this-only-with-judges"
225
+ export MAX_CONCURRENT_JOBS=1
226
+ export MAX_UPLOAD_MB=300
227
+ export VLLM_IDLE_TIMEOUT=300
228
  ```
229
 
230
+ When `DEMO_ACCESS_CODE` is set, `/api/process`, `/api/video-info`, and vLLM start/stop endpoints require the `X-Demo-Key` header. The frontend shows a Demo Access Code field and sends that header automatically. Leave `DEMO_ACCESS_CODE` unset only for private/local testing.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
+ For a self-contained HuggingFace GPU Space, leave `NEXT_PUBLIC_API_URL=""` so nginx routes `/api`, `/ws`, and `/downloads` to FastAPI inside the same Space. Only use this mode if the Space hardware is actually GPU-capable.
233
 
234
+ For the public HuggingFace Space, set `NEXT_PUBLIC_DEMO_ONLY=true`. Visitors can open the UI and run the simulated demo without touching AMD GPU credits. Judges can enter the access code to run real generation against the protected AMD GPU Cloud backend.
 
 
235
 
236
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
+ ## Hackathon Compliance
 
 
239
 
240
+ | Requirement | Status |
241
+ |---|---|
242
+ | Track 3: Vision & Multimodal AI | ✅ Qwen2.5-VL processes frames + audio simultaneously |
243
+ | AMD Developer Cloud | ✅ All inference on AMD Instinct MI300X via ROCm 6.3 |
244
+ | ROCm acceleration | ✅ vLLM + SDPA Whisper + h264_amf encoder |
245
+ | Qwen partner integration | ✅ Qwen2.5-VL as primary multimodal model and text/emoji prompt model |
246
+ | HuggingFace Space | ✅ `lablab-ai-amd-developer-hackathon/ElevenClip-AI` |
247
+ | Public GitHub repo | ✅ `JakgritB/ElevenClip-AI` |
248
+ | Ship It challenge | ✅ Social posts tagging @AIatAMD + @lablab |
249
+ | MIT license | ✅ |
250
 
251
+ ---
 
 
 
 
 
 
 
252
 
253
  ## License
254
 
255
+ MIT — see [LICENSE](LICENSE)
backend/Dockerfile DELETED
@@ -1,17 +0,0 @@
1
- ARG ROCM_PYTORCH_IMAGE=rocm/pytorch:latest
2
- FROM ${ROCM_PYTORCH_IMAGE}
3
-
4
- WORKDIR /app
5
-
6
- RUN apt-get update \
7
- && apt-get install -y --no-install-recommends ffmpeg git curl \
8
- && rm -rf /var/lib/apt/lists/*
9
-
10
- COPY pyproject.toml ./
11
- ARG INSTALL_EXTRAS=.
12
- RUN pip install --upgrade pip && pip install -e "${INSTALL_EXTRAS}"
13
-
14
- COPY app ./app
15
-
16
- EXPOSE 8000
17
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/__init__.py DELETED
@@ -1 +0,0 @@
1
- """AI Clip Studio backend."""
 
 
backend/app/core/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Core configuration and instrumentation."""
 
 
backend/app/core/config.py DELETED
@@ -1,68 +0,0 @@
1
- from functools import lru_cache
2
- import os
3
- from pathlib import Path
4
-
5
- from pydantic import Field
6
- from pydantic import BaseModel
7
-
8
-
9
- class Settings(BaseModel):
10
- app_name: str = "ElevenClip.AI"
11
- demo_mode: bool = True
12
- storage_dir: Path = Path("data")
13
- frontend_origin: str = "http://localhost:5173"
14
-
15
- whisper_model_id: str = "openai/whisper-large-v3"
16
- qwen_text_model_id: str = "Qwen/Qwen2.5-7B-Instruct"
17
- qwen_vl_model_id: str = "Qwen/Qwen2-VL-7B-Instruct"
18
- hf_token: str | None = None
19
- preferred_torch_dtype: str = "bfloat16"
20
-
21
- target_clip_count: int = Field(default=5, ge=1, le=20)
22
- max_clips: int = Field(default=10, ge=1, le=50)
23
-
24
- ffmpeg_binary: str = "ffmpeg"
25
- ffprobe_binary: str = "ffprobe"
26
- ffmpeg_video_codec: str = "h264_amf"
27
- ffmpeg_cpu_codec: str = "libx264"
28
-
29
- redis_url: str = "redis://redis:6379/0"
30
- celery_enabled: bool = False
31
-
32
-
33
- @lru_cache
34
- def get_settings() -> Settings:
35
- settings = Settings(
36
- demo_mode=_bool_env("DEMO_MODE", True),
37
- storage_dir=Path(os.getenv("STORAGE_DIR", "data")),
38
- frontend_origin=os.getenv("FRONTEND_ORIGIN", "http://localhost:5173"),
39
- whisper_model_id=os.getenv("WHISPER_MODEL_ID", "openai/whisper-large-v3"),
40
- qwen_text_model_id=os.getenv("QWEN_TEXT_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct"),
41
- qwen_vl_model_id=os.getenv("QWEN_VL_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct"),
42
- hf_token=os.getenv("HF_TOKEN") or None,
43
- preferred_torch_dtype=os.getenv("TORCH_DTYPE", "bfloat16"),
44
- target_clip_count=_int_env("TARGET_CLIP_COUNT", 5),
45
- max_clips=_int_env("MAX_CLIPS", 10),
46
- ffmpeg_binary=os.getenv("FFMPEG_BINARY", "ffmpeg"),
47
- ffprobe_binary=os.getenv("FFPROBE_BINARY", "ffprobe"),
48
- ffmpeg_video_codec=os.getenv("FFMPEG_VIDEO_CODEC", "h264_amf"),
49
- ffmpeg_cpu_codec=os.getenv("FFMPEG_CPU_CODEC", "libx264"),
50
- redis_url=os.getenv("REDIS_URL", "redis://redis:6379/0"),
51
- celery_enabled=_bool_env("CELERY_ENABLED", False),
52
- )
53
- settings.storage_dir.mkdir(parents=True, exist_ok=True)
54
- return settings
55
-
56
-
57
- def _bool_env(name: str, default: bool) -> bool:
58
- value = os.getenv(name)
59
- if value is None:
60
- return default
61
- return value.strip().lower() in {"1", "true", "yes", "on"}
62
-
63
-
64
- def _int_env(name: str, default: int) -> int:
65
- value = os.getenv(name)
66
- if value is None:
67
- return default
68
- return int(value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/core/timing.py DELETED
@@ -1,20 +0,0 @@
1
- from collections.abc import Iterator
2
- from contextlib import contextmanager
3
- from time import perf_counter
4
-
5
-
6
- class TimingLog:
7
- def __init__(self) -> None:
8
- self._steps: dict[str, float] = {}
9
-
10
- @contextmanager
11
- def measure(self, name: str) -> Iterator[None]:
12
- started = perf_counter()
13
- try:
14
- yield
15
- finally:
16
- self._steps[name] = round(perf_counter() - started, 3)
17
-
18
- def to_dict(self) -> dict[str, float]:
19
- total = round(sum(self._steps.values()), 3)
20
- return {**self._steps, "total": total}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/main.py DELETED
@@ -1,240 +0,0 @@
1
- from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
2
- from fastapi.middleware.cors import CORSMiddleware
3
- from fastapi.responses import FileResponse
4
- from fastapi.staticfiles import StaticFiles
5
-
6
- from app.core.config import get_settings
7
- from app.models.schemas import (
8
- ChannelProfile,
9
- ClipCandidate,
10
- ClipPatch,
11
- HealthResponse,
12
- JobSnapshot,
13
- PolishSubtitlesRequest,
14
- RegenerateClipRequest,
15
- SubtitleCue,
16
- TranslateSubtitlesRequest,
17
- YoutubeJobRequest,
18
- )
19
- from app.services.highlight import QwenHighlightDetector
20
- from app.services.pipeline import VideoPipeline
21
- from app.services.transcription import WhisperTranscriber
22
- from app.services.video_input import save_upload
23
- from app.storage import JobStore
24
- from app.utils.rocm import detect_accelerator
25
-
26
- settings = get_settings()
27
- store = JobStore(settings)
28
- pipeline = VideoPipeline(settings, store)
29
- highlight_detector = QwenHighlightDetector(settings)
30
- transcriber = WhisperTranscriber(settings)
31
-
32
- app = FastAPI(title=settings.app_name, version="0.1.0")
33
- app.add_middleware(
34
- CORSMiddleware,
35
- allow_origins=[settings.frontend_origin, "http://localhost:5173", "http://127.0.0.1:5173"],
36
- allow_credentials=True,
37
- allow_methods=["*"],
38
- allow_headers=["*"],
39
- )
40
- app.mount("/media", StaticFiles(directory=settings.storage_dir), name="media")
41
-
42
-
43
- @app.get("/health", response_model=HealthResponse)
44
- async def health() -> HealthResponse:
45
- return HealthResponse(
46
- ok=True,
47
- app=settings.app_name,
48
- demo_mode=settings.demo_mode,
49
- accelerator=detect_accelerator(),
50
- )
51
-
52
-
53
- @app.post("/api/jobs/youtube", response_model=JobSnapshot)
54
- async def create_youtube_job(
55
- request: YoutubeJobRequest, background_tasks: BackgroundTasks
56
- ) -> JobSnapshot:
57
- snapshot = store.create_job(
58
- request.profile, {"kind": "youtube", "url": str(request.youtube_url)}
59
- )
60
- background_tasks.add_task(
61
- pipeline.process_source, snapshot.id, "youtube", str(request.youtube_url), request.profile
62
- )
63
- return snapshot
64
-
65
-
66
- @app.post("/api/jobs/upload", response_model=JobSnapshot)
67
- async def create_upload_job(
68
- background_tasks: BackgroundTasks,
69
- profile_json: str = Form(...),
70
- file: UploadFile = File(...),
71
- ) -> JobSnapshot:
72
- try:
73
- profile = ChannelProfile.model_validate_json(profile_json)
74
- except Exception as exc:
75
- raise HTTPException(status_code=422, detail=f"Invalid profile JSON: {exc}") from exc
76
-
77
- snapshot = store.create_job(profile, {"kind": "upload", "filename": file.filename})
78
- source_path = await save_upload(file, store.job_dir(snapshot.id))
79
- background_tasks.add_task(pipeline.process_source, snapshot.id, "upload", str(source_path), profile)
80
- return snapshot
81
-
82
-
83
- @app.get("/api/jobs/{job_id}", response_model=JobSnapshot)
84
- async def get_job(job_id: str) -> JobSnapshot:
85
- try:
86
- return store.get_job(job_id)
87
- except FileNotFoundError as exc:
88
- raise HTTPException(status_code=404, detail="Job not found") from exc
89
-
90
-
91
- @app.patch("/api/jobs/{job_id}/clips/{clip_id}", response_model=ClipCandidate)
92
- async def update_clip(job_id: str, clip_id: str, patch: ClipPatch) -> ClipCandidate:
93
- try:
94
- return pipeline.patch_clip(job_id, clip_id, patch.model_dump())
95
- except FileNotFoundError as exc:
96
- raise HTTPException(status_code=404, detail="Job not found") from exc
97
- except KeyError as exc:
98
- raise HTTPException(status_code=404, detail="Clip not found") from exc
99
-
100
-
101
- @app.post("/api/jobs/{job_id}/clips/{clip_id}/regenerate", response_model=ClipCandidate)
102
- async def regenerate_clip(
103
- job_id: str, clip_id: str, request: RegenerateClipRequest
104
- ) -> ClipCandidate:
105
- try:
106
- return pipeline.regenerate_clip(
107
- job_id,
108
- clip_id,
109
- clip_style=request.clip_style,
110
- clip_length_seconds=request.clip_length_seconds,
111
- subtitle_text=request.subtitle_text,
112
- )
113
- except FileNotFoundError as exc:
114
- raise HTTPException(status_code=404, detail="Source video not found") from exc
115
- except KeyError as exc:
116
- raise HTTPException(status_code=404, detail="Clip not found") from exc
117
-
118
-
119
- @app.get("/api/jobs/{job_id}/clips/{clip_id}/download")
120
- async def download_clip(job_id: str, clip_id: str) -> FileResponse:
121
- snapshot = store.get_job(job_id)
122
- clip = next((item for item in snapshot.clips if item.id == clip_id), None)
123
- if clip is None or clip.download_url is None:
124
- raise HTTPException(status_code=404, detail="Clip not found")
125
- filename = clip.download_url.rsplit("/", 1)[-1]
126
- path = store.job_dir(job_id) / filename
127
- if not path.exists():
128
- raise HTTPException(status_code=404, detail="Clip file not found")
129
- return FileResponse(path, media_type="video/mp4", filename=filename)
130
-
131
-
132
- # ─────────────────────────────────────────────────────────────────
133
- # AI subtitle endpoints — work in demo mode immediately, switch to
134
- # real Qwen / Whisper output once DEMO_MODE=false on AMD GPU cloud.
135
- # ─────────────────────────────────────────────────────────────────
136
-
137
-
138
- def _resolve_clip_cues(snapshot: JobSnapshot, clip: ClipCandidate) -> list[SubtitleCue]:
139
- """Return the cue list to operate on. Prefer explicit subtitle_cues; fall
140
- back to splitting subtitle_text into evenly-spaced cues."""
141
- if clip.subtitle_cues:
142
- return [SubtitleCue(**cue.model_dump()) for cue in clip.subtitle_cues]
143
- duration = max(0.5, clip.end_seconds - clip.start_seconds)
144
- text = clip.subtitle_text.strip()
145
- if not text:
146
- return [SubtitleCue(start_seconds=0.0, end_seconds=duration, text="")]
147
- # Reuse Whisper aligner's deterministic chunking for fallback
148
- return transcriber._demo_align_words(text, 0.0, duration)
149
-
150
-
151
- @app.post(
152
- "/api/jobs/{job_id}/clips/{clip_id}/subtitle/polish",
153
- response_model=ClipCandidate,
154
- )
155
- async def polish_clip_subtitles(
156
- job_id: str, clip_id: str, request: PolishSubtitlesRequest
157
- ) -> ClipCandidate:
158
- try:
159
- snapshot = store.get_job(job_id)
160
- except FileNotFoundError as exc:
161
- raise HTTPException(status_code=404, detail="Job not found") from exc
162
- clip = next((c for c in snapshot.clips if c.id == clip_id), None)
163
- if clip is None:
164
- raise HTTPException(status_code=404, detail="Clip not found")
165
-
166
- cues_in = _resolve_clip_cues(snapshot, clip)
167
- polished = highlight_detector.polish_subtitles(cues_in, style=request.style)
168
- return pipeline.patch_clip(
169
- job_id,
170
- clip_id,
171
- {
172
- "subtitle_cues": [cue.model_dump() for cue in polished],
173
- "subtitle_text": " ".join(cue.text for cue in polished if cue.text),
174
- },
175
- )
176
-
177
-
178
- @app.post(
179
- "/api/jobs/{job_id}/clips/{clip_id}/subtitle/translate",
180
- response_model=ClipCandidate,
181
- )
182
- async def translate_clip_subtitles(
183
- job_id: str, clip_id: str, request: TranslateSubtitlesRequest
184
- ) -> ClipCandidate:
185
- try:
186
- snapshot = store.get_job(job_id)
187
- except FileNotFoundError as exc:
188
- raise HTTPException(status_code=404, detail="Job not found") from exc
189
- clip = next((c for c in snapshot.clips if c.id == clip_id), None)
190
- if clip is None:
191
- raise HTTPException(status_code=404, detail="Clip not found")
192
-
193
- cues_in = _resolve_clip_cues(snapshot, clip)
194
- translated = highlight_detector.translate_subtitles(cues_in, request.target_language)
195
- return pipeline.patch_clip(
196
- job_id,
197
- clip_id,
198
- {
199
- "subtitle_cues": [cue.model_dump() for cue in translated],
200
- "subtitle_text": " ".join(cue.text for cue in translated if cue.text),
201
- },
202
- )
203
-
204
-
205
- @app.post(
206
- "/api/jobs/{job_id}/clips/{clip_id}/subtitle/auto-time",
207
- response_model=ClipCandidate,
208
- )
209
- async def auto_time_clip_subtitles(job_id: str, clip_id: str) -> ClipCandidate:
210
- try:
211
- snapshot = store.get_job(job_id)
212
- except FileNotFoundError as exc:
213
- raise HTTPException(status_code=404, detail="Job not found") from exc
214
- clip = next((c for c in snapshot.clips if c.id == clip_id), None)
215
- if clip is None:
216
- raise HTTPException(status_code=404, detail="Clip not found")
217
-
218
- text = clip.subtitle_text or " ".join(
219
- (cue.text for cue in (clip.subtitle_cues or []) if cue.text)
220
- )
221
- # Best-effort: production mode uses the actual source video on disk; demo
222
- # mode uses synthetic chunking that doesn't require the file at all.
223
- source_path = ""
224
- try:
225
- for entry in store.job_dir(job_id).iterdir():
226
- if entry.suffix.lower() in {".mp4", ".mkv", ".mov", ".webm"}:
227
- source_path = str(entry)
228
- break
229
- except Exception:
230
- source_path = ""
231
-
232
- timed = transcriber.align_words(source_path, text, clip.start_seconds, clip.end_seconds)
233
- return pipeline.patch_clip(
234
- job_id,
235
- clip_id,
236
- {
237
- "subtitle_cues": [cue.model_dump() for cue in timed],
238
- "subtitle_text": " ".join(cue.text for cue in timed if cue.text),
239
- },
240
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/models/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Pydantic models."""
 
 
backend/app/models/schemas.py DELETED
@@ -1,127 +0,0 @@
1
- from datetime import datetime, timezone
2
- from enum import Enum
3
- from typing import Any, Literal
4
-
5
- from pydantic import BaseModel, Field, HttpUrl, field_validator
6
-
7
-
8
- def utc_now() -> datetime:
9
- return datetime.now(timezone.utc)
10
-
11
-
12
- class TargetPlatform(str, Enum):
13
- tiktok = "tiktok"
14
- youtube_shorts = "youtube_shorts"
15
- instagram_reels = "instagram_reels"
16
-
17
-
18
- class ChannelProfile(BaseModel):
19
- niche: str = Field(default="education", min_length=2, max_length=80)
20
- niche_custom: str = Field(default="", max_length=80)
21
- channel_description: str = Field(default="", max_length=700)
22
- clip_style: str = Field(default="informative", min_length=2, max_length=80)
23
- clip_length_seconds: int = Field(default=60, ge=15, le=180)
24
- clip_count: int = Field(default=5, ge=1, le=20)
25
- primary_language: str = Field(default="Thai", min_length=2, max_length=40)
26
- target_platform: TargetPlatform = TargetPlatform.tiktok
27
-
28
- @field_validator("niche", "niche_custom", "channel_description", "clip_style", "primary_language")
29
- @classmethod
30
- def clean_text(cls, value: str) -> str:
31
- return value.strip()
32
-
33
-
34
- class YoutubeJobRequest(BaseModel):
35
- youtube_url: HttpUrl
36
- profile: ChannelProfile
37
-
38
-
39
- class TranscriptSegment(BaseModel):
40
- id: str
41
- start_seconds: float = Field(ge=0)
42
- end_seconds: float = Field(ge=0)
43
- text: str
44
- language: str | None = None
45
-
46
-
47
- class SubtitleCue(BaseModel):
48
- """A single subtitle line with explicit timing relative to clip start."""
49
-
50
- start_seconds: float = Field(ge=0)
51
- end_seconds: float = Field(ge=0)
52
- text: str = ""
53
-
54
-
55
- class SkipRange(BaseModel):
56
- """A range to splice out of the middle of a clip (relative to clip start)."""
57
-
58
- start_seconds: float = Field(ge=0)
59
- end_seconds: float = Field(ge=0)
60
-
61
-
62
- class ClipCandidate(BaseModel):
63
- id: str
64
- start_seconds: float = Field(ge=0)
65
- end_seconds: float = Field(ge=0)
66
- title: str
67
- reason: str
68
- score: float = Field(ge=0, le=100)
69
- subtitle_text: str = ""
70
- subtitle_cues: list[SubtitleCue] | None = None
71
- skip_ranges: list[SkipRange] | None = None
72
- video_url: str | None = None
73
- download_url: str | None = None
74
- approved: bool = False
75
- deleted: bool = False
76
- metadata: dict[str, Any] = Field(default_factory=dict)
77
-
78
-
79
- class ClipPatch(BaseModel):
80
- start_seconds: float | None = Field(default=None, ge=0)
81
- end_seconds: float | None = Field(default=None, ge=0)
82
- subtitle_text: str | None = None
83
- subtitle_cues: list[SubtitleCue] | None = None
84
- skip_ranges: list[SkipRange] | None = None
85
- approved: bool | None = None
86
- deleted: bool | None = None
87
-
88
-
89
- class RegenerateClipRequest(BaseModel):
90
- clip_style: str | None = None
91
- clip_length_seconds: int | None = Field(default=None, ge=15, le=180)
92
- subtitle_text: str | None = None
93
-
94
-
95
- class TranslateSubtitlesRequest(BaseModel):
96
- target_language: str = Field(min_length=2, max_length=40)
97
-
98
-
99
- class PolishSubtitlesRequest(BaseModel):
100
- style: str | None = None
101
-
102
-
103
- class JobSnapshot(BaseModel):
104
- id: str
105
- status: Literal["queued", "running", "completed", "failed"]
106
- progress: float = Field(ge=0, le=1)
107
- message: str
108
- current_step: str = ""
109
- step_index: int = Field(default=0, ge=0)
110
- step_total: int = Field(default=6, ge=1)
111
- active_clip_index: int = Field(default=0, ge=0)
112
- active_clip_total: int = Field(default=0, ge=0)
113
- source: dict[str, Any]
114
- profile: ChannelProfile
115
- transcript: list[TranscriptSegment] = Field(default_factory=list)
116
- clips: list[ClipCandidate] = Field(default_factory=list)
117
- timings: dict[str, float] = Field(default_factory=dict)
118
- error: str | None = None
119
- created_at: datetime = Field(default_factory=utc_now)
120
- updated_at: datetime = Field(default_factory=utc_now)
121
-
122
-
123
- class HealthResponse(BaseModel):
124
- ok: bool
125
- app: str
126
- demo_mode: bool
127
- accelerator: dict[str, Any]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Pipeline services."""
 
 
backend/app/services/clips.py DELETED
@@ -1,219 +0,0 @@
1
- import shutil
2
- import subprocess
3
- from pathlib import Path
4
- from typing import Callable
5
-
6
- from app.core.config import Settings
7
- from app.models.schemas import ChannelProfile, ClipCandidate, TranscriptSegment
8
- from app.services.subtitles import write_single_caption_srt, write_srt, write_srt_from_cues
9
- from app.storage import JobStore
10
-
11
-
12
- class ClipGenerator:
13
- def __init__(self, settings: Settings, store: JobStore) -> None:
14
- self.settings = settings
15
- self.store = store
16
-
17
- def generate(
18
- self,
19
- job_id: str,
20
- video_path: Path,
21
- clips: list[ClipCandidate],
22
- transcript: list[TranscriptSegment],
23
- profile: ChannelProfile,
24
- progress_callback: Callable[[int, int], None] | None = None,
25
- ) -> list[ClipCandidate]:
26
- rendered: list[ClipCandidate] = []
27
- total = len(clips)
28
- for index, clip in enumerate(clips, start=1):
29
- if progress_callback:
30
- progress_callback(index, total)
31
- rendered.append(self.render_one(job_id, video_path, clip, transcript, profile, index))
32
- return rendered
33
-
34
- def render_one(
35
- self,
36
- job_id: str,
37
- video_path: Path,
38
- clip: ClipCandidate,
39
- transcript: list[TranscriptSegment],
40
- profile: ChannelProfile,
41
- index: int = 1,
42
- ) -> ClipCandidate:
43
- job_dir = self.store.job_dir(job_id)
44
- output_name = f"clip_{index:02}_{clip.id[:8]}.mp4"
45
- subtitle_name = f"clip_{index:02}_{clip.id[:8]}.srt"
46
- output_path = job_dir / output_name
47
- subtitle_path = job_dir / subtitle_name
48
-
49
- duration = max(1.0, clip.end_seconds - clip.start_seconds)
50
- if clip.subtitle_cues:
51
- subtitle_cues = write_srt_from_cues(subtitle_path, clip.subtitle_cues)
52
- elif clip.subtitle_text.strip():
53
- subtitle_cues = write_single_caption_srt(subtitle_path, duration, clip.subtitle_text)
54
- else:
55
- subtitle_cues = write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript)
56
- self._run_ffmpeg(video_path, output_path, subtitle_path, clip, profile)
57
-
58
- clip.video_url = self.store.media_url(job_id, output_name)
59
- clip.download_url = clip.video_url
60
- clip.metadata["subtitle_file"] = self.store.media_url(job_id, subtitle_name)
61
- clip.metadata["subtitle_cues"] = subtitle_cues
62
- return clip
63
-
64
- def _run_ffmpeg(
65
- self,
66
- video_path: Path,
67
- output_path: Path,
68
- subtitle_path: Path,
69
- clip: ClipCandidate,
70
- profile: ChannelProfile,
71
- ) -> None:
72
- ffmpeg = shutil.which(self.settings.ffmpeg_binary)
73
- if not ffmpeg or not video_path.exists() or video_path.stat().st_size == 0:
74
- output_path.write_bytes(b"")
75
- return
76
-
77
- keep_ranges = self._compute_keep_ranges(clip)
78
- post_filters = [self._platform_filter(profile), self._subtitle_filter(subtitle_path)]
79
- post_chain = ",".join(post_filters)
80
-
81
- if len(keep_ranges) <= 1:
82
- start, end = keep_ranges[0]
83
- command = [
84
- ffmpeg,
85
- "-y",
86
- "-ss",
87
- f"{start:.3f}",
88
- "-i",
89
- str(video_path),
90
- "-t",
91
- f"{max(0.5, end - start):.3f}",
92
- "-vf",
93
- post_chain,
94
- "-c:v",
95
- self.settings.ffmpeg_video_codec,
96
- "-c:a",
97
- "aac",
98
- "-b:a",
99
- "160k",
100
- "-movflags",
101
- "+faststart",
102
- str(output_path),
103
- ]
104
- else:
105
- # Build concat filter that keeps multiple segments and skips middle ranges
106
- parts = []
107
- labels_v = []
108
- labels_a = []
109
- for i, (start, end) in enumerate(keep_ranges):
110
- parts.append(
111
- f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}]"
112
- )
113
- parts.append(
114
- f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}]"
115
- )
116
- labels_v.append(f"[v{i}]")
117
- labels_a.append(f"[a{i}]")
118
- concat_inputs = "".join(
119
- f"{labels_v[i]}{labels_a[i]}" for i in range(len(keep_ranges))
120
- )
121
- parts.append(
122
- f"{concat_inputs}concat=n={len(keep_ranges)}:v=1:a=1[vc][ac]"
123
- )
124
- parts.append(f"[vc]{post_chain}[vout]")
125
- filter_complex = ";".join(parts)
126
- command = [
127
- ffmpeg,
128
- "-y",
129
- "-i",
130
- str(video_path),
131
- "-filter_complex",
132
- filter_complex,
133
- "-map",
134
- "[vout]",
135
- "-map",
136
- "[ac]",
137
- "-c:v",
138
- self.settings.ffmpeg_video_codec,
139
- "-c:a",
140
- "aac",
141
- "-b:a",
142
- "160k",
143
- "-movflags",
144
- "+faststart",
145
- str(output_path),
146
- ]
147
-
148
- try:
149
- subprocess.run(command, check=True, capture_output=True, text=True, timeout=180)
150
- return
151
- except Exception:
152
- fallback = command.copy()
153
- try:
154
- fallback[fallback.index(self.settings.ffmpeg_video_codec)] = (
155
- self.settings.ffmpeg_cpu_codec
156
- )
157
- except ValueError:
158
- pass
159
- try:
160
- subprocess.run(fallback, check=True, capture_output=True, text=True, timeout=180)
161
- return
162
- except Exception:
163
- output_path.write_bytes(b"")
164
-
165
- def _compute_keep_ranges(self, clip: ClipCandidate) -> list[tuple[float, float]]:
166
- """Return absolute video time ranges to keep, after subtracting skip_ranges."""
167
- clip_start = float(clip.start_seconds)
168
- clip_end = float(clip.end_seconds)
169
- if not clip.skip_ranges:
170
- return [(clip_start, clip_end)]
171
-
172
- # Skip ranges are relative to clip start. Convert to absolute and sort.
173
- skips: list[tuple[float, float]] = []
174
- for skip in clip.skip_ranges:
175
- s = clip_start + max(0.0, float(skip.start_seconds))
176
- e = clip_start + max(0.0, float(skip.end_seconds))
177
- if e > s:
178
- skips.append((min(s, clip_end), min(e, clip_end)))
179
- skips.sort()
180
-
181
- # Merge overlapping
182
- merged: list[tuple[float, float]] = []
183
- for s, e in skips:
184
- if merged and s <= merged[-1][1]:
185
- merged[-1] = (merged[-1][0], max(merged[-1][1], e))
186
- else:
187
- merged.append((s, e))
188
-
189
- # Compute keep segments
190
- keeps: list[tuple[float, float]] = []
191
- cursor = clip_start
192
- for s, e in merged:
193
- if s > cursor:
194
- keeps.append((cursor, s))
195
- cursor = max(cursor, e)
196
- if cursor < clip_end:
197
- keeps.append((cursor, clip_end))
198
-
199
- return keeps if keeps else [(clip_start, clip_end)]
200
-
201
- def _platform_filter(self, profile: ChannelProfile) -> str:
202
- if profile.target_platform.value in {"tiktok", "youtube_shorts", "instagram_reels"}:
203
- return "scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920"
204
- return "scale=1280:720:force_original_aspect_ratio=decrease,pad=1280:720:(ow-iw)/2:(oh-ih)/2"
205
-
206
- def _subtitle_filter(self, subtitle_path: Path) -> str:
207
- escaped = str(subtitle_path.resolve()).replace("\\", "/").replace(":", "\\:")
208
- style = (
209
- "Fontname=Arial,"
210
- "Fontsize=22,"
211
- "PrimaryColour=&H00FFFFFF,"
212
- "OutlineColour=&H00000000,"
213
- "BorderStyle=1,"
214
- "Outline=2,"
215
- "Shadow=1,"
216
- "Alignment=2,"
217
- "MarginV=210"
218
- )
219
- return f"subtitles='{escaped}':force_style='{style}'"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/highlight.py DELETED
@@ -1,434 +0,0 @@
1
- import json
2
- import re
3
- from uuid import uuid4
4
-
5
- from app.core.config import Settings
6
- from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment
7
-
8
-
9
- class QwenHighlightDetector:
10
- def __init__(self, settings: Settings) -> None:
11
- self.settings = settings
12
- self._llm = None
13
-
14
- def detect(
15
- self, transcript: list[TranscriptSegment], profile: ChannelProfile
16
- ) -> list[ClipCandidate]:
17
- if self.settings.demo_mode:
18
- return self._heuristic_detect(transcript, profile)
19
-
20
- try:
21
- return self._qwen_detect(transcript, profile)
22
- except Exception:
23
- return self._heuristic_detect(transcript, profile)
24
-
25
- def _qwen_detect(
26
- self, transcript: list[TranscriptSegment], profile: ChannelProfile
27
- ) -> list[ClipCandidate]:
28
- try:
29
- from vllm import LLM, SamplingParams
30
- except Exception as exc:
31
- raise RuntimeError("vLLM with ROCm backend is required for Qwen inference") from exc
32
-
33
- if self._llm is None:
34
- self._llm = LLM(
35
- model=self.settings.qwen_text_model_id,
36
- dtype=self.settings.preferred_torch_dtype,
37
- trust_remote_code=True,
38
- )
39
-
40
- transcript_text = "\n".join(
41
- f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
42
- for segment in transcript
43
- )
44
- niche = _effective_niche(profile)
45
- channel_description = profile.channel_description or "No extra channel description provided."
46
- clip_count = min(profile.clip_count, self.settings.max_clips)
47
- prompt = f"""
48
- You are selecting short-form clips for a creator.
49
- Profile:
50
- - niche: {niche}
51
- - creator description: {channel_description}
52
- - style: {profile.clip_style}
53
- - target length seconds: {profile.clip_length_seconds}
54
- - target number of clips: {clip_count}
55
- - language: {profile.primary_language}
56
- - platform: {profile.target_platform.value}
57
-
58
- Return strict JSON only. Shape:
59
- [
60
- {{
61
- "start_seconds": 12.0,
62
- "end_seconds": 72.0,
63
- "title": "short title",
64
- "reason": "why this will engage viewers",
65
- "score": 91,
66
- "subtitle_text": "clean subtitle text"
67
- }}
68
- ]
69
-
70
- Transcript:
71
- {transcript_text}
72
- """.strip()
73
- sampling = SamplingParams(temperature=0.2, max_tokens=1200)
74
- outputs = self._llm.generate([prompt], sampling)
75
- text = outputs[0].outputs[0].text
76
- payload = self._parse_json_array(text)
77
- clips = [
78
- ClipCandidate(
79
- id=uuid4().hex,
80
- start_seconds=float(item["start_seconds"]),
81
- end_seconds=float(item["end_seconds"]),
82
- title=str(item.get("title") or "Highlight"),
83
- reason=str(item.get("reason") or "High engagement potential"),
84
- score=float(item.get("score") or 75),
85
- subtitle_text=str(item.get("subtitle_text") or ""),
86
- metadata={"model": self.settings.qwen_text_model_id},
87
- )
88
- for item in payload[:clip_count]
89
- ]
90
- return clips or self._heuristic_detect(transcript, profile)
91
-
92
- def _parse_json_array(self, text: str) -> list[dict]:
93
- match = re.search(r"\[[\s\S]*\]", text)
94
- if not match:
95
- raise ValueError("No JSON array in Qwen response")
96
- payload = json.loads(match.group(0))
97
- if not isinstance(payload, list):
98
- raise ValueError("Qwen response is not a list")
99
- return payload
100
-
101
- # ──────────────────────────────────────────────────────────────
102
- # AI subtitle actions (Polish, Translate)
103
- # ──────────────────────────────────────────────────────────────
104
-
105
- def polish_subtitles(
106
- self, cues: list[SubtitleCue], style: str | None = None
107
- ) -> list[SubtitleCue]:
108
- """Rewrite cue text to be punchier and more readable on short-form video.
109
-
110
- Demo mode returns deterministic polished text so the UX is testable
111
- without GPU. Production mode calls Qwen2.5.
112
- """
113
- if self.settings.demo_mode:
114
- return self._heuristic_polish(cues, style)
115
- try:
116
- return self._qwen_polish(cues, style)
117
- except Exception:
118
- return self._heuristic_polish(cues, style)
119
-
120
- def translate_subtitles(
121
- self, cues: list[SubtitleCue], target_language: str
122
- ) -> list[SubtitleCue]:
123
- """Translate cue text to target_language while preserving timing."""
124
- if self.settings.demo_mode:
125
- return self._heuristic_translate(cues, target_language)
126
- try:
127
- return self._qwen_translate(cues, target_language)
128
- except Exception:
129
- return self._heuristic_translate(cues, target_language)
130
-
131
- # ──────────────────────────────────────────────────────────────
132
- # Demo / fallback implementations
133
- # ──────────────────────────────────────────────────────────────
134
-
135
- def _heuristic_polish(
136
- self, cues: list[SubtitleCue], style: str | None
137
- ) -> list[SubtitleCue]:
138
- """Apply simple text transformations that look like an AI polish."""
139
- polished: list[SubtitleCue] = []
140
- for cue in cues:
141
- text = (cue.text or "").strip()
142
- if not text:
143
- polished.append(cue.model_copy())
144
- continue
145
- # Shorten redundant phrasing (heuristic)
146
- text = re.sub(r"\s+", " ", text)
147
- text = re.sub(r"^(so|well|like|um|uh|you know|i mean)[,\s]+", "", text, flags=re.IGNORECASE)
148
- text = text.rstrip(" ,.;:")
149
- # Add light emphasis based on style
150
- if style and style.lower() == "dramatic" and not text.endswith("!"):
151
- text = text + "!"
152
- polished.append(
153
- SubtitleCue(
154
- start_seconds=cue.start_seconds,
155
- end_seconds=cue.end_seconds,
156
- text=text,
157
- )
158
- )
159
- return polished
160
-
161
- def _heuristic_translate(
162
- self, cues: list[SubtitleCue], target_language: str
163
- ) -> list[SubtitleCue]:
164
- """Demo translation: append a marker so the UX shows the action ran."""
165
- marker = f"[{target_language[:2].upper()}]"
166
- translated: list[SubtitleCue] = []
167
- for cue in cues:
168
- text = (cue.text or "").strip()
169
- translated.append(
170
- SubtitleCue(
171
- start_seconds=cue.start_seconds,
172
- end_seconds=cue.end_seconds,
173
- text=f"{marker} {text}" if text else "",
174
- )
175
- )
176
- return translated
177
-
178
- # ──────────────────────────────────────────────────────────────
179
- # Production Qwen calls (used when DEMO_MODE=false on AMD GPU)
180
- # ──────────────────────────────────────────────────────────────
181
-
182
- def _ensure_llm(self):
183
- try:
184
- from vllm import LLM
185
- except Exception as exc:
186
- raise RuntimeError("vLLM with ROCm backend is required for Qwen") from exc
187
- if self._llm is None:
188
- self._llm = LLM(
189
- model=self.settings.qwen_text_model_id,
190
- dtype=self.settings.preferred_torch_dtype,
191
- trust_remote_code=True,
192
- )
193
- return self._llm
194
-
195
- def _qwen_polish(
196
- self, cues: list[SubtitleCue], style: str | None
197
- ) -> list[SubtitleCue]:
198
- from vllm import SamplingParams
199
-
200
- llm = self._ensure_llm()
201
- joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
202
- prompt = f"""
203
- Rewrite each subtitle line to be punchier and easier to read on short-form vertical video.
204
- Keep the same number of lines and the same approximate length per line.
205
- Style preference: {style or 'natural'}.
206
- Return one rewritten line per row, prefixed with the original index. No commentary.
207
-
208
- Input:
209
- {joined}
210
- """.strip()
211
- outputs = llm.generate([prompt], SamplingParams(temperature=0.3, max_tokens=800))
212
- raw = outputs[0].outputs[0].text
213
- rewritten = self._parse_indexed_lines(raw, expected=len(cues))
214
- return [
215
- SubtitleCue(
216
- start_seconds=cue.start_seconds,
217
- end_seconds=cue.end_seconds,
218
- text=rewritten[i] if i < len(rewritten) else cue.text,
219
- )
220
- for i, cue in enumerate(cues)
221
- ]
222
-
223
- def _qwen_translate(
224
- self, cues: list[SubtitleCue], target_language: str
225
- ) -> list[SubtitleCue]:
226
- from vllm import SamplingParams
227
-
228
- llm = self._ensure_llm()
229
- joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
230
- prompt = f"""
231
- Translate each subtitle line into {target_language}. Preserve line count and order.
232
- Return one translated line per row, prefixed with the original index. No commentary.
233
-
234
- Input:
235
- {joined}
236
- """.strip()
237
- outputs = llm.generate([prompt], SamplingParams(temperature=0.2, max_tokens=1000))
238
- raw = outputs[0].outputs[0].text
239
- translated = self._parse_indexed_lines(raw, expected=len(cues))
240
- return [
241
- SubtitleCue(
242
- start_seconds=cue.start_seconds,
243
- end_seconds=cue.end_seconds,
244
- text=translated[i] if i < len(translated) else cue.text,
245
- )
246
- for i, cue in enumerate(cues)
247
- ]
248
-
249
- def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]:
250
- lines = []
251
- for line in raw.splitlines():
252
- stripped = line.strip()
253
- if not stripped:
254
- continue
255
- match = re.match(r"^\s*\d+[.)\s-]+\s*(.*)$", stripped)
256
- lines.append(match.group(1).strip() if match else stripped)
257
- if len(lines) >= expected:
258
- break
259
- return lines
260
-
261
- def _heuristic_detect(
262
- self, transcript: list[TranscriptSegment], profile: ChannelProfile
263
- ) -> list[ClipCandidate]:
264
- style_terms = {
265
- "funny": ["react", "punchy", "mistake", "surprising"],
266
- "informative": ["important", "practical", "takeaway", "explanation"],
267
- "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
268
- "educational": ["question", "answer", "context", "takeaway"],
269
- }
270
- preferred_terms = style_terms.get(profile.clip_style.lower(), [])
271
- niche = _effective_niche(profile)
272
- profile_terms = [
273
- term
274
- for term in f"{niche} {profile.channel_description}".lower().split()[:30]
275
- if len(term) > 2
276
- ]
277
- scored: list[tuple[float, TranscriptSegment]] = []
278
- for segment in transcript:
279
- text = segment.text.lower()
280
- score = 45.0
281
- score += 12 if "?" in segment.text else 0
282
- score += 8 if any(term in text for term in preferred_terms) else 0
283
- score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
284
- score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
285
- score += 5 if any(term in text for term in profile_terms) else 0
286
- score += min(len(segment.text) / 12, 10)
287
- scored.append((min(score, 100), segment))
288
-
289
- scored.sort(key=lambda item: item[0], reverse=True)
290
- clips: list[ClipCandidate] = []
291
- clip_count = min(profile.clip_count, self.settings.max_clips)
292
- for score, segment in scored[:clip_count]:
293
- start = max(0.0, segment.start_seconds - 5.0)
294
- end = start + float(profile.clip_length_seconds)
295
- clips.append(
296
- ClipCandidate(
297
- id=uuid4().hex,
298
- start_seconds=start,
299
- end_seconds=end,
300
- title=self._title_for(segment.text),
301
- reason=self._reason_for(profile, niche),
302
- score=round(score, 1),
303
- subtitle_text=segment.text,
304
- metadata={"model": "heuristic-fallback"},
305
- )
306
- )
307
- return sorted(clips, key=lambda clip: clip.start_seconds)
308
-
309
- def _title_for(self, text: str) -> str:
310
- clean = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'")
311
- words = clean.split()
312
- if len(words) > 1:
313
- title = " ".join(words[:7])
314
- else:
315
- title = clean[:48]
316
- return title[:72].rstrip() or "Highlight"
317
-
318
- def _reason_for(self, profile: ChannelProfile, niche: str) -> str:
319
- language = profile.primary_language.lower()
320
- style = _localized_profile_word(profile.clip_style, language, "style")
321
- niche_label = _localized_profile_word(niche, language, "niche")
322
- if "thai" in language:
323
- return f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}"
324
- if "japanese" in language:
325
- return f"{niche_label} の視聴者に合う {style} スタイルの候補です。"
326
- if "chinese" in language:
327
- return f"符合 {niche_label} 受众期待的 {style} 风格。"
328
- if "korean" in language:
329
- return f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다."
330
- return f"Matches the {profile.clip_style} style for a {niche} audience."
331
-
332
-
333
- def _effective_niche(profile: ChannelProfile) -> str:
334
- if profile.niche.lower() == "other" and profile.niche_custom:
335
- return profile.niche_custom
336
- return profile.niche
337
-
338
-
339
- def _localized_profile_word(value: str, language: str, group: str) -> str:
340
- key = value.lower().replace(" ", "_")
341
- localized = {
342
- "thai": {
343
- "niche": {
344
- "education": "การศึกษา",
345
- "gaming": "เกม",
346
- "podcast": "พอดแคสต์",
347
- "commentary": "เล่า/วิเคราะห์",
348
- "cars": "รถยนต์",
349
- "beauty": "บิวตี้",
350
- "fitness": "ฟิตเนส",
351
- "finance": "การเงิน",
352
- "tech": "เทคโนโลยี",
353
- "lifestyle": "ไลฟ์สไตล์",
354
- "music": "ดนตรี",
355
- },
356
- "style": {
357
- "informative": "ให้ข้อมูล",
358
- "funny": "ตลก",
359
- "dramatic": "ดราม่า",
360
- "educational": "สอนเข้าใจง่าย",
361
- "commentary": "วิเคราะห์",
362
- },
363
- },
364
- "japanese": {
365
- "niche": {
366
- "education": "教育",
367
- "gaming": "ゲーム",
368
- "podcast": "ポッドキャスト",
369
- "commentary": "解説",
370
- "cars": "車",
371
- "beauty": "美容",
372
- "fitness": "フィットネス",
373
- "finance": "金融",
374
- "tech": "テック",
375
- "lifestyle": "ライフスタイル",
376
- "music": "音楽",
377
- },
378
- "style": {
379
- "informative": "情報性の高い",
380
- "funny": "ユーモアのある",
381
- "dramatic": "ドラマチックな",
382
- "educational": "学びやすい",
383
- "commentary": "解説型の",
384
- },
385
- },
386
- "chinese": {
387
- "niche": {
388
- "education": "教育",
389
- "gaming": "游戏",
390
- "podcast": "播客",
391
- "commentary": "解说",
392
- "cars": "汽车",
393
- "beauty": "美妆",
394
- "fitness": "健身",
395
- "finance": "金融",
396
- "tech": "科技",
397
- "lifestyle": "生活方式",
398
- "music": "音乐",
399
- },
400
- "style": {
401
- "informative": "信息量高",
402
- "funny": "有趣",
403
- "dramatic": "戏剧化",
404
- "educational": "教学型",
405
- "commentary": "评论型",
406
- },
407
- },
408
- "korean": {
409
- "niche": {
410
- "education": "교육",
411
- "gaming": "게임",
412
- "podcast": "팟캐스트",
413
- "commentary": "해설",
414
- "cars": "자동차",
415
- "beauty": "뷰티",
416
- "fitness": "피트니스",
417
- "finance": "금융",
418
- "tech": "테크",
419
- "lifestyle": "라이프스타일",
420
- "music": "음악",
421
- },
422
- "style": {
423
- "informative": "정보형",
424
- "funny": "재미있는",
425
- "dramatic": "극적인",
426
- "educational": "교육형",
427
- "commentary": "해설형",
428
- },
429
- },
430
- }
431
- for language_key, groups in localized.items():
432
- if language_key in language:
433
- return groups.get(group, {}).get(key, value)
434
- return value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/multimodal.py DELETED
@@ -1,200 +0,0 @@
1
- import os
2
- import subprocess
3
- import tempfile
4
-
5
- from app.core.config import Settings
6
- from app.models.schemas import ClipCandidate
7
-
8
- _DEMO_VISUALS = [
9
- ("High-energy scene with strong visual contrast and clear subject focus.", 88.0),
10
- ("Close-up with expressive reactions — excellent engagement framing.", 92.0),
11
- ("Dynamic motion sequence; subject well-lit with clean background.", 84.0),
12
- ("Text-overlay-friendly composition with natural colour grading.", 79.0),
13
- ("Wide establishing shot; strong emotional beat in middle frames.", 81.0),
14
- ]
15
-
16
-
17
- class QwenVisualAnalyzer:
18
- def __init__(self, settings: Settings) -> None:
19
- self.settings = settings
20
- self._model = None
21
- self._processor = None
22
-
23
- def enrich(self, video_path: str, clips: list[ClipCandidate]) -> list[ClipCandidate]:
24
- if self.settings.demo_mode:
25
- return self._demo_enrich(clips)
26
- try:
27
- return self._qwen_enrich(video_path, clips)
28
- except Exception:
29
- return clips
30
-
31
- # ------------------------------------------------------------------
32
- # Demo mode
33
- # ------------------------------------------------------------------
34
-
35
- def _demo_enrich(self, clips: list[ClipCandidate]) -> list[ClipCandidate]:
36
- enriched = []
37
- for i, clip in enumerate(clips):
38
- note, vscore = _DEMO_VISUALS[i % len(_DEMO_VISUALS)]
39
- enriched.append(
40
- clip.model_copy(
41
- update={
42
- "metadata": {
43
- **clip.metadata,
44
- "visual_model": "demo",
45
- "visual_note": note,
46
- "visual_score": vscore,
47
- }
48
- }
49
- )
50
- )
51
- return enriched
52
-
53
- # ------------------------------------------------------------------
54
- # Production mode — Qwen2-VL on ROCm
55
- # ------------------------------------------------------------------
56
-
57
- def _load_model(self) -> None:
58
- try:
59
- import torch
60
- from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
61
- except ImportError as exc:
62
- raise RuntimeError("transformers + ROCm PyTorch are required for Qwen2-VL") from exc
63
-
64
- dtype = getattr(torch, self.settings.preferred_torch_dtype, torch.bfloat16)
65
- self._model = Qwen2VLForConditionalGeneration.from_pretrained(
66
- self.settings.qwen_vl_model_id,
67
- torch_dtype=dtype,
68
- device_map="auto",
69
- trust_remote_code=True,
70
- token=self.settings.hf_token or None,
71
- )
72
- self._processor = AutoProcessor.from_pretrained(
73
- self.settings.qwen_vl_model_id,
74
- trust_remote_code=True,
75
- token=self.settings.hf_token or None,
76
- )
77
-
78
- def _qwen_enrich(self, video_path: str, clips: list[ClipCandidate]) -> list[ClipCandidate]:
79
- if self._model is None:
80
- self._load_model()
81
-
82
- enriched = []
83
- for clip in clips:
84
- try:
85
- frames = _sample_frames(video_path, clip.start_seconds, clip.end_seconds, self.settings.ffmpeg_binary)
86
- if not frames:
87
- enriched.append(clip)
88
- continue
89
- note, vscore = self._analyze(frames, clip.title)
90
- enriched.append(
91
- clip.model_copy(
92
- update={
93
- "metadata": {
94
- **clip.metadata,
95
- "visual_model": self.settings.qwen_vl_model_id,
96
- "visual_note": note,
97
- "visual_score": vscore,
98
- }
99
- }
100
- )
101
- )
102
- except Exception:
103
- enriched.append(
104
- clip.model_copy(
105
- update={
106
- "metadata": {
107
- **clip.metadata,
108
- "visual_model": self.settings.qwen_vl_model_id,
109
- "visual_status": "analysis_failed",
110
- }
111
- }
112
- )
113
- )
114
- return enriched
115
-
116
- def _analyze(self, frames: list, title: str) -> tuple[str, float]:
117
- import torch
118
-
119
- messages = [
120
- {
121
- "role": "user",
122
- "content": [
123
- *[{"type": "image", "image": f} for f in frames],
124
- {
125
- "type": "text",
126
- "text": (
127
- f'These frames are from a clip titled "{title}". '
128
- "Describe the visual quality and short-form engagement potential in 1-2 sentences. "
129
- "Then output exactly: SCORE: <integer 0-100>"
130
- ),
131
- },
132
- ],
133
- }
134
- ]
135
- text = self._processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
136
- inputs = self._processor(text=[text], images=frames, return_tensors="pt").to(self._model.device)
137
- with torch.no_grad():
138
- ids = self._model.generate(**inputs, max_new_tokens=140)
139
- reply = self._processor.batch_decode(
140
- ids[:, inputs["input_ids"].shape[1]:],
141
- skip_special_tokens=True,
142
- )[0].strip()
143
-
144
- vscore = 75.0
145
- for line in reversed(reply.splitlines()):
146
- upper = line.strip().upper()
147
- if upper.startswith("SCORE:"):
148
- try:
149
- vscore = float(upper.split(":", 1)[1].strip())
150
- except ValueError:
151
- pass
152
- break
153
-
154
- note = reply.split("SCORE:")[0].strip() or reply
155
- return note, min(max(vscore, 0.0), 100.0)
156
-
157
-
158
- # ------------------------------------------------------------------
159
- # Frame extraction helper
160
- # ------------------------------------------------------------------
161
-
162
- def _sample_frames(video_path: str, start: float, end: float, ffmpeg: str, n: int = 4) -> list:
163
- try:
164
- from PIL import Image
165
- except ImportError:
166
- return []
167
-
168
- duration = max(end - start, 1.0)
169
- timestamps = [start + duration * i / max(n - 1, 1) for i in range(n)]
170
- frames = []
171
- tmp_files = []
172
- try:
173
- for ts in timestamps:
174
- fd, tmp = tempfile.mkstemp(suffix=".jpg")
175
- os.close(fd)
176
- tmp_files.append(tmp)
177
- result = subprocess.run(
178
- [
179
- ffmpeg,
180
- "-ss", f"{ts:.3f}",
181
- "-i", video_path,
182
- "-vframes", "1",
183
- "-q:v", "2",
184
- "-y", tmp,
185
- ],
186
- capture_output=True,
187
- timeout=15,
188
- )
189
- if result.returncode == 0:
190
- try:
191
- frames.append(Image.open(tmp).convert("RGB"))
192
- except Exception:
193
- pass
194
- finally:
195
- for tmp in tmp_files:
196
- try:
197
- os.unlink(tmp)
198
- except OSError:
199
- pass
200
- return frames
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/pipeline.py DELETED
@@ -1,236 +0,0 @@
1
- import asyncio
2
- from pathlib import Path
3
-
4
- from app.core.config import Settings
5
- from app.core.timing import TimingLog
6
- from app.models.schemas import ChannelProfile, ClipCandidate
7
- from app.services.clips import ClipGenerator
8
- from app.services.highlight import QwenHighlightDetector
9
- from app.services.multimodal import QwenVisualAnalyzer
10
- from app.services.transcription import WhisperTranscriber
11
- from app.services.video_input import resolve_youtube_url
12
- from app.storage import JobStore
13
-
14
-
15
- class VideoPipeline:
16
- def __init__(self, settings: Settings, store: JobStore) -> None:
17
- self.settings = settings
18
- self.store = store
19
- self.transcriber = WhisperTranscriber(settings)
20
- self.highlight_detector = QwenHighlightDetector(settings)
21
- self.visual_analyzer = QwenVisualAnalyzer(settings)
22
- self.clip_generator = ClipGenerator(settings, store)
23
-
24
- async def process_source(
25
- self,
26
- job_id: str,
27
- source_kind: str,
28
- source_value: str,
29
- profile: ChannelProfile,
30
- ) -> None:
31
- timings = TimingLog()
32
- try:
33
- self.store.update_job(
34
- job_id,
35
- status="running",
36
- progress=0.04,
37
- message="Preparing video input",
38
- current_step="input",
39
- step_index=1,
40
- step_total=6,
41
- )
42
- with timings.measure("input"):
43
- if source_kind == "youtube":
44
- video_path = await resolve_youtube_url(
45
- source_value, self.store.job_dir(job_id), self.settings
46
- )
47
- else:
48
- video_path = Path(source_value)
49
-
50
- self.store.update_job(
51
- job_id,
52
- progress=0.18,
53
- message="Transcribing with Whisper Large V3",
54
- current_step="transcription",
55
- step_index=2,
56
- step_total=6,
57
- )
58
- with timings.measure("transcription"):
59
- transcript = await asyncio.to_thread(
60
- self.transcriber.transcribe, str(video_path), profile
61
- )
62
- self.store.write_json(
63
- job_id,
64
- "transcript.json",
65
- [segment.model_dump(mode="json") for segment in transcript],
66
- )
67
- self.store.update_job(
68
- job_id,
69
- progress=0.42,
70
- message="Transcript ready",
71
- transcript=transcript,
72
- timings=timings.to_dict(),
73
- )
74
-
75
- self.store.update_job(
76
- job_id,
77
- progress=0.48,
78
- message="Scoring highlights with Qwen",
79
- current_step="highlight_detection",
80
- step_index=3,
81
- step_total=6,
82
- )
83
- with timings.measure("highlight_detection"):
84
- clips = await asyncio.to_thread(self.highlight_detector.detect, transcript, profile)
85
-
86
- self.store.update_job(
87
- job_id,
88
- progress=0.62,
89
- message="Checking visual highlights",
90
- current_step="multimodal_analysis",
91
- step_index=4,
92
- step_total=6,
93
- )
94
- with timings.measure("multimodal_analysis"):
95
- clips = await asyncio.to_thread(self.visual_analyzer.enrich, str(video_path), clips)
96
-
97
- clip_total = len(clips)
98
- self.store.update_job(
99
- job_id,
100
- progress=0.72,
101
- message=f"Preparing to render {clip_total} clips",
102
- current_step="clip_generation",
103
- step_index=5,
104
- step_total=6,
105
- active_clip_index=0,
106
- active_clip_total=clip_total,
107
- )
108
-
109
- def update_render_progress(index: int, total: int) -> None:
110
- progress = 0.72 + (0.22 * ((index - 1) / max(total, 1)))
111
- self.store.update_job(
112
- job_id,
113
- progress=min(progress, 0.94),
114
- message=f"Rendering clip {index}/{total}",
115
- current_step="clip_generation",
116
- step_index=5,
117
- step_total=6,
118
- active_clip_index=index,
119
- active_clip_total=total,
120
- timings=timings.to_dict(),
121
- )
122
-
123
- with timings.measure("clip_generation"):
124
- rendered = await asyncio.to_thread(
125
- self.clip_generator.generate,
126
- job_id,
127
- video_path,
128
- clips,
129
- transcript,
130
- profile,
131
- update_render_progress,
132
- )
133
-
134
- self.store.update_job(
135
- job_id,
136
- progress=0.97,
137
- message="Finalizing clips",
138
- current_step="finalizing",
139
- step_index=6,
140
- step_total=6,
141
- active_clip_index=clip_total,
142
- active_clip_total=clip_total,
143
- timings=timings.to_dict(),
144
- )
145
- self.store.write_json(
146
- job_id, "clips.json", [clip.model_dump(mode="json") for clip in rendered]
147
- )
148
- self.store.update_job(
149
- job_id,
150
- status="completed",
151
- progress=1,
152
- message="Clips ready",
153
- current_step="completed",
154
- step_index=6,
155
- step_total=6,
156
- active_clip_index=clip_total,
157
- active_clip_total=clip_total,
158
- transcript=transcript,
159
- clips=rendered,
160
- timings=timings.to_dict(),
161
- )
162
- except Exception as exc:
163
- self.store.update_job(
164
- job_id,
165
- status="failed",
166
- progress=1,
167
- message="Processing failed",
168
- current_step="failed",
169
- error=str(exc),
170
- timings=timings.to_dict(),
171
- )
172
-
173
- def patch_clip(self, job_id: str, clip_id: str, updates: dict) -> ClipCandidate:
174
- snapshot = self.store.get_job(job_id)
175
- patched: ClipCandidate | None = None
176
- clips: list[ClipCandidate] = []
177
- for clip in snapshot.clips:
178
- if clip.id == clip_id:
179
- clean_updates = {key: value for key, value in updates.items() if value is not None}
180
- clip = clip.model_copy(update=clean_updates)
181
- if clip.end_seconds <= clip.start_seconds:
182
- clip = clip.model_copy(update={"end_seconds": clip.start_seconds + 1})
183
- patched = clip
184
- clips.append(clip)
185
- if patched is None:
186
- raise KeyError(clip_id)
187
- self.store.update_job(job_id, clips=clips)
188
- return patched
189
-
190
- def regenerate_clip(
191
- self,
192
- job_id: str,
193
- clip_id: str,
194
- clip_style: str | None = None,
195
- clip_length_seconds: int | None = None,
196
- subtitle_text: str | None = None,
197
- ) -> ClipCandidate:
198
- snapshot = self.store.get_job(job_id)
199
- source_path = self._source_path(job_id)
200
- clips: list[ClipCandidate] = []
201
- regenerated: ClipCandidate | None = None
202
- for index, clip in enumerate(snapshot.clips, start=1):
203
- if clip.id == clip_id:
204
- profile = snapshot.profile.model_copy(
205
- update={
206
- key: value
207
- for key, value in {
208
- "clip_style": clip_style,
209
- "clip_length_seconds": clip_length_seconds,
210
- }.items()
211
- if value is not None
212
- }
213
- )
214
- if clip_length_seconds is not None:
215
- clip = clip.model_copy(
216
- update={"end_seconds": clip.start_seconds + clip_length_seconds}
217
- )
218
- if subtitle_text is not None:
219
- clip = clip.model_copy(update={"subtitle_text": subtitle_text})
220
- clip = self.clip_generator.render_one(
221
- job_id, source_path, clip, snapshot.transcript, profile, index
222
- )
223
- clip.metadata["regenerated"] = True
224
- regenerated = clip
225
- clips.append(clip)
226
- if regenerated is None:
227
- raise KeyError(clip_id)
228
- self.store.update_job(job_id, clips=clips)
229
- return regenerated
230
-
231
- def _source_path(self, job_id: str) -> Path:
232
- job_dir = self.store.job_dir(job_id)
233
- matches = sorted(job_dir.glob("source.*"))
234
- if not matches:
235
- raise FileNotFoundError("source video missing")
236
- return matches[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/subtitles.py DELETED
@@ -1,151 +0,0 @@
1
- import re
2
- from pathlib import Path
3
-
4
- from app.models.schemas import TranscriptSegment
5
-
6
-
7
def seconds_to_srt_time(value: float) -> str:
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded to the nearest millisecond. Negative offsets are
    clamped to zero: SRT has no representation for them, and floor-based
    ``divmod`` would otherwise produce output such as ``-1:59:59,000``.
    """
    total_millis = max(0, int(round(value * 1000)))
    hours, remainder = divmod(total_millis, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
13
-
14
-
15
def write_srt(
    path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]
) -> list[dict]:
    """Write an SRT file for the transcript segments that overlap a clip.

    Args:
        path: Destination ``.srt`` file (overwritten, UTF-8).
        clip_start: Clip start in source-video seconds.
        clip_end: Clip end in source-video seconds.
        segments: Transcript segments timed against the source video.

    Returns:
        The cue dicts (``start_seconds``, ``end_seconds``, ``text``) written,
        timed relative to the clip start. When nothing overlaps, a single
        empty 3-second cue is written and returned.
    """
    cues: list[dict] = []
    rows: list[str] = []
    for segment in segments:
        # Require a real overlap. A segment that merely touches the boundary
        # (ends exactly at clip_start, or starts exactly at clip_end) has zero
        # duration inside the clip and would otherwise be stretched into a
        # 1.2 s cue of speech that is not in the clip.
        if segment.end_seconds <= clip_start or segment.start_seconds >= clip_end:
            continue
        start = max(0.0, segment.start_seconds - clip_start)
        end = min(clip_end - clip_start, segment.end_seconds - clip_start)
        # Give every segment at least 1.2 s on screen so it stays readable.
        for cue in split_timed_caption(segment.text, start, max(end, start + 1.2)):
            rows.extend(
                _srt_row(len(cues) + 1, cue["start_seconds"], cue["end_seconds"], cue["text"])
            )
            cues.append(cue)
    if not rows:
        # Always emit one placeholder cue so the file is never empty.
        cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}]
        rows = _srt_row(1, 0.0, 3.0, "")
    path.write_text("\n".join(rows), encoding="utf-8")
    return cues
35
-
36
-
37
def write_single_caption_srt(path: Path, duration: float, text: str) -> list[dict]:
    """Write an SRT file that spreads one caption text over a whole clip.

    The clip length is treated as at least one second. The text is split into
    timed cues starting at 0.0; if the text yields no cues, a single empty cue
    of at most three seconds is written instead.

    Returns:
        The list of cue dicts that were written.
    """
    clip_length = max(duration, 1.0)
    cues = split_timed_caption(text, 0.0, clip_length)
    if cues:
        rows = [
            line
            for number, cue in enumerate(cues, start=1)
            for line in _srt_row(number, cue["start_seconds"], cue["end_seconds"], cue["text"])
        ]
    else:
        placeholder = {"start_seconds": 0.0, "end_seconds": min(clip_length, 3.0), "text": ""}
        cues = [placeholder]
        rows = _srt_row(1, placeholder["start_seconds"], placeholder["end_seconds"], "")
    path.write_text("\n".join(rows), encoding="utf-8")
    return cues
48
-
49
-
50
def write_srt_from_cues(path: Path, cues: list) -> list[dict]:
    """Write SRT using user-supplied per-cue timing (preferred over auto-distribution).

    Accepts a list of objects with ``.start_seconds`` / ``.end_seconds`` /
    ``.text`` attributes (Pydantic SubtitleCue) or dicts with the same keys.
    Cues whose text is blank are skipped; a cue whose end is not after its
    start is given a one-second duration.

    Returns:
        The cue dicts actually written (times rounded to milliseconds). When
        no cue has text, a single empty 3-second cue is written and returned.
    """

    def field(cue, name, default):
        # Read by key for dicts and by attribute otherwise. Falsy-but-valid
        # values (a 0.0 start time, an empty text) must be returned as-is:
        # the former ``getattr(...) or cue.get(...)`` treated them as missing
        # and then called ``.get`` on a non-dict object.
        if isinstance(cue, dict):
            value = cue.get(name, default)
        else:
            value = getattr(cue, name, default)
        return default if value is None else value

    rows: list[str] = []
    out_cues: list[dict] = []
    for cue in cues:
        start = float(field(cue, "start_seconds", 0))
        end = float(field(cue, "end_seconds", 0))
        clean_text = str(field(cue, "text", "")).strip()
        if end <= start:
            end = start + 1.0
        if not clean_text:
            continue
        rows.extend(_srt_row(len(out_cues) + 1, start, end, clean_text))
        out_cues.append(
            {"start_seconds": round(start, 3), "end_seconds": round(end, 3), "text": clean_text}
        )
    if not rows:
        out_cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}]
        rows = _srt_row(1, 0.0, 3.0, "")
    path.write_text("\n".join(rows), encoding="utf-8")
    return out_cues
76
-
77
-
78
def split_timed_caption(text: str, start: float, end: float) -> list[dict]:
    """Split a caption into short phrases and assign each a time window.

    The text is broken into phrases, merged down when there are more phrases
    than 1.2-second slots in the window, and then laid out back to back from
    ``start``. Each cue lasts between 1.2 and 4.0 seconds where the window
    allows, and the last cue always ends at ``end``.

    Returns:
        A list of dicts with ``start_seconds``, ``end_seconds`` (rounded to
        milliseconds) and ``text``; empty when the text has no content.
    """
    phrases = split_caption_text(text)
    if not phrases:
        return []

    window = max(end - start, 1.2)
    slot_limit = max(1, int(window / 1.2))
    if len(phrases) > slot_limit:
        phrases = _merge_phrases(phrases, slot_limit)

    phrase_count = len(phrases)
    per_cue = min(4.0, max(1.2, window / phrase_count))
    timed: list[dict] = []
    position = start
    for offset, phrase in enumerate(phrases):
        if offset == phrase_count - 1:
            # The final cue is pinned to the end of the window.
            stop = end
        else:
            still_to_place = phrase_count - offset
            # Leave at least 1.2 s for each phrase that follows.
            latest_stop = end - ((still_to_place - 1) * 1.2)
            stop = max(min(latest_stop, position + per_cue), position + 1.2)
        timed.append(
            {
                "start_seconds": round(position, 3),
                "end_seconds": round(max(stop, position + 0.8), 3),
                "text": phrase,
            }
        )
        position = stop
    return timed
107
-
108
-
109
def split_caption_text(text: str, max_chars: int = 42, max_words: int = 7) -> list[str]:
    """Break caption text into short display phrases.

    Whitespace is collapsed first. Text without any spaces is cut into
    fixed-width slices of ``max_chars``. Otherwise words are accumulated into
    a phrase until adding the next word would exceed ``max_chars``, the phrase
    already holds ``max_words`` words, or the previous word ends in
    punctuation (``, . ! ? ; :``).

    Returns:
        The phrases in order; an empty list for blank input.
    """
    normalized = re.sub(r"\s+", " ", text.strip())
    if not normalized:
        return []

    tokens = normalized.split()
    if len(tokens) <= 1:
        return [
            normalized[offset : offset + max_chars]
            for offset in range(0, len(normalized), max_chars)
        ]

    lines: list[str] = []
    pending: list[str] = []
    for token in tokens:
        if pending:
            too_long = len(" ".join(pending + [token])) > max_chars
            too_many = len(pending) >= max_words
            after_punctuation = re.search(r"[,.!?;:]$", pending[-1]) is not None
            if too_long or too_many or after_punctuation:
                lines.append(" ".join(pending))
                pending = []
        pending.append(token)
    if pending:
        lines.append(" ".join(pending))
    return lines
131
-
132
-
133
- def _merge_phrases(phrases: list[str], target_count: int) -> list[str]:
134
- if target_count <= 1:
135
- return [" ".join(phrases)]
136
- merged: list[str] = []
137
- bucket_size = len(phrases) / target_count
138
- for index in range(target_count):
139
- start = round(index * bucket_size)
140
- end = round((index + 1) * bucket_size)
141
- merged.append(" ".join(phrases[start:end]).strip())
142
- return [phrase for phrase in merged if phrase]
143
-
144
-
145
def _srt_row(index: int, start: float, end: float, text: str) -> list[str]:
    """Build the four lines of one SRT entry.

    The lines are: the cue number, the ``start --> end`` timing line, the
    stripped text, and an empty string that becomes the blank separator line
    once the rows are joined with newlines.
    """
    timing = " --> ".join((seconds_to_srt_time(start), seconds_to_srt_time(end)))
    return [str(index), timing, text.strip(), ""]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/transcription.py DELETED
@@ -1,366 +0,0 @@
1
- from pathlib import Path
2
- from uuid import uuid4
3
-
4
- from app.core.config import Settings
5
- from app.models.schemas import ChannelProfile, SubtitleCue, TranscriptSegment
6
- from app.utils.rocm import torch_device_index
7
-
8
-
9
class WhisperTranscriber:
    """Transcription front end backed by a lazily built Hugging Face ASR pipeline.

    When ``settings.demo_mode`` is set, no model is loaded: canned transcript
    lines and evenly spaced cue timings are produced instead.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        # Built on first real use and then shared by every call.
        self._pipeline = None

    def _get_pipeline(self, purpose: str):
        """Return the cached ASR pipeline, building it on first use.

        A single pipeline serves both segment-level and word-level requests;
        callers choose the timestamp granularity per call through
        ``return_timestamps`` instead of relying on the construction default.

        Args:
            purpose: Short description used in the error raised when
                ``transformers`` cannot be imported.

        Raises:
            RuntimeError: if ``transformers`` is not importable.
        """
        if self._pipeline is None:
            try:
                from transformers import pipeline
            except Exception as exc:
                raise RuntimeError(f"transformers is required for {purpose}") from exc
            self._pipeline = pipeline(
                task="automatic-speech-recognition",
                model=self.settings.whisper_model_id,
                device=torch_device_index(),
                token=self.settings.hf_token,
                chunk_length_s=30,
                return_timestamps=True,
            )
        return self._pipeline

    def transcribe(self, video_path: str, profile: ChannelProfile) -> list[TranscriptSegment]:
        """Transcribe a video into timed transcript segments.

        In demo mode a canned transcript derived from ``profile`` is returned.
        Otherwise the ASR pipeline is run; the language is forced to
        ``profile.primary_language`` unless that is empty or ``"auto"``.

        Returns:
            One segment per non-empty ASR chunk. If the pipeline reports no
            chunks, a single segment holding the full text is returned.
        """
        if self.settings.demo_mode:
            return self._demo_transcript(profile)

        asr = self._get_pipeline("Whisper transcription")

        generate_kwargs = {"task": "transcribe"}
        if profile.primary_language and profile.primary_language.lower() != "auto":
            generate_kwargs["language"] = profile.primary_language.lower()

        # Request segment-level timestamps explicitly so the result does not
        # depend on how (or by which method) the shared pipeline was built.
        result = asr(str(video_path), generate_kwargs=generate_kwargs, return_timestamps=True)
        chunks = result.get("chunks") or []
        if not chunks:
            text = result.get("text", "").strip()
            return [
                TranscriptSegment(
                    id=uuid4().hex,
                    start_seconds=0,
                    end_seconds=max(profile.clip_length_seconds, 15),
                    text=text,
                    language=profile.primary_language,
                )
            ]

        segments: list[TranscriptSegment] = []
        for chunk in chunks:
            timestamp = chunk.get("timestamp") or (0, 0)
            start = float(timestamp[0] or 0)
            # A missing end timestamp falls back to five seconds after start.
            end = float(timestamp[1] or start + 5)
            text = (chunk.get("text") or "").strip()
            if text:
                segments.append(
                    TranscriptSegment(
                        id=uuid4().hex,
                        start_seconds=start,
                        end_seconds=max(end, start + 1),
                        text=text,
                        language=profile.primary_language,
                    )
                )
        return segments

    def align_words(
        self,
        video_path: str | Path,
        text: str,
        clip_start: float,
        clip_end: float,
    ) -> list[SubtitleCue]:
        """Estimate per-word/per-phrase timing within [clip_start, clip_end].

        Demo mode (or blank text): split the text into small word groups and
        distribute them evenly across the clip duration. Otherwise: use
        Whisper word-level timestamps, falling back to the even distribution
        if that fails for any reason.

        Returns SubtitleCues with timing relative to clip_start.
        """
        if self.settings.demo_mode or not text.strip():
            return self._demo_align_words(text, clip_start, clip_end)
        try:
            return self._whisper_align_words(video_path, text, clip_start, clip_end)
        except Exception:
            # Best effort: subtitle timing must never fail the whole job.
            return self._demo_align_words(text, clip_start, clip_end)

    def _demo_align_words(
        self, text: str, clip_start: float, clip_end: float
    ) -> list[SubtitleCue]:
        """Spread ``text`` evenly over the clip in groups of two to four words."""
        clean = " ".join(text.split())
        if not clean:
            return [SubtitleCue(start_seconds=0.0, end_seconds=2.0, text="")]
        words = clean.split()
        # Group size grows with the text length but stays within 2..4 words.
        chunk_size = max(2, min(4, max(1, len(words) // 6)))
        chunks: list[str] = []
        for i in range(0, len(words), chunk_size):
            chunks.append(" ".join(words[i : i + chunk_size]))
        duration = max(0.5, clip_end - clip_start)
        per = duration / len(chunks)
        cues: list[SubtitleCue] = []
        for i, chunk in enumerate(chunks):
            cue_start = round(i * per, 3)
            cue_end = round((i + 1) * per, 3)
            cues.append(
                SubtitleCue(
                    start_seconds=cue_start,
                    end_seconds=max(cue_end, cue_start + 0.4),
                    text=chunk,
                )
            )
        return cues

    def _whisper_align_words(
        self, video_path: str | Path, text: str, clip_start: float, clip_end: float
    ) -> list[SubtitleCue]:
        """Build cues from Whisper word-level timestamps inside the clip window.

        Words overlapping [clip_start, clip_end] are converted to clip-relative
        times and grouped three at a time. If no word falls in the window, the
        evenly distributed timing of ``text`` is returned instead.
        """
        asr = self._get_pipeline("word-level timestamps")

        result = asr(
            str(video_path),
            generate_kwargs={"task": "transcribe"},
            return_timestamps="word",
        )
        chunks = result.get("chunks") or []
        # Filter to chunks inside [clip_start, clip_end] and convert to relative time
        cues: list[SubtitleCue] = []
        buffer_words: list[tuple[str, float, float]] = []
        for chunk in chunks:
            ts = chunk.get("timestamp") or (0, 0)
            start = float(ts[0] or 0)
            end = float(ts[1] or start + 0.3)
            word = (chunk.get("text") or "").strip()
            if not word:
                continue
            if end < clip_start or start > clip_end:
                continue
            buffer_words.append(
                (word, max(0.0, start - clip_start), min(clip_end - clip_start, end - clip_start))
            )

        # Group into ~3 word phrases
        chunk_size = 3
        for i in range(0, len(buffer_words), chunk_size):
            group = buffer_words[i : i + chunk_size]
            text_chunk = " ".join(w for w, _, _ in group)
            cue_start = group[0][1]
            cue_end = group[-1][2]
            cues.append(
                SubtitleCue(
                    start_seconds=round(cue_start, 3),
                    end_seconds=round(max(cue_end, cue_start + 0.4), 3),
                    text=text_chunk,
                )
            )
        return cues if cues else self._demo_align_words(text, clip_start, clip_end)

    def _demo_transcript(self, profile: ChannelProfile) -> list[TranscriptSegment]:
        """Produce a canned transcript of consecutive 15-second segments.

        The lines are localized by ``profile.primary_language`` and mention
        the channel's niche, clip style and description.
        """
        style = profile.clip_style.lower()
        language = profile.primary_language.lower()
        niche_value = (
            profile.niche_custom
            if profile.niche.lower() == "other" and profile.niche_custom
            else profile.niche
        )
        niche = niche_value.lower()
        creator_context = (
            profile.channel_description
            or "The creator wants clips that feel useful and easy to share."
        )
        lines = _demo_lines(
            language,
            _localized_profile_word(niche, language, "niche"),
            _localized_profile_word(style, language, "style"),
            creator_context,
        )
        segments: list[TranscriptSegment] = []
        cursor = 0.0
        for line in lines:
            end = cursor + 15.0
            segments.append(
                TranscriptSegment(
                    id=uuid4().hex,
                    start_seconds=cursor,
                    end_seconds=end,
                    text=line,
                    language=profile.primary_language,
                )
            )
            cursor = end
        return segments
207
-
208
-
209
- def _demo_lines(language: str, niche: str, style: str, creator_context: str) -> list[str]:
210
- if "thai" in language:
211
- return [
212
- "ช่วงเปิดนี้วางปัญหาหลักของครีเอเตอร์ เวลาวิดีโอยาวซ่อนช่วงที่ดีที่สุดไว้",
213
- "นี่คือความผิดพลาดที่หลายทีมทำ คือเลือกคลิปจากยอดวิวอย่างเดียว",
214
- "คำถามสำคัญคือ ช่วงไหนที่จะทำให้คนหยุดเลื่อนหน้าจอได้ทันที",
215
- f"สำหรับช่องแนว {niche} คำตอบจะเปลี่ยน เพราะผู้ชมคาดหวังจังหวะที่ {style}",
216
- f"บริบทของช่องคือ {creator_context}",
217
- "ช่วงนี้อธิบายได้ชัดที่สุด และมีภาพเปรียบเทียบก่อนกับหลังที่แรง",
218
- "จากนั้นแขกรับเชิญตอบสนองด้วยประโยคสั้นที่เหมาะมากสำหรับ hook",
219
- "ตรงนี้มีข้อคิดที่เอาไปใช้ได้ทันที และยืนเป็นคลิปสั้นได้ด้วยตัวเอง",
220
- "ช่วงท้ายสรุปไอเดียด้วยประโยคชัด ๆ ที่ทำซับได้ง่าย",
221
- ]
222
- if "japanese" in language:
223
- return [
224
- "この冒頭では、長い動画に最高の瞬間が埋もれてしまう問題を示しています。",
225
- "多くのチームが再生数だけでクリップを選ぶという意外なミスをしています。",
226
- "大事な問いは、この瞬間が今すぐスクロールを止めさせるかどうかです。",
227
- f"{niche} チャンネルでは、視聴者が {style} なテンポを期待するため答えが変わります。",
228
- f"チャンネルの文脈はこうです。{creator_context}",
229
- "この部分は説明が最も明確で、ビフォーアフターの対比も強いです。",
230
- "その後、ゲストが短いフックとして使いやすい一言で反応します。",
231
- "ここには単独の短尺クリップとして成立する実用的な学びがあります。",
232
- "最後の部分は字幕にしやすい明確な一言でアイデアをまとめます。",
233
- ]
234
- if "chinese" in language:
235
- return [
236
- "这个开头点出了创作者常遇到的问题:长视频里藏着最好的瞬间。",
237
- "很多团队都会犯一个意外错误,只根据播放量来选择剪辑片段。",
238
- "关键问题很简单:哪个瞬间能让观众立刻停下滑动?",
239
- f"对于 {niche} 频道,答案会不同,因为观众期待 {style} 的节奏。",
240
- f"频道背景是:{creator_context}",
241
- "这一段解释最清楚,并且有很强的前后对比。",
242
- "接着嘉宾给出一句有冲击力的回应,很适合作为短视频 hook。",
243
- "这里有一个实用结论,足够独立成为一个短视频片段。",
244
- "最后一段用一句清晰的话收束观点,也很适合做字幕。",
245
- ]
246
- if "korean" in language:
247
- return [
248
- "이 오프닝은 긴 영상 속 좋은 순간이 묻히는 문제를 보여줍니다.",
249
- "많은 팀이 조회수만 보고 클립을 고르는 의외의 실수를 합니다.",
250
- "핵심 질문은 간단합니다. 어떤 순간이 시청자의 스크롤을 멈추게 할까요?",
251
- f"{niche} 채널에서는 시청자가 {style} 리듬을 기대하기 때문에 답이 달라집니다.",
252
- f"채널 맥락은 다음과 같습니다. {creator_context}",
253
- "이 부분은 설명이 가장 명확하고 전후 대비도 강합니다.",
254
- "그다음 게스트가 짧은 훅으로 쓰기 좋은 강한 한마디를 합니다.",
255
- "여기에는 단독 숏폼 클립으로도 충분한 실용적인 takeaway가 있습니다.",
256
- "마지막 부분은 자막으로 만들기 쉬운 명확한 문장으로 아이디어를 정리합니다.",
257
- ]
258
- return [
259
- "This opening sets up the main problem creators face when a long video hides the best moments.",
260
- "Here is the surprising mistake most teams make when they choose clips only by view count.",
261
- "The important question is simple: which moment would make someone stop scrolling right now?",
262
- f"For a {niche} channel, the answer changes because the audience expects a {style} rhythm.",
263
- f"The channel context is simple: {creator_context}",
264
- "This section has the clearest explanation and a strong before-and-after contrast.",
265
- "Then the guest reacts with a punchy line that works well as a short hook.",
266
- "A practical takeaway lands here, with enough context to stand alone as a sixty second clip.",
267
- "The final segment wraps the idea with a direct callout that is easy to subtitle.",
268
- ]
269
-
270
-
271
- def _localized_profile_word(value: str, language: str, group: str) -> str:
272
- key = value.lower().replace(" ", "_")
273
- localized = {
274
- "thai": {
275
- "niche": {
276
- "education": "การศึกษา",
277
- "gaming": "เกม",
278
- "podcast": "พอดแคสต์",
279
- "commentary": "เล่า/วิเคราะห์",
280
- "cars": "รถยนต์",
281
- "beauty": "บิวตี้",
282
- "fitness": "ฟิตเนส",
283
- "finance": "การเงิน",
284
- "tech": "เทคโนโลยี",
285
- "lifestyle": "ไลฟ์สไตล์",
286
- "music": "ดนตรี",
287
- },
288
- "style": {
289
- "informative": "ให้ข้อมูล",
290
- "funny": "ตลก",
291
- "dramatic": "ดราม่า",
292
- "educational": "สอนเข้าใจง่าย",
293
- "commentary": "วิเคราะห์",
294
- },
295
- },
296
- "japanese": {
297
- "niche": {
298
- "education": "教育",
299
- "gaming": "ゲーム",
300
- "podcast": "ポッドキャスト",
301
- "commentary": "解説",
302
- "cars": "車",
303
- "beauty": "美容",
304
- "fitness": "フィットネス",
305
- "finance": "金融",
306
- "tech": "テック",
307
- "lifestyle": "ライフスタイル",
308
- "music": "音楽",
309
- },
310
- "style": {
311
- "informative": "情報性の高い",
312
- "funny": "ユーモアのある",
313
- "dramatic": "ドラマチックな",
314
- "educational": "学びやすい",
315
- "commentary": "解説型の",
316
- },
317
- },
318
- "chinese": {
319
- "niche": {
320
- "education": "教育",
321
- "gaming": "游戏",
322
- "podcast": "播客",
323
- "commentary": "解说",
324
- "cars": "汽车",
325
- "beauty": "美妆",
326
- "fitness": "健身",
327
- "finance": "金融",
328
- "tech": "科技",
329
- "lifestyle": "生活方式",
330
- "music": "音乐",
331
- },
332
- "style": {
333
- "informative": "信息量高",
334
- "funny": "有趣",
335
- "dramatic": "戏剧化",
336
- "educational": "教学型",
337
- "commentary": "评论型",
338
- },
339
- },
340
- "korean": {
341
- "niche": {
342
- "education": "교육",
343
- "gaming": "게임",
344
- "podcast": "팟캐스트",
345
- "commentary": "해설",
346
- "cars": "자동차",
347
- "beauty": "뷰티",
348
- "fitness": "피트니스",
349
- "finance": "금융",
350
- "tech": "테크",
351
- "lifestyle": "라이프스타일",
352
- "music": "음악",
353
- },
354
- "style": {
355
- "informative": "정보형",
356
- "funny": "재미있는",
357
- "dramatic": "극적인",
358
- "educational": "교육형",
359
- "commentary": "해설형",
360
- },
361
- },
362
- }
363
- for language_key, groups in localized.items():
364
- if language_key in language:
365
- return groups.get(group, {}).get(key, value)
366
- return value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/services/video_input.py DELETED
@@ -1,80 +0,0 @@
1
- import asyncio
2
- import shutil
3
- import subprocess
4
- from pathlib import Path
5
-
6
- from fastapi import UploadFile
7
-
8
- from app.core.config import Settings
9
-
10
-
11
async def save_upload(upload: UploadFile, job_dir: Path) -> Path:
    """Stream an uploaded file into the job directory as ``source.<ext>``.

    The extension is taken from the uploaded filename (lower-cased), falling
    back to ``.mp4`` when the name is missing or has no extension. The body
    is copied in 1 MiB reads.

    Returns:
        The path of the saved file.
    """
    extension = Path(upload.filename or "upload.mp4").suffix or ".mp4"
    target = job_dir / ("source" + extension.lower())
    with target.open("wb") as sink:
        while True:
            block = await upload.read(1024 * 1024)
            if not block:
                break
            sink.write(block)
    return target
18
-
19
-
20
async def resolve_youtube_url(url: str, job_dir: Path, settings: Settings) -> Path:
    """Fetch the video behind ``url`` into ``job_dir`` as ``source.<ext>``.

    In demo mode nothing is downloaded; a synthetic demo video is generated
    instead. Otherwise yt-dlp downloads the video in a worker thread.

    Raises:
        RuntimeError: if yt-dlp cannot be imported, or if it finishes
            without leaving a ``source.*`` file in ``job_dir``.
    """
    if settings.demo_mode:
        return await asyncio.to_thread(create_demo_video, job_dir, settings)

    try:
        import yt_dlp
    except Exception as exc:
        raise RuntimeError("yt-dlp is required for YouTube ingestion") from exc

    options = dict(
        outtmpl=str(job_dir / "source.%(ext)s"),
        format="bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/best",
        merge_output_format="mp4",
        quiet=True,
        noprogress=True,
    )

    def fetch() -> Path:
        # Runs in a worker thread: the download is blocking.
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([url])
        produced = sorted(job_dir.glob("source.*"))
        if produced:
            return produced[0]
        raise RuntimeError("yt-dlp finished without producing a video")

    return await asyncio.to_thread(fetch)
47
-
48
-
49
def create_demo_video(job_dir: Path, settings: Settings) -> Path:
    """Create a synthetic two-minute demo video at ``job_dir / "source.mp4"``.

    ffmpeg renders a 1280x720 test pattern with a 660 Hz sine tone. When the
    ffmpeg binary cannot be found, or the render fails or exceeds 45 seconds,
    an empty placeholder file is written instead so that a source file
    always exists.

    Returns:
        The path of the (possibly empty) ``source.mp4``.
    """
    target = job_dir / "source.mp4"
    ffmpeg_path = shutil.which(settings.ffmpeg_binary)
    if ffmpeg_path:
        video_input = ["-f", "lavfi", "-i", "testsrc2=size=1280x720:rate=30:duration=120"]
        audio_input = ["-f", "lavfi", "-i", "sine=frequency=660:sample_rate=48000:duration=120"]
        encoding = ["-shortest", "-c:v", "libx264", "-pix_fmt", "yuv420p", "-c:a", "aac"]
        argv = [ffmpeg_path, "-y", *video_input, *audio_input, *encoding, str(target)]
        try:
            subprocess.run(argv, check=True, capture_output=True, text=True, timeout=45)
        except Exception:
            # Best effort: fall back to an empty placeholder file.
            target.write_bytes(b"")
    else:
        target.write_bytes(b"")
    return target
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/storage.py DELETED
@@ -1,58 +0,0 @@
1
- import json
2
- from pathlib import Path
3
- from uuid import uuid4
4
-
5
- from app.core.config import Settings
6
- from app.models.schemas import ChannelProfile, JobSnapshot, utc_now
7
-
8
-
9
class JobStore:
    """File-based job storage: one directory per job under ``<storage_dir>/jobs``.

    Each job's state lives in ``job.json`` inside its directory. Files are
    written through a temporary file and an atomic rename so that a reader
    (for example a status request arriving while a job is being updated)
    never observes a half-written file.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.root = settings.storage_dir
        self.jobs_root = self.root / "jobs"
        self.jobs_root.mkdir(parents=True, exist_ok=True)

    def create_job(self, profile: ChannelProfile, source: dict) -> JobSnapshot:
        """Create a new queued job with a fresh id and persist it."""
        job_id = uuid4().hex
        job_dir = self.job_dir(job_id)
        job_dir.mkdir(parents=True, exist_ok=True)
        snapshot = JobSnapshot(
            id=job_id,
            status="queued",
            progress=0,
            message="Queued",
            source=source,
            profile=profile,
        )
        self.save_job(snapshot)
        return snapshot

    def job_dir(self, job_id: str) -> Path:
        """Return the directory that holds a job's files."""
        return self.jobs_root / job_id

    def media_url(self, job_id: str, filename: str) -> str:
        """Return the URL path under which a job file is served."""
        return f"/media/jobs/{job_id}/{filename}"

    def save_job(self, snapshot: JobSnapshot) -> JobSnapshot:
        """Stamp ``updated_at`` on the snapshot and write it to ``job.json``."""
        snapshot.updated_at = utc_now()
        path = self.job_dir(snapshot.id) / "job.json"
        self._write_text_atomic(path, snapshot.model_dump_json(indent=2))
        return snapshot

    def get_job(self, job_id: str) -> JobSnapshot:
        """Load a job snapshot.

        Raises:
            FileNotFoundError: if the job has no ``job.json``.
        """
        path = self.job_dir(job_id) / "job.json"
        if not path.exists():
            raise FileNotFoundError(job_id)
        data = json.loads(path.read_text(encoding="utf-8"))
        return JobSnapshot.model_validate(data)

    def update_job(self, job_id: str, **updates) -> JobSnapshot:
        """Load a job, apply field updates and persist the result."""
        snapshot = self.get_job(job_id)
        updated = snapshot.model_copy(update=updates)
        return self.save_job(updated)

    def write_json(self, job_id: str, filename: str, payload: object) -> Path:
        """Write ``payload`` as pretty-printed UTF-8 JSON into the job directory."""
        path = self.job_dir(job_id) / filename
        self._write_text_atomic(path, json.dumps(payload, indent=2, ensure_ascii=False))
        return path

    @staticmethod
    def _write_text_atomic(path: Path, text: str) -> None:
        """Write ``text`` to ``path`` via a sibling temp file and a rename.

        The temp file lives in the same directory, so the final rename stays
        on one filesystem. Its dot-prefixed, ``.tmp``-suffixed name keeps it
        from matching patterns such as ``source.*``.
        """
        scratch = path.with_name(f".{path.name}.{uuid4().hex}.tmp")
        try:
            scratch.write_text(text, encoding="utf-8")
            scratch.replace(path)
        except BaseException:
            # Do not leave scratch files behind when the write is interrupted.
            scratch.unlink(missing_ok=True)
            raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/utils/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Runtime helpers."""
 
 
backend/app/utils/rocm.py DELETED
@@ -1,33 +0,0 @@
1
- from typing import Any
2
-
3
-
4
def detect_accelerator() -> dict[str, Any]:
    """Report what PyTorch can see of the GPU runtime.

    The returned dict always has the same keys, whether or not ``torch`` can
    be imported, so callers can read any field without checking for it:

    - ``torch_available``: whether ``import torch`` succeeded.
    - ``cuda_api_available``: ``torch.cuda.is_available()``.
    - ``rocm_hip_version`` / ``cuda_version``: ``torch.version.hip`` /
      ``torch.version.cuda`` when present, else ``None``.
    - ``device_name``: name of device 0 when the CUDA API is available.
    - ``device_count``: number of devices (0 when unavailable).
    - ``error``: the import error message, or ``None`` on success.
    """
    report: dict[str, Any] = {
        "torch_available": False,
        "cuda_api_available": False,
        "rocm_hip_version": None,
        "cuda_version": None,
        "device_name": None,
        "device_count": 0,
        "error": None,
    }
    try:
        import torch
    except Exception as exc:
        report["error"] = str(exc)
        return report

    cuda_available = bool(torch.cuda.is_available())
    report.update(
        torch_available=True,
        cuda_api_available=cuda_available,
        rocm_hip_version=getattr(torch.version, "hip", None),
        cuda_version=getattr(torch.version, "cuda", None),
        device_name=torch.cuda.get_device_name(0) if cuda_available else None,
        device_count=torch.cuda.device_count() if cuda_available else 0,
    )
    return report
26
-
27
-
28
def torch_device_index() -> int:
    """Return the device index to run on: 0 when torch reports an available
    CUDA-API device, otherwise -1 (also when torch cannot be imported)."""
    try:
        import torch
    except Exception:
        return -1
    if torch.cuda.is_available():
        return 0
    return -1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/workers/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Optional async workers."""
 
 
backend/app/workers/celery_app.py DELETED
@@ -1,15 +0,0 @@
1
- from celery import Celery
2
-
3
- from app.core.config import get_settings
4
-
5
settings = get_settings()

# Redis serves as both the message broker and the result backend.
celery_app = Celery("ai_clip_studio", broker=settings.redis_url, backend=settings.redis_url)

# Exchange tasks and results as JSON only.
celery_app.conf.update(
    task_serializer="json",
    result_serializer="json",
    accept_content=["json"],
)
11
-
12
-
13
@celery_app.task(name="pipeline.process_job")
def process_job(job_id: str) -> str:
    """Placeholder task: performs no processing and only returns a notice
    naming the job."""
    notice = "Queued job {}. FastAPI background tasks are active by default."
    return notice.format(job_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/main.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ElevenClip AI — FastAPI Backend.
2
+
3
+ Endpoints:
4
+ POST /api/video-info — get YouTube metadata (no download)
5
+ POST /api/process — full pipeline (download/upload → clips)
6
+ WS /ws/progress/{session} — real-time pipeline progress
7
+ GET /api/clips/{session} — list generated clips
8
+ PATCH /api/clips/{session}/{index}/subtitles — update subtitle event
9
+ PATCH /api/clips/{session}/{index}/style — apply global style override
10
+ POST /api/clips/{session}/{index}/render — burn-in subtitles → download
11
+ GET /downloads/{session}/{filename} — serve output files
12
+ """
13
+ import asyncio
14
+ import json
15
+ import os
16
+ import uuid
17
+ from pathlib import Path
18
+ from typing import Optional
19
+
20
+ from fastapi import FastAPI, UploadFile, File, Form, Header, WebSocket, WebSocketDisconnect, HTTPException
21
+ from fastapi.middleware.cors import CORSMiddleware
22
+ from fastapi.staticfiles import StaticFiles
23
+ from pydantic import BaseModel
24
+ from loguru import logger
25
+
26
+ from src.gpu.rocm_utils import get_device, log_gpu_status
27
+ from src.gpu.vllm_manager import ensure_vllm_running, vllm_stop, vllm_status
28
+ from src.ingestion.youtube import download_video_async, get_video_info
29
+ from src.transcription.whisper import transcribe_async, extract_audio
30
+ from src.analysis.scene_detector import detect_scenes, sample_frames
31
+ from src.analysis.vision import analyze_scenes_batch_async
32
+ from src.analysis.highlight_scorer import score_scenes, select_top_clips
33
+ from src.processing.clip_extractor import extract_all_clips_async, burn_subtitles
34
+ from src.processing.subtitle import generate_subtitles, update_subtitle_event, apply_global_style_override
35
+ from src.processing.high_retention import apply_hre
36
+
37
app = FastAPI(title="ElevenClip AI", version="1.0.0")

# Allowed CORS origins: comma-separated list in CORS_ALLOW_ORIGINS, default "*".
_cors_origins = [
    origin.strip() for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",") if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # A wildcard origin must not be combined with credentialed requests, so
    # credentials are enabled only when explicit origins are configured.
    # The demo access code is read from the X-Demo-Key header, not a cookie.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Working directory for per-session files; also served under /downloads.
WORK_DIR = Path(os.getenv("WORK_DIR", "/tmp/elevnclip"))
WORK_DIR.mkdir(parents=True, exist_ok=True)

# Optional shared secret; when empty, the access check is disabled.
DEMO_ACCESS_CODE = os.getenv("DEMO_ACCESS_CODE", "").strip()
MAX_CONCURRENT_JOBS = int(os.getenv("MAX_CONCURRENT_JOBS", "1"))
MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "500"))

app.mount("/downloads", StaticFiles(directory=str(WORK_DIR)), name="downloads")

# In-memory session store + WebSocket registry
sessions: dict[str, dict] = {}
ws_connections: dict[str, WebSocket] = {}
ws_queues: dict[str, list[dict]] = {}  # buffer progress messages until WS connects
active_jobs: set[str] = set()
61
+
62
+
63
def _require_access(x_demo_key: Optional[str]) -> None:
    """Optional public-demo guard for expensive GPU endpoints.

    Does nothing when no access code is configured. Otherwise the supplied
    key (surrounding whitespace ignored) must equal ``DEMO_ACCESS_CODE``.

    Raises:
        HTTPException: 403 when a code is configured and the key does not match.
    """
    import hmac

    if not DEMO_ACCESS_CODE:
        return
    supplied = (x_demo_key or "").strip()
    # Constant-time comparison so response timing does not reveal how much of
    # the code matched. Compare bytes: compare_digest rejects non-ASCII str.
    if not hmac.compare_digest(supplied.encode("utf-8"), DEMO_ACCESS_CODE.encode("utf-8")):
        raise HTTPException(403, "Access code required for generation")
67
+
68
+
69
+ # ─── Startup ──────────────────────────────────────────────────────────────
70
+
71
@app.on_event("startup")
async def startup():
    """Log the GPU status and register the built-in ``demo`` session."""
    log_gpu_status()
    # Pre-populate demo session so /editor?session=demo always works
    demo_session = {"status": "done", "clips": _build_demo_clips()}
    sessions["demo"] = demo_session
76
+
77
+
78
+ def _build_demo_clips() -> list[dict]:
79
+ return [
80
+ {
81
+ "index": 1, "start": 0.0, "end": 45.0, "duration": 45.0, "score": 0.91,
82
+ "clip_path": None, "final_path": None, "ass_path": None,
83
+ "download_url": None, "raw_url": None,
84
+ "highlight_reason": "High-energy moment with peak audience reaction",
85
+ "vision_analysis": {"excitement_score": 0.92, "tiktok_potential": 0.89, "emotion": "excited", "action_type": "gaming"},
86
+ },
87
+ {
88
+ "index": 2, "start": 90.0, "end": 150.0, "duration": 60.0, "score": 0.83,
89
+ "clip_path": None, "final_path": None, "ass_path": None,
90
+ "download_url": None, "raw_url": None,
91
+ "highlight_reason": "Funny reaction — peak humor level detected",
92
+ "vision_analysis": {"excitement_score": 0.78, "tiktok_potential": 0.85, "emotion": "funny", "action_type": "reaction"},
93
+ },
94
+ {
95
+ "index": 3, "start": 210.0, "end": 270.0, "duration": 60.0, "score": 0.76,
96
+ "clip_path": None, "final_path": None, "ass_path": None,
97
+ "download_url": None, "raw_url": None,
98
+ "highlight_reason": "Educational highlight with strong engagement signal",
99
+ "vision_analysis": {"excitement_score": 0.70, "tiktok_potential": 0.80, "emotion": "happy", "action_type": "tutorial"},
100
+ },
101
+ ]
102
+
103
+
104
+ # ─── WebSocket Progress ────────────────────────────────────────────────────
105
+
106
@app.websocket("/ws/progress/{session_id}")
async def ws_progress(websocket: WebSocket, session_id: str):
    """Stream pipeline progress for one session over a WebSocket.

    On connect the socket is registered for the session and any progress
    messages buffered before the connection are flushed. The handler then
    waits on the socket until the client disconnects and unregisters it.
    """
    await websocket.accept()
    ws_connections[session_id] = websocket

    # Flush messages that were sent before the WS connected
    for msg in ws_queues.pop(session_id, []):
        try:
            await websocket.send_json(msg)
        except Exception:
            break

    try:
        while True:
            # Waiting on receive is what surfaces a client disconnect; a bare
            # sleep loop never raises WebSocketDisconnect, so the handler and
            # its registry entry would outlive the connection.
            await websocket.receive_text()
    except WebSocketDisconnect:
        pass
    finally:
        # Only unregister our own socket: a newer connection for the same
        # session may already have replaced this entry.
        if ws_connections.get(session_id) is websocket:
            ws_connections.pop(session_id, None)
123
+
124
+
125
async def send_progress(session_id: str, stage: str, pct: int, message: str = ""):
    """Publish a progress update for a session.

    The update is stored as the session's ``last_progress`` and sent to the
    session's WebSocket when one is registered. If there is no socket, or the
    send fails (the socket is then unregistered), the update is queued for
    delivery when a socket connects.
    """
    update = dict(stage=stage, pct=pct, message=message)
    session = sessions.setdefault(session_id, {})
    session["last_progress"] = update

    delivered = False
    socket = ws_connections.get(session_id)
    if socket:
        try:
            await socket.send_json(update)
            delivered = True
        except Exception:
            ws_connections.pop(session_id, None)

    if not delivered:
        # WS not yet connected — buffer for flush on connect
        ws_queues.setdefault(session_id, []).append(update)
139
+
140
+
141
+ # ─── Models ───────────────────────────────────────────────────────────────
142
+
143
class VideoInfoRequest(BaseModel):
    """Request body for ``POST /api/video-info``."""

    # Address of the video whose metadata is requested.
    url: str
145
+
146
+ DEMO_VIDEO_DIR = Path("/root/ElevenClip-AI/demo_videos")
147
+ _DEMO_CANDIDATES = ["demo1.mp4", "demo2.mp4", "demo.mp4"]
148
+
149
+ def _get_demo_video() -> Path | None:
150
+ import random
151
+ available = [DEMO_VIDEO_DIR / f for f in _DEMO_CANDIDATES if (DEMO_VIDEO_DIR / f).exists()]
152
+ return random.choice(available) if available else None
153
+
154
# Settings parsed from the `settings_json` form field of POST /api/process.
class ProcessSettings(BaseModel):
    # Source selection: _run_pipeline tries the demo video first (when requested
    # and available), then youtube_url, then the uploaded file.
    youtube_url: Optional[str] = None
    use_demo_video: bool = False
    # Passed to the vision analysis prompt as channel context.
    channel_description: str = ""
    # Used by both the vision prompt and the keyword scoring.
    clip_style: str = "entertaining"
    # Target clip length passed to scoring and clip selection
    # (presumably seconds — confirm against the frontend).
    target_duration: int = 60
    # Number of clips requested from select_top_clips().
    clip_count: int = 3
    # Language hints forwarded to transcribe_async().
    clip_language: str = "auto"
    subtitle_language: str = "en"
    mode: str = "normal"  # "normal" | "hre"
    aspect_mode: str = "crop"  # "crop" | "letterbox"
    # Subtitle styling forwarded to generate_subtitles() in normal mode.
    style_config: dict = {}
166
+
167
# Request body for PATCH /api/clips/{session_id}/{clip_index}/subtitles.
class SubtitlePatch(BaseModel):
    # Index of the subtitle event to modify, forwarded to update_subtitle_event().
    event_index: int
    updates: dict  # {text, start, end}
170
+
171
# Request body for PATCH /api/clips/{session_id}/{clip_index}/style.
class GlobalStylePatch(BaseModel):
    # Style overrides forwarded to apply_global_style_override().
    style_config: dict
173
+
174
+
175
+ # ─── Routes ───────────────────────────────────────────────────────────────
176
+
177
@app.get("/health")
async def health():
    # Liveness probe: reports a fixed "ok" status plus whatever get_device() returns.
    device = get_device()
    return dict(status="ok", device=device)
180
+
181
+
182
@app.post("/api/video-info")
async def video_info(req: VideoInfoRequest, x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key")):
    # Returns the metadata produced by get_video_info() for the requested URL.
    # Any failure of the lookup is reported to the client as HTTP 400.
    _require_access(x_demo_key)
    loop = asyncio.get_running_loop()
    try:
        # get_video_info is a plain synchronous call; run it in the default
        # executor so a slow lookup does not stall the event loop (and with it
        # every WebSocket progress stream).
        return await loop.run_in_executor(None, get_video_info, req.url)
    except Exception as e:
        raise HTTPException(400, str(e)) from e
189
+
190
+
191
# Strong references to running pipeline tasks. The event loop itself only keeps
# weak references, so a task nobody else refers to may be garbage-collected
# before it finishes.
_pipeline_tasks: set = set()


@app.post("/api/process")
async def process(
    settings_json: str = Form(...),
    file: Optional[UploadFile] = File(None),
    x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key"),
):
    """Main pipeline endpoint. Returns session_id immediately; progress via WebSocket."""
    # Error responses: 429 when MAX_CONCURRENT_JOBS jobs are already active,
    # 422 when settings_json is not valid JSON / does not validate,
    # 413 when the upload exceeds MAX_UPLOAD_MB.
    _require_access(x_demo_key)
    if len(active_jobs) >= MAX_CONCURRENT_JOBS:
        raise HTTPException(429, "GPU is busy. Please try again in a few minutes.")

    # Reject malformed settings with a client error instead of an unhandled 500.
    try:
        settings = ProcessSettings(**json.loads(settings_json))
    except (ValueError, TypeError) as exc:
        raise HTTPException(422, f"Invalid settings_json: {exc}") from exc

    session_id = str(uuid.uuid4())
    # Reserve the job slot before the first await: otherwise several requests
    # can pass the capacity check while their uploads are still being read.
    active_jobs.add(session_id)
    try:
        # Read file bytes NOW — UploadFile becomes invalid once the response is sent
        file_bytes: Optional[bytes] = None
        file_name: Optional[str] = None
        if file:
            file_name = file.filename or "upload.mp4"
            max_bytes = MAX_UPLOAD_MB * 1024 * 1024
            received = 0
            chunks = []
            # Read in 1 MiB chunks so an oversized upload is rejected as soon
            # as it crosses the limit instead of being buffered in full first.
            while chunk := await file.read(1024 * 1024):
                received += len(chunk)
                if received > max_bytes:
                    raise HTTPException(413, f"File too large. Max upload size is {MAX_UPLOAD_MB} MB.")
                chunks.append(chunk)
            file_bytes = b"".join(chunks)

        # Create on-disk and in-memory session state only once the request is
        # known to be acceptable, so rejected uploads leave nothing behind.
        session_dir = WORK_DIR / session_id
        session_dir.mkdir(parents=True, exist_ok=True)
        sessions[session_id] = {"status": "starting", "clips": []}

        task = asyncio.create_task(_run_pipeline(session_id, session_dir, settings, file_bytes, file_name))
    except BaseException:
        # The pipeline never started, so its own cleanup will not release the slot.
        active_jobs.discard(session_id)
        raise

    _pipeline_tasks.add(task)
    task.add_done_callback(_pipeline_tasks.discard)
    return {"session_id": session_id}
221
+
222
+
223
+ # ─── Pipeline ─────────────────────────────────────────────────────────────
224
+
225
async def _run_pipeline(
    session_id: str,
    session_dir: Path,
    settings: ProcessSettings,
    file_bytes: Optional[bytes],
    file_name: Optional[str],
):
    """Run the full clip-generation pipeline for one session.

    Stages: acquire video (demo / YouTube / upload) -> extract audio ->
    scene detection + transcription -> frame sampling -> vision analysis ->
    scoring and selection -> clip extraction -> subtitles or HRE.

    Progress is reported through send_progress(). The final result, or the
    error, is stored in ``sessions[session_id]``. The session is always removed
    from ``active_jobs`` when the pipeline ends. Exceptions are logged and
    recorded, not re-raised.
    """
    loop = asyncio.get_running_loop()
    frames_dir = session_dir / "frames"

    try:
        # ── 1. Acquire video ──────────────────────────────────────────────
        await send_progress(session_id, "download", 5, "Acquiring video...")

        if settings.use_demo_video and (demo_vid := _get_demo_video()):
            video_path = demo_vid
            await send_progress(session_id, "download", 30, f"Using demo video: {demo_vid.name}")
        elif settings.youtube_url:
            def pct_cb(p):
                # Invoked by the downloader; hop back onto the event loop to report.
                asyncio.run_coroutine_threadsafe(
                    send_progress(session_id, "download", max(5, int(p * 0.28)), f"Downloading {p:.0f}%"),
                    loop,
                )
            video_path = await download_video_async(
                settings.youtube_url, session_dir, session_id, pct_cb
            )
        elif file_bytes:
            suffix = Path(file_name).suffix if file_name else ".mp4"
            video_path = session_dir / f"{session_id}_input{suffix}"
            await loop.run_in_executor(None, video_path.write_bytes, file_bytes)
        else:
            raise ValueError("No video source provided")

        await send_progress(session_id, "download", 30, "Video ready")

        # ── 2. Extract audio ─────────────────────────────────────────────
        await send_progress(session_id, "audio", 32, "Extracting audio (16kHz mono)...")
        audio_path = session_dir / f"{session_id}_audio.wav"
        await loop.run_in_executor(None, lambda: extract_audio(video_path, audio_path))

        # ── 3+4. Scene detection AND Whisper transcription IN PARALLEL ───
        await send_progress(session_id, "scenes", 35, "Scene detection + Whisper transcription (parallel on AMD ROCm)...")
        device = get_device()

        scenes_future = loop.run_in_executor(None, lambda: detect_scenes(video_path))
        transcript_task = transcribe_async(
            audio_path,
            clip_language=settings.clip_language,
            subtitle_language=settings.subtitle_language,
            device=device,
        )
        scenes, transcript = await asyncio.gather(scenes_future, transcript_task)

        await send_progress(
            session_id, "transcribe", 58,
            f"Whisper: {len(transcript.get('segments', []))} segments | SceneDetect: {len(scenes)} scenes"
        )

        # Frame sampling (after scenes list is known)
        scenes_with_frames = await loop.run_in_executor(
            None, lambda: sample_frames(video_path, scenes, frames_dir)
        )

        # ── 5. Qwen2.5-VL multimodal analysis (concurrent requests to vLLM) ─
        n_scenes = len(scenes_with_frames)
        await send_progress(session_id, "vision", 58, "Ensuring AI model is running...")
        await loop.run_in_executor(
            None,
            lambda: ensure_vllm_running(
                progress_cb=lambda msg: asyncio.run_coroutine_threadsafe(
                    send_progress(session_id, "vision", 59, msg), loop
                )
            ),
        )
        await send_progress(session_id, "vision", 60, f"Qwen2.5-VL analyzing {n_scenes} scenes (vision + audio + text fusion)...")
        scenes_analyzed = await analyze_scenes_batch_async(
            scenes_with_frames,
            transcript.get("segments", []),
            channel_description=settings.channel_description,
            clip_style=settings.clip_style,
        )
        await send_progress(session_id, "vision", 76, f"Multimodal analysis complete: {n_scenes} scenes scored")

        # ── 6. Multi-signal scoring ─────────────────────────────────────
        await send_progress(session_id, "scoring", 77, "Scoring: 0.40×vision + 0.35×audio_energy + 0.25×text_keywords")
        # score_scenes is handed the audio file for per-scene energy, which is
        # blocking work; keep it off the event loop like the other heavy stages
        # so progress delivery and other requests are not stalled.
        scored = await loop.run_in_executor(
            None,
            lambda: score_scenes(scenes_analyzed, audio_path, settings.clip_style, settings.target_duration),
        )
        selected = select_top_clips(scored, settings.clip_count, settings.target_duration)

        # ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
        await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
        clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=settings.aspect_mode)

        # ── 8. Subtitles / HRE (all clips in parallel) ─────────────────
        await send_progress(session_id, "subtitles", 86, "Generating subtitles (parallel)...")

        subtitle_tasks = []
        final_clips = []
        use_hre = settings.mode == "hre"

        for clip in clips:
            if not clip.get("clip_path"):
                continue
            clip_path = Path(clip["clip_path"])
            i = clip["clip_index"]

            # Keep only the transcript segments that overlap this clip's time range.
            clip_transcript = {
                **transcript,
                "segments": [
                    s for s in transcript.get("segments", [])
                    if s["start"] < clip["end"] and s["end"] > clip["start"]
                ],
            }

            ass_path = session_dir / f"{session_id}_clip_{i:02d}.ass"
            final_path = session_dir / f"{session_id}_clip_{i:02d}_final.mp4"

            if use_hre:
                subtitle_tasks.append(loop.run_in_executor(
                    None,
                    lambda cp=clip_path, cd=clip, tr=clip_transcript, fp=final_path:
                        apply_hre(cp, cd, tr, fp)
                ))
            else:
                # Default arguments bind the per-clip values at definition time.
                def _gen_and_burn(cp=clip_path, ap=ass_path, tr=clip_transcript, cs=clip["start"], fp=final_path):
                    generate_subtitles(tr, ap, settings.style_config, clip_start_offset=cs)
                    burn_subtitles(cp, ap, fp)
                subtitle_tasks.append(loop.run_in_executor(None, _gen_and_burn))

            final_clips.append({
                "index": i,
                "start": clip["start"],
                "end": clip["end"],
                "duration": clip["end"] - clip["start"],
                "score": clip.get("final_score", 0),
                "clip_path": str(clip_path),
                "final_path": str(final_path),
                # An .ass file is written on every non-HRE run, so report it on
                # exactly those runs (previously any mode other than "normal"
                # hid a file that had actually been generated).
                "ass_path": None if use_hre else str(ass_path),
                "download_url": f"/downloads/{session_id}/{final_path.name}",
                "raw_url": f"/downloads/{session_id}/{clip_path.name}",
                "vision_analysis": clip.get("vision_analysis", {}),
                "highlight_reason": clip.get("vision_analysis", {}).get("highlight_reason", ""),
            })

        if subtitle_tasks:
            await asyncio.gather(*subtitle_tasks)

        sessions[session_id] = {"status": "done", "clips": final_clips}
        await send_progress(session_id, "done", 100, f"Done! {len(final_clips)} clips ready for download.")

    except Exception as e:
        logger.exception(f"Pipeline failed [{session_id}]")
        sessions[session_id] = {"status": "error", "error": str(e), "clips": []}
        await send_progress(session_id, "error", 0, f"Pipeline error: {e}")
    finally:
        active_jobs.discard(session_id)
380
+
381
+
382
+ # ─── Editor API ───────────────────────────────────────────────────────────
383
+
384
@app.get("/api/clips/{session_id}")
async def get_clips(session_id: str):
    # Returns the stored session record as-is; 404 when the id is unknown
    # (or the stored record is empty).
    if session := sessions.get(session_id):
        return session
    raise HTTPException(404, "Session not found")
390
+
391
+
392
@app.patch("/api/clips/{session_id}/{clip_index}/subtitles")
async def patch_subtitle(session_id: str, clip_index: int, patch: SubtitlePatch):
    # Edits one subtitle event in the clip's .ass file; 404 when the clip has none.
    clip = _get_clip_or_404(session_id, clip_index)
    subtitle_file = clip.get("ass_path")
    if not subtitle_file:
        raise HTTPException(404, "No subtitle file for this clip")
    update_subtitle_event(Path(subtitle_file), patch.event_index, patch.updates)
    return {"ok": True}
399
+
400
+
401
@app.patch("/api/clips/{session_id}/{clip_index}/style")
async def patch_global_style(session_id: str, clip_index: int, patch: GlobalStylePatch):
    # Applies a style override to the clip's .ass file; 404 when the clip has none.
    clip = _get_clip_or_404(session_id, clip_index)
    subtitle_file = clip.get("ass_path")
    if not subtitle_file:
        raise HTTPException(404, "No subtitle file for this clip")
    apply_global_style_override(Path(subtitle_file), patch.style_config)
    return {"ok": True}
408
+
409
+
410
@app.post("/api/clips/{session_id}/{clip_index}/render")
async def render_clip(session_id: str, clip_index: int):
    # Re-burns the (possibly edited) subtitles onto the raw clip and returns the
    # download URL. Without a subtitle file on disk, the existing final render
    # is returned unchanged.
    clip = _get_clip_or_404(session_id, clip_index)

    raw_clip = Path(clip["clip_path"])
    stored_ass = clip.get("ass_path")
    subtitles = Path(stored_ass) if stored_ass else None

    if subtitles is not None and subtitles.exists():
        rendered = raw_clip.parent / f"{raw_clip.stem}_edited.mp4"
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, burn_subtitles, raw_clip, subtitles, rendered)
    else:
        rendered = Path(clip["final_path"])

    url = f"/downloads/{session_id}/{rendered.name}"
    clip.update(download_url=url, final_path=str(rendered))
    return {"download_url": url}
428
+
429
+
430
def _get_clip_or_404(session_id: str, clip_index: int) -> dict:
    """Look up a clip record by session id and clip index.

    Raises HTTPException(404) when the session is unknown or has no clip whose
    ``"index"`` equals ``clip_index``.
    """
    session = sessions.get(session_id)
    if not session:
        raise HTTPException(404, "Session not found")

    for candidate in session.get("clips", []):
        if candidate["index"] == clip_index:
            return candidate

    raise HTTPException(404, f"Clip {clip_index} not found")
438
+
439
+
440
+ # ─── vLLM management endpoints ────────────────────────────────────────────────
441
+
442
@app.get("/api/vllm/status")
async def get_vllm_status():
    # Thin wrapper: exposes whatever vllm_status() reports.
    status = vllm_status()
    return status
445
+
446
+
447
@app.post("/api/vllm/stop")
async def stop_vllm(x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key")):
    # Stops vLLM from a worker thread so the event loop stays responsive.
    _require_access(x_demo_key)
    await asyncio.get_running_loop().run_in_executor(None, vllm_stop)
    response = {"ok": True, "message": "vLLM stopped — will restart automatically on next job"}
    return response
453
+
454
+
455
@app.post("/api/vllm/start")
async def start_vllm(x_demo_key: Optional[str] = Header(None, alias="X-Demo-Key")):
    # Runs ensure_vllm_running in a worker thread, then reports the resulting status.
    _require_access(x_demo_key)
    await asyncio.get_running_loop().run_in_executor(None, ensure_vllm_running)
    current = vllm_status()
    return {"ok": True, "status": current}
461
+
462
+
463
if __name__ == "__main__":
    # Direct execution: log GPU status once, then serve the app with uvicorn.
    import uvicorn

    log_gpu_status()
    server_options = {"host": "0.0.0.0", "port": 8000, "reload": False}
    uvicorn.run(app, **server_options)
backend/pyproject.toml DELETED
@@ -1,44 +0,0 @@
1
- [project]
2
- name = "elevenclip-ai-backend"
3
- version = "0.1.0"
4
- description = "FastAPI backend for ElevenClip.AI on AMD ROCm"
5
- requires-python = ">=3.11"
6
- dependencies = [
7
- "fastapi>=0.115.0",
8
- "uvicorn[standard]>=0.30.0",
9
- "pydantic>=2.8.0",
10
- "python-multipart>=0.0.9",
11
- "yt-dlp>=2025.1.15",
12
- "celery[redis]>=5.4.0",
13
- "redis>=5.0.0"
14
- ]
15
-
16
- [project.optional-dependencies]
17
- ai = [
18
- "transformers>=4.47.0",
19
- "accelerate>=1.2.0",
20
- "sentencepiece>=0.2.0",
21
- "safetensors>=0.4.5",
22
- "Pillow>=10.0.0",
23
- "qwen-vl-utils>=0.0.8"
24
- ]
25
- rocm-inference = [
26
- "vllm>=0.6.6",
27
- "optimum-amd>=0.1.0; platform_system == 'Linux'"
28
- ]
29
- dev = [
30
- "pytest>=8.3.0",
31
- "httpx>=0.27.0",
32
- "ruff>=0.6.0"
33
- ]
34
-
35
- [build-system]
36
- requires = ["setuptools>=69.0"]
37
- build-backend = "setuptools.build_meta"
38
-
39
- [tool.setuptools.packages.find]
40
- include = ["app*"]
41
-
42
- [tool.ruff]
43
- line-length = 100
44
- target-version = "py311"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/requirements.txt ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FastAPI & server
2
+ fastapi==0.115.5
3
+ uvicorn[standard]==0.32.1
4
+ python-multipart==0.0.20
5
+ websockets==14.1
6
+ aiofiles==24.1.0
7
+ httpx==0.28.1
8
+
9
+ # Video download
10
+ yt-dlp==2025.4.30
11
+
12
+ # Video processing (ffmpeg called via subprocess — no Python wrapper needed)
13
+ scenedetect[opencv]==0.6.5.2
14
+ librosa==0.10.2
15
+ soundfile==0.12.1
16
+
17
+ # AI — Whisper STT (ROCm-optimized)
18
+ # PyTorch must be installed separately with ROCm wheels:
19
+ # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
20
+ transformers==4.47.1
21
+ accelerate==1.2.1
22
+
23
+ # AI — Vision: Qwen2.5-VL via vLLM OpenAI-compatible API
24
+ # vLLM installed separately:
25
+ # pip install vllm --extra-index-url https://download.pytorch.org/whl/rocm6.2
26
+ openai==1.57.4
27
+
28
+ # Subtitles
29
+ pysubs2==1.7.3
30
+
31
+ # Utils
32
+ numpy==1.26.4
33
+ pillow==11.0.0
34
+ python-dotenv==1.0.1
35
+ pydantic==2.10.4
36
+ pydantic-settings==2.7.0
37
+ loguru==0.7.3
backend/src/__init__.py ADDED
File without changes
backend/src/analysis/__init__.py ADDED
File without changes
backend/src/analysis/highlight_scorer.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Multi-signal highlight scoring: Vision + Audio energy + Text keywords."""
2
+ import math
3
+ from pathlib import Path
4
+ from loguru import logger
5
+
6
+
7
# Style-specific keyword boosts: clip style -> keywords looked for in the transcript.
STYLE_KEYWORDS = {
    "funny": ["haha", "lol", "funny", "joke", "laugh", "omg", "what", "no way", "ตลก", "ฮา", "โอ้โห", "搞笑", "哈哈"],
    "serious": ["important", "key", "must", "critical", "สำคัญ", "ต้อง", "หลัก", "重要", "关键"],
    "educational": ["learn", "tip", "trick", "how", "why", "เรียน", "วิธี", "ทำไม", "学习", "方法", "技巧"],
    "gaming": ["win", "lose", "boss", "kill", "score", "level", "ชนะ", "แพ้", "赢", "输"],
    "entertainment": ["wow", "amazing", "incredible", "unbelievable", "เจ๋ง", "เยี่ยม", "厉害", "太棒了"],
}
# The default clip style used throughout the pipeline is spelled "entertaining",
# which had no entry here, so default requests never received a keyword score.
# Alias it to the "entertainment" list.
STYLE_KEYWORDS["entertaining"] = STYLE_KEYWORDS["entertainment"]
15
+
16
+
17
def compute_audio_energy(audio_path: Path, scenes: list[dict]) -> list[float]:
    """Compute RMS energy per scene using librosa.

    Args:
        audio_path: Audio file to load (resampled to 16 kHz mono).
        scenes: Dicts with ``"start"`` and ``"end"`` keys, multiplied by the
            sample rate to get sample offsets.

    Returns:
        One value per scene, divided by the loudest scene's RMS so the maximum
        is 1.0 (left as-is when every scene is silent). A scene with no samples
        scores 0.0. If librosa is missing or anything fails, every scene gets a
        uniform 0.5.
    """
    # Nothing to score: skip the audio load, and avoid max() on an empty list,
    # which previously surfaced as a spurious "computation failed" error log.
    if not scenes:
        return []

    try:
        import librosa
        import numpy as np

        y, sr = librosa.load(str(audio_path), sr=16000, mono=True)
        energies = []

        for scene in scenes:
            start_sample = int(scene["start"] * sr)
            end_sample = int(scene["end"] * sr)
            segment = y[start_sample:end_sample]
            if len(segment) == 0:
                energies.append(0.0)
                continue
            rms = float(np.sqrt(np.mean(segment ** 2)))
            energies.append(rms)

        # Normalize to 0-1
        max_e = max(energies)
        if max_e > 0:
            energies = [e / max_e for e in energies]

        return energies

    except ImportError:
        logger.warning("librosa not installed, using uniform audio energy")
        return [0.5] * len(scenes)
    except Exception as e:
        logger.error(f"Audio energy computation failed: {e}")
        return [0.5] * len(scenes)
49
+
50
+
51
def compute_text_score(transcript_text: str, clip_style: str) -> float:
    """Rate how strongly a transcript matches the keywords of a clip style.

    Returns 0.3 when there is no transcript or the style has no keyword list.
    Otherwise the number of distinct keywords found (substring match on the
    lower-cased text) is divided by 20% of the list size (at least 1) and
    clamped to the range 0.1-1.0.
    """
    neutral = 0.3
    if not transcript_text:
        return neutral

    haystack = transcript_text.lower()
    style_words = STYLE_KEYWORDS.get(clip_style.lower(), [])
    if not style_words:
        return neutral

    matched = len([word for word in style_words if word in haystack])
    saturation = max(len(style_words) * 0.2, 1)
    capped = min(1.0, matched / saturation)
    return max(0.1, capped)
64
+
65
+
66
def score_scenes(
    scenes_analyzed: list[dict],
    audio_path: Path,
    clip_style: str = "entertaining",
    target_duration: int = 60,
) -> list[dict]:
    """Rank analyzed scenes by a blended highlight score.

    Blend = 0.40 × vision + 0.35 × audio_energy + 0.25 × text_keywords, then
    scaled by a duration factor (between 0.5 and 1.0) that shrinks as the scene
    length moves away from ``target_duration``.

    Returns copies of the input scenes extended with ``vision_score``,
    ``audio_score``, ``text_score`` and ``final_score`` (rounded to three
    decimals), sorted best first.
    """
    energy_per_scene = compute_audio_energy(audio_path, scenes_analyzed)

    ranked = []
    for position, scene in enumerate(scenes_analyzed):
        vision = scene.get("vision_analysis", {})

        # Vision sub-score: excitement, TikTok potential and humor, weighted.
        vision_score = (
            vision.get("excitement_score", 0.5) * 0.5 +
            vision.get("tiktok_potential", 0.5) * 0.3 +
            vision.get("humor_level", 0.3) * 0.2
        )
        audio_score = energy_per_scene[position]
        text_score = compute_text_score(scene.get("transcript_text", ""), clip_style)

        blended = (
            0.40 * vision_score +
            0.35 * audio_score +
            0.25 * text_score
        )

        # Penalize very short or very long scenes relative to target
        length = scene["duration"]
        length_factor = max(
            0.5,
            1.0 - abs(length - target_duration) / max(target_duration * 2, 1),
        )

        enriched = dict(scene)
        enriched.update(
            vision_score=round(vision_score, 3),
            audio_score=round(audio_score, 3),
            text_score=round(text_score, 3),
            final_score=round(blended * length_factor, 3),
        )
        ranked.append(enriched)

    ranked = sorted(ranked, key=lambda item: item["final_score"], reverse=True)
    if ranked:
        logger.info(f"Top scene: {ranked[0]['start']:.1f}s score={ranked[0]['final_score']:.3f}")
    else:
        logger.info("No scenes")
    return ranked
117
+
118
+
119
def select_top_clips(
    scored_scenes: list[dict],
    count: int,
    target_duration: int,
    min_gap_sec: float = 30.0,
) -> list[dict]:
    """Select top-N non-overlapping clips.

    Walks ``scored_scenes`` in the given order (callers pass them best first),
    resizes each candidate to ``target_duration`` and keeps it unless it
    collides with a clip that was already chosen.

    Args:
        scored_scenes: Scene dicts with at least ``"start"`` and ``"end"``.
        count: Maximum number of clips to return.
        target_duration: Desired clip length, in the same unit as start/end.
        min_gap_sec: Minimum distance between the start times of two clips.

    Returns:
        The chosen clips, sorted by start time.
    """
    selected: list[dict] = []

    for scene in scored_scenes:
        if len(selected) >= count:
            break

        # Resize first so the collision test sees the clip's real boundaries.
        # Comparing raw start times alone let two full-length clips whose
        # starts were only slightly more than min_gap_sec apart both be chosen,
        # even though their time ranges overlapped.
        clip = _adjust_clip_duration(scene, target_duration)

        collides = any(
            (clip["start"] < chosen["end"] and clip["end"] > chosen["start"])
            or abs(clip["start"] - chosen["start"]) < min_gap_sec
            for chosen in selected
        )
        if collides:
            continue

        selected.append(clip)

    logger.info(f"Selected {len(selected)}/{count} clips")
    return sorted(selected, key=lambda c: c["start"])
152
+
153
+
154
+ def _adjust_clip_duration(scene: dict, target_sec: int) -> dict:
155
+ """Expand or shrink a scene to approximately target_sec."""
156
+ current_dur = scene["end"] - scene["start"]
157
+ if abs(current_dur - target_sec) < 5:
158
+ return scene
159
+
160
+ # Center the target window on the scene midpoint
161
+ mid = (scene["start"] + scene["end"]) / 2
162
+ half = target_sec / 2
163
+ new_start = max(0, mid - half)
164
+ new_end = new_start + target_sec
165
+
166
+ return {**scene, "start": new_start, "end": new_end, "duration": target_sec}
backend/src/analysis/scene_detector.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scene detection using PySceneDetect."""
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ from loguru import logger
5
+
6
+
7
def detect_scenes(
    video_path: Path,
    threshold: float = 27.0,
    min_scene_len_sec: float = 2.0,
) -> list[dict]:
    """Split a video into scenes with PySceneDetect's ContentDetector.

    Scenes shorter than ``min_scene_len_sec`` are dropped. When detection
    yields nothing, scenedetect is not installed, or detection raises, the
    video is cut into fixed-length intervals instead.

    Returns:
        [{"start": float, "end": float, "duration": float}, ...]
    """
    try:
        from scenedetect import open_video, SceneManager
        from scenedetect.detectors import ContentDetector

        stream = open_video(str(video_path))
        manager = SceneManager()
        manager.add_detector(ContentDetector(threshold=threshold))

        logger.info(f"Running scene detection on: {video_path.name}")
        manager.detect_scenes(stream, show_progress=False)

        bounds = (
            (begin_tc.get_seconds(), finish_tc.get_seconds())
            for begin_tc, finish_tc in manager.get_scene_list()
        )
        scenes = [
            {"start": begin, "end": finish, "duration": finish - begin}
            for begin, finish in bounds
            if finish - begin >= min_scene_len_sec
        ]

        logger.info(f"Detected {len(scenes)} scenes")
        if scenes:
            return scenes

        logger.warning("0 scenes from ContentDetector — using fixed-interval fallback")
        return _fixed_interval_scenes(video_path, interval_sec=8.0)

    except ImportError:
        logger.warning("scenedetect not installed, using fixed-interval fallback")
        return _fixed_interval_scenes(video_path, interval_sec=5.0)
    except Exception as e:
        logger.error(f"Scene detection failed: {e}")
        return _fixed_interval_scenes(video_path, interval_sec=5.0)
49
+
50
+
51
+ def _fixed_interval_scenes(video_path: Path, interval_sec: float = 5.0) -> list[dict]:
52
+ """Fallback: split video into fixed-interval scenes."""
53
+ import subprocess
54
+ result = subprocess.run(
55
+ ["ffprobe", "-v", "error", "-show_entries", "format=duration",
56
+ "-of", "default=noprint_wrappers=1:nokey=1", str(video_path)],
57
+ capture_output=True, text=True
58
+ )
59
+ try:
60
+ total = float(result.stdout.strip())
61
+ except ValueError:
62
+ total = 300.0
63
+
64
+ scenes = []
65
+ t = 0.0
66
+ while t < total:
67
+ end = min(t + interval_sec, total)
68
+ scenes.append({"start": t, "end": end, "duration": end - t})
69
+ t = end
70
+ return scenes
71
+
72
+
73
def sample_frames(
    video_path: Path,
    scenes: list[dict],
    output_dir: Path,
    frames_per_scene: int = 3,
) -> list[dict]:
    """Grab still frames from each scene with ffmpeg for vision analysis.

    Up to three timestamps per scene are used (20%, 50% and 80% of the scene),
    truncated to ``frames_per_scene``. Each frame is written to ``output_dir``
    as ``scene_XXXX_frame_J.jpg``, scaled to 640 px wide.

    Returns copies of the scenes with ``"index"`` and ``"frame_paths"`` added.
    ``frame_paths`` lists only the frames that exist on disk afterwards.
    """
    import subprocess
    output_dir.mkdir(parents=True, exist_ok=True)

    enriched = []
    for scene_no, scene in enumerate(scenes):
        # Sample frames at start, middle, end of scene
        moments = [
            scene["start"] + scene["duration"] * 0.2,
            scene["start"] + scene["duration"] / 2,
            scene["start"] + scene["duration"] * 0.8,
        ][:frames_per_scene]

        captured = []
        for frame_no, moment in enumerate(moments):
            target = output_dir / f"scene_{scene_no:04d}_frame_{frame_no}.jpg"
            subprocess.run(
                [
                    "ffmpeg", "-y", "-ss", str(moment), "-i", str(video_path),
                    "-vframes", "1", "-q:v", "2", "-vf", "scale=640:-1",
                    str(target)
                ],
                capture_output=True,
            )
            # ffmpeg's exit status is not checked; a frame counts only if the file exists.
            if target.exists():
                captured.append(str(target))

        enriched.append({**scene, "index": scene_no, "frame_paths": captured})

    return enriched
backend/src/analysis/vision.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Qwen2.5-VL multimodal scene analysis via vLLM OpenAI-compatible API.
2
+
3
+ Sends video frames + transcript text together (true multimodal fusion).
4
+ Outputs: excitement_score, face_bbox, action_type, humor_level, emotion.
5
+ All scenes analyzed concurrently — vLLM handles GPU batching internally.
6
+ """
7
+ import asyncio
8
+ import base64
9
+ import json
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Optional
13
+ from loguru import logger
14
+
15
# vLLM connection settings; each one can be overridden through the environment.
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1")
VLLM_MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "EMPTY")

# Prompt template for per-scene analysis. Filled with str.format(), so the
# literal braces of the JSON schema are doubled.
ANALYSIS_PROMPT = """You are a TikTok content expert analyzing a livestream segment for highlight potential.
Analyze the provided video frames and transcript text together as a unified multimodal signal.

Respond ONLY with valid JSON matching this exact schema — no markdown, no explanation:
{{
"excitement_score": <0.0-1.0>,
"humor_level": <0.0-1.0>,
"emotion": "<neutral|happy|surprised|angry|sad|excited|funny>",
"action_type": "<talking|gaming|reaction|tutorial|entertainment|sports|other>",
"has_face": <true|false>,
"face_bbox": [<x1_pct>, <y1_pct>, <x2_pct>, <y2_pct>] or null,
"highlight_reason": "<one sentence: why this IS or isn't a good TikTok highlight>",
"tiktok_potential": <0.0-1.0>
}}

Channel context: {channel_description}
Requested clip style: {clip_style}
"""
37
+
38
+
39
+ def _encode_image(image_path: str) -> str:
40
+ with open(image_path, "rb") as f:
41
+ return base64.b64encode(f.read()).decode("utf-8")
42
+
43
+
44
def analyze_scene(
    scene: dict,
    transcript_text: str = "",
    channel_description: str = "",
    clip_style: str = "entertaining",
) -> dict:
    """Score one scene with the vision model behind the vLLM endpoint.

    Up to three of the scene's frames that exist on disk are sent as base64
    JPEG data URLs together with the analysis prompt (plus the transcript when
    it is non-blank). The reply is parsed as JSON after an optional markdown
    code fence is removed.

    Returns the parsed analysis, or the default analysis when the scene has no
    usable frames or anything goes wrong (failures are logged as warnings).
    """
    try:
        from openai import OpenAI

        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)

        candidates = scene.get("frame_paths", [])
        if not candidates:
            return _default_analysis()

        # Add up to 3 frames as base64 images
        content = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{_encode_image(path)}"},
            }
            for path in candidates[:3]
            if Path(path).exists()
        ]
        if not content:
            return _default_analysis()

        prompt = ANALYSIS_PROMPT.format(
            channel_description=channel_description or "General content creator",
            clip_style=clip_style,
        )
        spoken = transcript_text.strip()
        if spoken:
            prompt += f"\n\nTranscript for this segment:\n\"{spoken}\""

        content.append({"type": "text", "text": prompt})

        response = client.chat.completions.create(
            model=VLLM_MODEL,
            messages=[{"role": "user", "content": content}],
            max_tokens=300,
            temperature=0.1,
        )

        reply = response.choices[0].message.content.strip()
        # Strip markdown code fences if present
        if reply.startswith("```"):
            pieces = reply.split("```")
            if len(pieces) > 1:
                reply = pieces[1]
            if reply.startswith("json"):
                reply = reply[4:]

        analysis = json.loads(reply.strip())
        logger.debug(
            f"Scene [{scene['start']:.1f}s-{scene['end']:.1f}s]: "
            f"excitement={analysis.get('excitement_score', 0):.2f} "
            f"tiktok={analysis.get('tiktok_potential', 0):.2f} | "
            f"{analysis.get('highlight_reason', '')[:60]}"
        )

        # Best effort: failures of vllm_touch must not fail the analysis.
        try:
            from src.gpu.vllm_manager import vllm_touch
            vllm_touch()
        except Exception:
            pass

        return analysis

    except Exception as e:
        logger.warning(f"Vision analysis failed at {scene.get('start', 0):.1f}s: {e}")
        return _default_analysis()
118
+
119
+
120
async def analyze_scenes_batch_async(
    scenes_with_frames: list[dict],
    transcript_segments: list[dict],
    channel_description: str = "",
    clip_style: str = "entertaining",
) -> list[dict]:
    """Analyze every scene concurrently and attach the results.

    Each scene is analyzed by ``analyze_scene`` in the event loop's default
    executor, and all scenes are awaited together. The transcript passed for a
    scene is the space-joined text of the segments overlapping its time range.

    Returns copies of the scenes, in input order, with ``vision_analysis`` and
    ``transcript_text`` added.
    """
    loop = asyncio.get_running_loop()

    def _overlapping_text(scene: dict) -> str:
        # Segments whose time range intersects the scene's time range.
        pieces = [
            seg["text"]
            for seg in transcript_segments
            if seg["start"] < scene["end"] and seg["end"] > scene["start"]
        ]
        return " ".join(pieces)

    async def _analyze_one(scene: dict) -> dict:
        text = _overlapping_text(scene)
        verdict = await loop.run_in_executor(
            None, analyze_scene, scene, text, channel_description, clip_style
        )
        return {**scene, "vision_analysis": verdict, "transcript_text": text}

    pending = [_analyze_one(scene) for scene in scenes_with_frames]
    results = await asyncio.gather(*pending)
    logger.info(f"Vision analysis complete: {len(results)} scenes")
    return list(results)
148
+
149
+
150
+ def _default_analysis() -> dict:
151
+ """Fallback analysis when vLLM is unavailable (keeps pipeline running)."""
152
+ return {
153
+ "excitement_score": 0.5,
154
+ "humor_level": 0.3,
155
+ "emotion": "neutral",
156
+ "action_type": "talking",
157
+ "has_face": False,
158
+ "face_bbox": None,
159
+ "highlight_reason": "Vision model unavailable — using audio+text signals only",
160
+ "tiktok_potential": 0.4,
161
+ }
162
+
163
+
164
# Prompt template for per-segment high-retention-editing decisions. Filled with
# str.format(seg_idx=..., n_total=..., context=...), so the literal braces of the
# JSON schema are doubled.
HRE_SEGMENT_PROMPT = """Analyze this video frame for high-retention TikTok editing decisions.

Segment {seg_idx} of {n_total}. Transcript: "{context}"

Respond ONLY with valid JSON — no markdown:
{{
"zoom_direction": "<in|out|hold>",
"zoom_speed": "<fast|slow>",
"face_detected": <true|false>,
"face_cx": <0.0-1.0>,
"face_cy": <0.0-1.0>,
"subtitle_position": "<top|bottom>",
"subtitle_color": "<white|yellow|cyan|orange|green>",
"energy_level": "<high|medium|low>",
"moment_type": "<hook|punchline|context|reaction|transition>"
}}

Rules:
- seg_idx==0: always zoom_direction=in, zoom_speed=fast (hook the viewer)
- zoom IN fast: punchlines, reactions, peak energy
- zoom IN slow: context, buildup, moderate energy
- zoom OUT: reveals, breathing room after intensity
- HOLD: stable content, text-heavy moments
- subtitle TOP: face is in bottom half → put text at top
- subtitle BOTTOM: face is in top half → text at bottom
- face_cx/face_cy: face center as 0.0-1.0 fraction of frame
"""
191
+
192
+
193
def analyze_frame_for_hre(
    frame_path: "Path",
    context: str = "",
    seg_idx: int = 0,
    n_total: int = 1,
) -> dict:
    """Ask the vision model for per-segment editing decisions on one frame.

    Args:
        frame_path: JPEG frame to analyse.
        context: Transcript text for the segment (truncated to 200 chars).
        seg_idx: Index of this segment within the clip.
        n_total: Total number of segments in the clip.

    Returns:
        The JSON object parsed from the model reply, or the result of
        ``_default_hre_analysis(seg_idx, n_total)`` when the frame is missing
        or anything in the request/parse path raises.
    """
    try:
        from openai import OpenAI

        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
        if not Path(frame_path).exists():
            return _default_hre_analysis(seg_idx, n_total)

        encoded = _encode_image(str(frame_path))
        prompt = HRE_SEGMENT_PROMPT.format(
            seg_idx=seg_idx, n_total=n_total, context=context[:200]
        )
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }
        text_part = {"type": "text", "text": prompt}
        response = client.chat.completions.create(
            model=VLLM_MODEL,
            messages=[{"role": "user", "content": [image_part, text_part]}],
            max_tokens=200,
            temperature=0.1,
        )

        raw = response.choices[0].message.content.strip()
        # Unwrap a markdown code fence (optionally tagged "json") if the
        # model added one despite the prompt.
        if raw.startswith("```"):
            fenced = raw.split("```")
            if len(fenced) > 1:
                raw = fenced[1]
            if raw.startswith("json"):
                raw = raw[4:]

        analysis = json.loads(raw.strip())
        logger.debug(
            f"HRE seg {seg_idx}/{n_total}: "
            f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
            f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_color')} "
            f"type={analysis.get('moment_type')}"
        )

        # Best-effort idle-timer reset; never let it affect the result.
        try:
            from src.gpu.vllm_manager import vllm_touch
            vllm_touch()
        except Exception:
            pass
        return analysis

    except Exception as e:
        logger.warning(f"HRE frame analysis failed (seg {seg_idx}): {e}")
        return _default_hre_analysis(seg_idx, n_total)
247
+
248
+
249
+ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
250
+ """Fallback with varied decisions based on position in clip."""
251
+ if seg_idx == 0:
252
+ zoom_dir, zoom_speed, moment = "in", "fast", "hook"
253
+ elif seg_idx == n_total - 1:
254
+ zoom_dir, zoom_speed, moment = "out", "slow", "transition"
255
+ elif seg_idx % 3 == 1:
256
+ zoom_dir, zoom_speed, moment = "hold", "slow", "context"
257
+ else:
258
+ zoom_dir, zoom_speed, moment = "in", "slow", "reaction"
259
+
260
+ _colors = ["yellow", "white", "cyan", "orange", "white", "yellow"]
261
+ _positions = ["bottom", "top", "bottom", "top", "bottom", "top"]
262
+
263
+ return {
264
+ "zoom_direction": zoom_dir,
265
+ "zoom_speed": zoom_speed,
266
+ "face_detected": False,
267
+ "face_cx": 0.5,
268
+ "face_cy": 0.38,
269
+ "subtitle_position": _positions[seg_idx % len(_positions)],
270
+ "subtitle_color": _colors[seg_idx % len(_colors)],
271
+ "energy_level": "medium",
272
+ "moment_type": moment,
273
+ }
274
+
275
+
276
def get_emoji_for_scene(scene_text: str, emotion: str, action_type: str) -> str:
    """Pick a single emoji for a scene, preferring the model's suggestion.

    The configured vLLM model is asked (text-only) for one emoji. If the call
    fails, or the reply is empty or longer than four characters, a static
    lookup keyed by ``emotion`` and then ``action_type`` is used instead.

    Args:
        scene_text: Transcript text for the scene (first 200 chars are sent).
        emotion: Emotion label from the vision analysis.
        action_type: Action/category label from the vision analysis.

    Returns:
        A short emoji string; never empty ("⚡" is the last-resort default).
    """
    try:
        from openai import OpenAI
        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)

        response = client.chat.completions.create(
            model=VLLM_MODEL,
            messages=[{"role": "user", "content": (
                f"Select ONE emoji for this TikTok moment.\n"
                f"Emotion: {emotion}\nAction: {action_type}\n"
                f"Text: \"{scene_text[:200]}\"\n"
                f"Reply with ONLY the emoji character, nothing else."
            )}],
            max_tokens=5,
            temperature=0.3,
        )
        emoji = (response.choices[0].message.content or "").strip()
        # Reject empty replies as well as over-long ones: an empty string
        # would otherwise be returned as the "emoji".
        if emoji and len(emoji) <= 4:
            return emoji
    except Exception:
        # Deliberate best-effort: any model/transport problem falls through
        # to the static table below.
        pass

    emoji_map = {
        "happy": "😄", "excited": "🔥", "funny": "😂",
        "surprised": "😲", "angry": "😤", "sad": "😢",
        "neutral": "💡", "gaming": "🎮", "tutorial": "📚",
        "entertainment": "✨", "reaction": "😱",
    }
    return emoji_map.get(emotion) or emoji_map.get(action_type, "⚡")
backend/src/gpu/__init__.py ADDED
File without changes
backend/src/gpu/rocm_utils.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """AMD ROCm device management and monitoring."""
2
+ import os
3
+ import subprocess
4
+ from loguru import logger
5
+
6
+
7
def get_device() -> str:
    """Return the PyTorch device string to use: ``"cuda"`` or ``"cpu"``.

    ROCm builds of PyTorch expose AMD GPUs under the ``cuda`` device name, so
    ``"cuda"`` is returned for them as well. Any failure while importing or
    probing torch (missing package, broken driver/runtime) results in the CPU
    fallback rather than an exception, matching the other helpers here.
    """
    try:
        import torch
        if torch.cuda.is_available():
            device_name = torch.cuda.get_device_name(0)
            logger.info(f"GPU detected: {device_name}")
            return "cuda"
    except Exception as exc:
        # Covers ImportError plus runtime errors raised by a half-working
        # GPU stack; the caller only needs to know a GPU is unusable.
        logger.debug(f"GPU probe failed: {exc}")
    logger.warning("No GPU available, falling back to CPU")
    return "cpu"
19
+
20
+
21
def get_vram_gb() -> float:
    """Return the total memory of GPU 0 in GiB, rounded to one decimal.

    Note this is the device's total memory, not the currently free amount.
    Returns 0.0 when torch is unavailable, no GPU is present, or the probe
    raises.
    """
    vram = 0.0
    try:
        import torch
        if torch.cuda.is_available():
            props = torch.cuda.get_device_properties(0)
            vram = round(props.total_memory / 1024**3, 1)
    except Exception:
        vram = 0.0
    return vram
31
+
32
+
33
def get_gpu_utilization() -> dict:
    """Collect GPU utilisation stats, preferring ``rocm-smi``.

    The first data row of ``rocm-smi --showuse --showmemuse --csv`` is
    returned as a ``{header: value}`` dict of strings. If rocm-smi is missing,
    times out, fails, or prints fewer than two lines, PyTorch memory counters
    for device 0 are returned instead. An empty dict means nothing was
    available.
    """
    smi_cmd = ["rocm-smi", "--showuse", "--showmemuse", "--csv"]
    try:
        smi = subprocess.run(smi_cmd, capture_output=True, text=True, timeout=5)
    except (FileNotFoundError, subprocess.TimeoutExpired):
        smi = None

    if smi is not None and smi.returncode == 0:
        rows = smi.stdout.strip().split("\n")
        if len(rows) >= 2:
            return dict(zip(rows[0].split(","), rows[1].split(",")))

    # Fallback: PyTorch memory stats
    try:
        import torch
        if not torch.cuda.is_available():
            return {}
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        pct = round(allocated / total * 100, 1) if total > 0 else 0
        return {
            "vram_used_gb": round(allocated, 2),
            "vram_reserved_gb": round(reserved, 2),
            "vram_total_gb": round(total, 2),
            "vram_pct": pct,
        }
    except Exception:
        return {}
65
+
66
+
67
def get_optimal_batch_size(model_type: str = "whisper") -> int:
    """Map the GPU's memory size to a batch size for ``model_type``.

    Thresholds are in the same GiB units returned by ``get_vram_gb``.
    Unknown model types always get a batch size of 1.
    """
    vram = get_vram_gb()

    # (minimum VRAM, batch size) pairs, checked from largest to smallest.
    tiers = {
        "whisper": ((48, 32), (24, 16), (16, 8)),
        "vision": ((80, 8), (48, 4)),
    }
    floors = {"whisper": 4, "vision": 1}

    if model_type not in tiers:
        return 1
    for minimum, batch in tiers[model_type]:
        if vram >= minimum:
            return batch
    return floors[model_type]
85
+
86
+
87
def log_gpu_status():
    """Log GPU utilisation stats, or the device and VRAM size if none exist."""
    stats = get_gpu_utilization()
    if not stats:
        logger.info(f"GPU: {get_device()} | VRAM: {get_vram_gb()} GB")
        return
    logger.info(f"GPU stats: {stats}")
backend/src/gpu/vllm_manager.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """On-demand vLLM process manager.
2
+
3
+ Starts vLLM when first needed, shuts it down after idle.
4
+ Set VLLM_ON_DEMAND=false to use an externally managed vLLM instead.
5
+ Set VLLM_IDLE_TIMEOUT=300 (seconds) to control the idle shutdown window.
6
+ """
7
+ import os
8
+ import subprocess
9
+ import threading
10
+ import time
11
+
12
+ import requests
13
+ from loguru import logger
14
+
15
+ VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
16
+ VLLM_PORT = int(os.getenv("VLLM_PORT", "8000"))
17
+ IDLE_TIMEOUT = int(os.getenv("VLLM_IDLE_TIMEOUT", "300")) # 5 min default
18
+ ON_DEMAND = os.getenv("VLLM_ON_DEMAND", "true").lower() == "true"
19
+ DOCKER_CONTAINER = os.getenv("VLLM_DOCKER_CONTAINER", "rocm") # container that has vllm installed
20
+
21
+
22
class _VLLMManager:
    """Starts a vLLM server on first use and stops it after an idle period.

    Two launch modes exist, selected by the module constant
    ``DOCKER_CONTAINER``: when it is non-empty the server is started inside
    that container via ``docker exec``; otherwise it is spawned as a child
    process of this interpreter. With ``ON_DEMAND`` false the manager never
    starts or stops anything and only reports health.

    A daemon watchdog thread wakes every 60 s and stops the server once
    ``IDLE_TIMEOUT`` seconds have passed since the last ``touch()``.

    ``ensure_running`` holds the internal lock for the whole startup wait
    (up to 200 s); ``stop`` and, in subprocess mode, ``is_running`` block on
    that lock meanwhile.
    """

    def __init__(self):
        self._proc: subprocess.Popen | None = None
        # Temp file receiving the child's stderr in subprocess mode. A file
        # is used instead of a pipe because nobody drains a pipe while the
        # server runs, and a full pipe would block the server.
        self._stderr_log = None
        self._lock = threading.Lock()
        self._last_used = 0.0
        threading.Thread(target=self._watchdog, daemon=True, name="vllm-watchdog").start()

    # ── Public ────────────────────────────────────────────────────────────

    def is_running(self) -> bool:
        """Return True when the server answers its health endpoint."""
        if not ON_DEMAND or DOCKER_CONTAINER:
            # Docker mode or external vLLM: rely solely on health endpoint
            return self._check_health()
        with self._lock:
            if self._proc is None or self._proc.poll() is not None:
                return False
        return self._check_health()

    def ensure_running(self, progress_cb=None) -> None:
        """Start vLLM if it is not healthy; block until it is.

        Args:
            progress_cb: Optional callable receiving human-readable status
                strings while the model loads.

        Raises:
            RuntimeError: the server exited during startup, could not be
                launched, or did not become healthy within 200 s.
        """
        if not ON_DEMAND:
            return
        with self._lock:
            if self._check_health():
                self._last_used = time.time()
                return
            self._start(progress_cb)

    def stop(self) -> None:
        """Stop the managed server (no-op when ON_DEMAND is disabled)."""
        if not ON_DEMAND:
            return
        with self._lock:
            self._stop_locked()

    def touch(self) -> None:
        """Reset idle timer — call after each successful vLLM API call."""
        self._last_used = time.time()

    def status(self) -> dict:
        """Return a snapshot of health, idle time and configuration."""
        running = self.is_running()
        idle = round(time.time() - self._last_used, 1) if self._last_used else None
        return {
            "running": running,
            "on_demand": ON_DEMAND,
            "idle_seconds": idle,
            "idle_timeout": IDLE_TIMEOUT,
            "model": VLLM_MODEL,
        }

    # ── Internal ──────────────────────────────────────────────────────────

    def _health_url(self) -> str:
        return f"http://localhost:{VLLM_PORT}/health"

    def _check_health(self) -> bool:
        """True when GET /health returns 200 within 2 s; False on any error."""
        try:
            return requests.get(self._health_url(), timeout=2).status_code == 200
        except Exception:
            return False

    def _start(self, progress_cb=None) -> None:
        """Launch the server using the configured mode. Caller holds the lock."""
        logger.info("vLLM: starting on demand…")
        if progress_cb:
            progress_cb("Starting AI model (Qwen2.5-VL)… ~2 min first time")

        # Try Docker container first (vLLM may only be installed inside a container)
        if DOCKER_CONTAINER:
            self._start_via_docker(progress_cb)
        else:
            self._start_via_subprocess(progress_cb)

    def _wait_until_healthy(self, progress_cb, label: str) -> bool:
        """Poll the health endpoint every 5 s for up to 200 s.

        Returns True once healthy, False on timeout. Raises RuntimeError if a
        locally spawned process exits while waiting.
        """
        deadline = time.time() + 200
        tick = 0
        while time.time() < deadline:
            time.sleep(5)
            tick += 1
            if self._proc is not None and self._proc.poll() is not None:
                raise RuntimeError(f"vLLM exited during startup: {self._stderr_tail()}")
            if self._check_health():
                self._last_used = time.time()
                logger.info(f"{label} ready after {tick * 5}s")
                return True
            if progress_cb and tick % 6 == 0:
                progress_cb(f"AI model loading… {tick * 5}s")
        return False

    def _start_via_docker(self, progress_cb=None) -> None:
        """Start vLLM inside an existing Docker container via docker exec."""
        import shlex

        # The model name is quoted because the command goes through bash -c.
        cmd = (
            f"vllm serve {shlex.quote(VLLM_MODEL)} "
            f"--host 0.0.0.0 --port {VLLM_PORT} "
            f"--gpu-memory-utilization 0.85 --max-model-len 4096 "
            f"> /tmp/vllm_server.log 2>&1"
        )
        # `docker exec -d` returns as soon as the command is detached, so run
        # it to completion: that reaps the docker CLI process and surfaces a
        # launch failure immediately instead of after the 200 s wait.
        try:
            launch = subprocess.run(
                ["docker", "exec", "-d", DOCKER_CONTAINER, "bash", "-c", cmd],
                capture_output=True, text=True, timeout=30,
            )
        except subprocess.TimeoutExpired as exc:
            raise RuntimeError("vLLM docker exec timed out") from exc
        if launch.returncode != 0:
            raise RuntimeError(f"vLLM docker exec failed: {launch.stderr.strip()[-600:]}")
        self._proc = None  # process lives inside container, tracked by health check

        if not self._wait_until_healthy(progress_cb, "vLLM (docker)"):
            raise RuntimeError("vLLM did not start within 200s")

    def _start_via_subprocess(self, progress_cb=None) -> None:
        """Start vLLM as a direct subprocess (vllm must be in current Python env)."""
        import sys
        import tempfile

        self._stderr_log = tempfile.TemporaryFile()
        # NOTE(review): the docker path does not pass --device; confirm the
        # installed vLLM version accepts "--device rocm".
        self._proc = subprocess.Popen(
            [
                sys.executable, "-m", "vllm.entrypoints.openai.api_server",
                "--model", VLLM_MODEL,
                "--device", "rocm",
                "--port", str(VLLM_PORT),
                "--gpu-memory-utilization", "0.85",
                "--max-model-len", "4096",
            ],
            stdout=subprocess.DEVNULL,
            stderr=self._stderr_log,
        )

        try:
            ready = self._wait_until_healthy(progress_cb, "vLLM")
        except Exception:
            self._terminate_proc()
            raise
        if not ready:
            # Do not leave a half-started server holding GPU memory.
            self._terminate_proc()
            raise RuntimeError("vLLM did not start within 200s")

    def _stderr_tail(self, limit: int = 600) -> str:
        """Return up to ``limit`` trailing characters of the child's stderr.

        Only call this after the child has exited: the file offset is shared
        with the child process.
        """
        log = self._stderr_log
        if log is None:
            return ""
        try:
            size = log.seek(0, os.SEEK_END)
            log.seek(max(0, size - 4 * limit))
            return log.read().decode(errors="replace")[-limit:]
        except Exception:
            return ""

    def _terminate_proc(self) -> None:
        """Terminate the local child process (if any) and release its log."""
        proc, self._proc = self._proc, None
        if proc is not None and proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
        if self._stderr_log is not None:
            self._stderr_log.close()
            self._stderr_log = None

    def _stop_locked(self) -> None:
        """Stop the server. Caller must hold ``self._lock``."""
        if DOCKER_CONTAINER:
            subprocess.run(
                ["docker", "exec", DOCKER_CONTAINER, "pkill", "-f", "vllm"],
                capture_output=True,
            )
            self._proc = None
        else:
            self._terminate_proc()
        logger.info("vLLM stopped")

    def _watchdog(self) -> None:
        """Background loop: stop the server after IDLE_TIMEOUT seconds idle."""
        while True:
            time.sleep(60)
            if not ON_DEMAND or IDLE_TIMEOUT <= 0:
                continue
            try:
                with self._lock:
                    if self._last_used <= 0:
                        continue
                    if time.time() - self._last_used <= IDLE_TIMEOUT:
                        continue
                    if DOCKER_CONTAINER:
                        # No local process handle exists in docker mode, so
                        # liveness comes from the health endpoint.
                        alive = self._check_health()
                    else:
                        alive = self._proc is not None and self._proc.poll() is None
                    if alive:
                        logger.info(
                            f"vLLM idle {IDLE_TIMEOUT}s → shutting down to save GPU credits"
                        )
                        self._stop_locked()
            except Exception as exc:
                # Keep the watchdog alive; a dead thread would silently
                # disable idle shutdown for the rest of the process.
                logger.warning(f"vLLM watchdog error: {exc}")
184
+
185
+
186
+ _manager = _VLLMManager()
187
+
188
+
189
+ # ── Module-level helpers ──────────────────────────────────────────────────────
190
+
191
def ensure_vllm_running(progress_cb=None) -> None:
    """Delegate to the shared manager's ``ensure_running``."""
    _manager.ensure_running(progress_cb)


def vllm_touch() -> None:
    """Reset the shared manager's idle timer."""
    _manager.touch()


def vllm_stop() -> None:
    """Ask the shared manager to stop the server."""
    _manager.stop()


def vllm_is_running() -> bool:
    """Report whether the shared manager sees a healthy server."""
    running = _manager.is_running()
    return running


def vllm_status() -> dict:
    """Return the shared manager's status snapshot."""
    snapshot = _manager.status()
    return snapshot
backend/src/ingestion/__init__.py ADDED
File without changes
backend/src/ingestion/uploader.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Handle file upload from user."""
2
+ import shutil
3
+ from pathlib import Path
4
+ from fastapi import UploadFile
5
+ from loguru import logger
6
+
7
+ ALLOWED_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v"}
8
+ MAX_SIZE_BYTES = 2 * 1024 * 1024 * 1024 # 2 GB
9
+
10
+
11
async def save_upload(
    file: UploadFile,
    output_dir: Path,
    session_id: str,
) -> Path:
    """Stream an uploaded video to ``output_dir`` and return its path.

    The file is written as ``{session_id}_input{suffix}`` in 1 MB chunks.

    Args:
        file: The incoming upload; its filename extension selects the suffix
            (``.mp4`` is assumed when no filename is given).
        output_dir: Destination directory, created if missing.
        session_id: Prefix used for the stored file name.

    Raises:
        ValueError: the extension is not in ``ALLOWED_EXTENSIONS`` or the
            upload exceeds ``MAX_SIZE_BYTES``.

    If writing fails for any reason (including the size limit or task
    cancellation) the partially written file is removed before the error
    propagates.
    """
    suffix = Path(file.filename or "video.mp4").suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {suffix}. Allowed: {ALLOWED_EXTENSIONS}")

    output_dir.mkdir(parents=True, exist_ok=True)
    dest = output_dir / f"{session_id}_input{suffix}"

    size = 0
    try:
        with open(dest, "wb") as f:
            while chunk := await file.read(1024 * 1024):  # 1MB chunks
                size += len(chunk)
                if size > MAX_SIZE_BYTES:
                    raise ValueError("File too large (max 2 GB)")
                f.write(chunk)
    except BaseException:
        # BaseException so that asyncio cancellation also cleans up. The
        # unlink happens after the `with` block has closed the handle.
        dest.unlink(missing_ok=True)
        raise

    logger.info(f"Saved upload: {dest} ({size / 1024 / 1024:.1f} MB)")
    return dest
backend/src/ingestion/youtube.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """YouTube video downloader using yt-dlp."""
2
+ import asyncio
3
+ import subprocess
4
+ from pathlib import Path
5
+ from typing import Optional, Callable
6
+ import yt_dlp
7
+ from loguru import logger
8
+
9
+
10
+ def _progress_hook(callback: Optional[Callable] = None):
11
+ def hook(d: dict):
12
+ if d["status"] == "downloading" and callback:
13
+ pct = d.get("_percent_str", "0%").strip().replace("%", "")
14
+ try:
15
+ callback(float(pct))
16
+ except ValueError:
17
+ pass
18
+ return hook
19
+
20
+
21
def download_video(
    url: str,
    output_dir: Path,
    session_id: str,
    progress_callback: Optional[Callable] = None,
    max_height: int = 1080,
) -> Path:
    """Download video from YouTube (or any yt-dlp-supported site).

    Args:
        url: Page URL to download.
        output_dir: Directory for the result, created if missing.
        session_id: Prefix of the output file (``{session_id}_input.<ext>``).
        progress_callback: Optional callable receiving percent complete.
        max_height: Upper bound on video height for format selection.

    Returns:
        Path to the downloaded file (MP4 when merging/conversion succeeded),
        transcoded to h264 if the stream turned out to be AV1.

    Raises:
        FileNotFoundError: yt-dlp finished without leaving an output file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_template = str(output_dir / f"{session_id}_input.%(ext)s")

    ydl_opts = {
        # Prefer h264 (avc1) streams; AV1 is only a last resort.
        "format": (
            f"bestvideo[vcodec^=avc1][height<={max_height}]+bestaudio/"
            f"bestvideo[vcodec^=avc][height<={max_height}]+bestaudio/"
            f"bestvideo[vcodec!^=av01][height<={max_height}]+bestaudio/"
            f"best[height<={max_height}]/best"
        ),
        "format_sort": ["vcodec:h264"],
        "outtmpl": output_template,
        "merge_output_format": "mp4",
        "quiet": True,
        "no_warnings": True,
        "progress_hooks": [_progress_hook(progress_callback)],
        "postprocessors": [{
            "key": "FFmpegVideoConvertor",
            "preferedformat": "mp4",
        }],
        # Use iOS/Android clients to bypass datacenter IP bot-detection
        "extractor_args": {
            "youtube": {
                "player_client": ["ios", "android", "tv_embedded"],
            }
        },
    }
    _inject_cookies(ydl_opts)

    logger.info(f"Downloading: {url}")
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True) or {}
        title = info.get("title", "video")
        duration = info.get("duration", 0)
        logger.info(f"Downloaded: '{title}' ({duration}s)")

    output_path = output_dir / f"{session_id}_input.mp4"
    if not output_path.exists():
        # Fall back to whatever container was produced, but never to
        # yt-dlp's temporary/partial files.
        leftovers = {".part", ".ytdl"}
        candidates = sorted(
            p for p in output_dir.glob(f"{session_id}_input.*")
            if p.is_file() and p.suffix.lower() not in leftovers
        )
        if not candidates:
            raise FileNotFoundError(
                f"Download produced no output file for session {session_id}"
            )
        output_path = candidates[0]

    # Safety: transcode AV1 → h264 if yt-dlp still picked it
    output_path = _ensure_h264(output_path)
    return output_path
77
+
78
+
79
def _ensure_h264(video_path: Path) -> Path:
    """Return an h264 version of ``video_path`` when its video codec is AV1.

    ffprobe reports the codec of the first video stream. Anything other than
    AV1 is returned unchanged. AV1 input is re-encoded with libx264/AAC into
    ``<stem>_h264.mp4`` next to the original; if that fails the original path
    is returned and the error is logged.
    """
    probe_cmd = [
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_entries", "stream=codec_name", "-of", "csv=p=0", str(video_path),
    ]
    detected = subprocess.run(probe_cmd, capture_output=True, text=True).stdout.strip().lower()
    if detected not in ("av1", "av01"):
        return video_path

    logger.warning(f"AV1 detected ({video_path.name}), transcoding to h264...")
    target = video_path.with_name(video_path.stem + "_h264.mp4")
    transcode_cmd = [
        "ffmpeg", "-y", "-i", str(video_path), "-c:v", "libx264", "-preset", "fast",
        "-crf", "23", "-c:a", "aac", "-b:a", "128k", str(target),
    ]
    transcode = subprocess.run(transcode_cmd, capture_output=True, text=True)
    if transcode.returncode != 0:
        logger.error(f"Transcode failed: {transcode.stderr[-200:]}")
        return video_path

    logger.info(f"Transcoded to h264: {target.name}")
    return target
102
+
103
+
104
+ _COOKIES_PATH = Path("/root/cookies.txt")
105
+
106
+
107
def _inject_cookies(opts: dict) -> None:
    """Set ``opts["cookiefile"]`` in place when the cookies file exists.

    ``opts`` is left untouched if ``_COOKIES_PATH`` is absent.
    """
    if not _COOKIES_PATH.exists():
        return
    opts["cookiefile"] = str(_COOKIES_PATH)
    logger.debug(f"Using cookies: {_COOKIES_PATH}")
112
+
113
+
114
def get_video_info(url: str) -> dict:
    """Return basic metadata for ``url`` without downloading the media.

    Returns:
        Dict with ``title``, ``duration``, ``thumbnail``, ``channel``,
        ``view_count`` and ``description`` (first 500 characters). Missing
        fields get empty/zero defaults.
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "extractor_args": {
            "youtube": {"player_client": ["ios", "android", "tv_embedded"]}
        },
    }
    _inject_cookies(ydl_opts)
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False) or {}
        return {
            "title": info.get("title", ""),
            "duration": info.get("duration", 0),
            "thumbnail": info.get("thumbnail", ""),
            "channel": info.get("channel", ""),
            "view_count": info.get("view_count", 0),
            # The key can be present with a None value, which cannot be
            # sliced; treat that the same as a missing description.
            "description": (info.get("description") or "")[:500],
        }
135
+
136
+
137
async def download_video_async(
    url: str,
    output_dir: Path,
    session_id: str,
    progress_callback: Optional[Callable] = None,
    max_height: int = 1080,
) -> Path:
    """Run ``download_video`` on the default executor and await its result.

    Arguments mirror ``download_video``; ``max_height`` defaults to the same
    value so existing callers are unaffected.
    """
    # get_running_loop() is the supported call inside a coroutine and matches
    # the usage elsewhere in this project.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        lambda: download_video(url, output_dir, session_id, progress_callback, max_height),
    )
backend/src/processing/__init__.py ADDED
File without changes
backend/src/processing/clip_extractor.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Extract video clips using ffmpeg-python."""
2
+ import asyncio
3
+ import subprocess
4
+ from pathlib import Path
5
+ from loguru import logger
6
+
7
+
8
def extract_clip(
    video_path: Path,
    start: float,
    end: float,
    output_path: Path,
    use_hw_encode: bool = True,
    vertical: bool = True,
    face_bbox: list = None,
    **kwargs,
) -> Path:
    """Cut ``[start, end]`` from a video, optionally as 9:16 (1080x1920).

    Args:
        video_path: Source video.
        start: Clip start in seconds.
        end: Clip end in seconds.
        output_path: Destination file; parent directories are created.
        use_hw_encode: Try the ``h264_amf`` encoder before ``libx264``.
        vertical: Convert to 1080x1920 when True.
        face_bbox: ``[x1, y1, x2, y2]``; only the x extent is used, to centre
            the crop horizontally. Assumed to be in source-frame pixels —
            TODO confirm against the frames sent to the vision model.
        **kwargs: ``aspect_mode`` = ``"crop"`` (default) or ``"letterbox"``.

    Returns:
        ``output_path`` once an encoder succeeds.

    Raises:
        RuntimeError: every encoder failed (message includes ffmpeg's stderr
            tail).
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    encoders = ["h264_amf", "libx264"] if use_hw_encode else ["libx264"]

    # 9:16 vertical conversion filter
    vf_filters = []
    if vertical:
        aspect_mode = kwargs.get("aspect_mode", "crop")

        face_cx = None
        if face_bbox and len(face_bbox) == 4:
            try:
                x1, _, x2, _ = face_bbox
                face_cx = int((x1 + x2) / 2)
            except (TypeError, ValueError):
                face_cx = None  # malformed bbox → plain centre crop

        if aspect_mode == "letterbox":
            # Fit entire 16:9 frame into 9:16, black bars top+bottom
            vf_filters.append(
                "scale=1080:1920:force_original_aspect_ratio=decrease,"
                "pad=1080:1920:(ow-iw)/2:(oh-ih)/2:black"
            )
        elif face_cx is not None:
            # Crop a 9:16 window in SOURCE coordinates first, then scale.
            # The previous scale-then-crop form multiplied by iw/in_w, which
            # are the same variable (always 1), so a source-pixel x was used
            # on the already-scaled frame and the window landed off-target.
            # The x offset is clamped so the window stays inside the frame.
            vf_filters.append(
                "crop=ih*9/16:ih:"
                f"max(0\\,min(iw-ow\\,{face_cx}-ow/2)):0,"
                "scale=1080:1920"
            )
        else:
            # Crop: scale to 1920 height first, then center-crop to 1080 wide
            vf_filters.append("scale=-1:1920,crop=1080:1920:(iw-1080)/2:0")

    last_error = ""
    for encoder in encoders:
        cmd = ["ffmpeg", "-y", "-ss", str(start), "-to", str(end), "-i", str(video_path)]
        if vf_filters:
            cmd += ["-vf", ",".join(vf_filters)]
        cmd += ["-c:v", encoder, "-c:a", "aac", "-b:a", "128k", "-movflags", "+faststart", str(output_path)]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            if encoder == "h264_amf":
                logger.info(f"Encoded 9:16 with AMD AMF: {output_path.name}")
            return output_path
        last_error = result.stderr[-500:]
        if encoder == "h264_amf":
            logger.debug("AMD AMF not available, falling back to libx264")

    raise RuntimeError(f"All encoders failed for clip {output_path.name}\n{last_error}")
62
+
63
+
64
def burn_subtitles(
    clip_path: Path,
    ass_path: Path,
    output_path: Path,
    use_hw_encode: bool = True,
) -> Path:
    """Render an ASS subtitle file into ``clip_path`` with ffmpeg.

    Encoders are tried in order (``h264_amf`` first when ``use_hw_encode``),
    audio is copied unchanged, and ``output_path`` is returned on the first
    success. Raises RuntimeError with the last ffmpeg stderr tail if every
    encoder fails.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The path is embedded in a filter string: use forward slashes and
    # escape ':' so it is not read as an option separator.
    ass_str = str(ass_path).replace("\\", "/").replace(":", "\\:")
    subtitle_filter = f"ass='{ass_str}'"

    candidates = ["libx264"]
    if use_hw_encode:
        candidates.insert(0, "h264_amf")

    for codec in candidates:
        result = subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", str(clip_path),
                "-vf", subtitle_filter,
                "-c:v", codec,
                "-c:a", "copy",
                "-movflags", "+faststart",
                str(output_path),
            ],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            return output_path
        if codec == "h264_amf":
            logger.debug("AMD AMF burn-sub failed, using libx264")

    raise RuntimeError(f"Subtitle burn-in failed for {clip_path.name}\n{result.stderr[-500:]}")
96
+
97
+
98
def extract_all_clips(
    video_path: Path,
    selected_clips: list[dict],
    output_dir: Path,
    session_id: str,
    aspect_mode: str = "crop",
) -> list[dict]:
    """Extract every selected clip and report the outcome per clip.

    Args:
        video_path: Source video.
        selected_clips: Dicts with at least ``start`` and ``end`` (seconds);
            an optional ``vision_analysis.face_bbox`` steers the crop.
        output_dir: Directory receiving ``{session_id}_clip_NN_raw.mp4``.
        session_id: Prefix for output file names.
        aspect_mode: Passed through to ``extract_clip``.

    Returns:
        One dict per input clip (same order) with ``clip_index`` and
        ``clip_path`` added. On failure ``clip_path`` is None and ``error``
        holds the message; the remaining clips are still processed.
    """
    results = []
    for i, clip in enumerate(selected_clips):
        out_path = output_dir / f"{session_id}_clip_{i+1:02d}_raw.mp4"
        try:
            # `or {}` covers a clip whose vision_analysis is present but
            # None; inside the try so one bad clip cannot abort the batch.
            face_bbox = (clip.get("vision_analysis") or {}).get("face_bbox")
            extract_clip(video_path, clip["start"], clip["end"], out_path, face_bbox=face_bbox, aspect_mode=aspect_mode)
            results.append({**clip, "clip_index": i + 1, "clip_path": str(out_path)})
            logger.info(f"Extracted clip {i+1}: {clip['start']:.1f}s–{clip['end']:.1f}s → {out_path.name}")
        except Exception as e:
            logger.error(f"Failed to extract clip {i+1}: {e}")
            results.append({**clip, "clip_index": i + 1, "clip_path": None, "error": str(e)})
    return results
118
+
119
+
120
async def extract_all_clips_async(
    video_path: Path,
    selected_clips: list[dict],
    output_dir: Path,
    session_id: str,
    aspect_mode: str = "crop",
) -> list[dict]:
    """Run ``extract_all_clips`` on the default executor and await it."""
    # get_running_loop() is the supported call inside a coroutine and matches
    # the usage elsewhere in this project.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        lambda: extract_all_clips(video_path, selected_clips, output_dir, session_id, aspect_mode)
    )
backend/src/processing/emoji_overlay.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Emoji and text overlay utilities for HRE pipeline."""
2
+ import subprocess
3
+ from pathlib import Path
4
+ from loguru import logger
5
+
6
+
7
def add_emoji_overlay(
    video_path: Path,
    emoji: str,
    output_path: Path,
    x: str = "w-100",
    y: str = "50",
    size: int = 80,
    start_sec: float = 0.0,
    end_sec: float = 3.0,
) -> Path:
    """Draw ``emoji`` over the video between ``start_sec`` and ``end_sec``.

    Uses ffmpeg's drawtext filter with libx264 video and copied audio.
    Returns ``output_path`` on success; on failure logs a warning and returns
    the untouched ``video_path`` so the pipeline can continue.
    """
    # Escape the characters that would terminate the quoted text or be read
    # as an option separator.
    safe_text = emoji.replace("'", "\\'").replace(":", "\\:")

    filter_parts = [
        f"drawtext=text='{safe_text}'",
        f"fontsize={size}",
        f"x={x}",
        f"y={y}",
        f"enable='between(t,{start_sec},{end_sec})'",
    ]
    vf = ":".join(filter_parts)

    ffmpeg_cmd = ["ffmpeg", "-y", "-i", str(video_path), "-vf", vf]
    ffmpeg_cmd += ["-c:v", "libx264", "-c:a", "copy", str(output_path)]

    run = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
    succeeded = run.returncode == 0 and output_path.exists()
    if not succeeded:
        logger.warning(f"Emoji overlay failed: {run.stderr[-200:]}")
        return video_path  # fallback to original
    return output_path
backend/src/processing/high_retention.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """High-Retention Editing pipeline — per-segment AI decisions.
2
+
3
+ Each 3-5s segment gets its own zoom direction, subtitle position,
4
+ and caption color driven by Qwen2.5-VL analyzing one frame per segment.
5
+
6
+ Pipeline per clip:
7
+ 1. Segment clip at speech pauses (3-5s chunks)
8
+ 2. Extract midpoint frame from each segment
9
+ 3. Qwen2.5-VL analyzes each frame → zoom + subtitle decisions
10
+ 4. ffmpeg filter_complex: per-segment zoompan + concat
11
+ 5. ASS subtitles with per-segment alignment/color/size override tags
12
+ """
13
+ import subprocess
14
+ import tempfile
15
+ from pathlib import Path
16
+ from loguru import logger
17
+
18
+
19
+ # ─── Video metadata ────────────────────────────────────────────────────────────
20
+
21
def _probe_dimensions(video_path: Path) -> tuple[int, int]:
    """Return ``(width, height)`` of the first video stream via ffprobe.

    Falls back to ``(1080, 1920)`` when the output cannot be parsed.
    """
    probe = subprocess.run(
        ["ffprobe", "-v", "error", "-select_streams", "v:0",
         "-show_entries", "stream=width,height", "-of", "csv=p=0",
         str(video_path)],
        capture_output=True, text=True,
    )
    try:
        # Use only the first line and its first two non-empty fields: the
        # CSV writer can append a trailing comma or further lines, which
        # would otherwise make int() fail and hide the real size.
        first_line = probe.stdout.strip().splitlines()[0]
        fields = [part for part in first_line.split(",") if part.strip()]
        return int(fields[0]), int(fields[1])
    except Exception:
        return 1080, 1920
33
+
34
+
35
def _probe_duration(video_path: Path) -> float:
    """Return the container duration in seconds, or 0.0 if unreadable."""
    ffprobe_cmd = [
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "csv=p=0", str(video_path),
    ]
    reported = subprocess.run(ffprobe_cmd, capture_output=True, text=True).stdout
    try:
        seconds = float(reported.strip())
    except Exception:
        seconds = 0.0
    return seconds
45
+
46
+
47
def _has_audio_stream(video_path: Path) -> bool:
    """Return True when ffprobe lists at least one audio stream."""
    ffprobe_cmd = [
        "ffprobe", "-v", "error", "-select_streams", "a",
        "-show_entries", "stream=codec_type", "-of", "csv=p=0",
        str(video_path),
    ]
    listing = subprocess.run(ffprobe_cmd, capture_output=True, text=True).stdout
    # Any non-whitespace output means a stream matched the audio selector.
    return len(listing.strip()) > 0
55
+
56
+
57
+ # ─── Segmentation ─────────────────────────────────────────────────────────────
58
+
59
+ def _segment_clip(
60
+ duration: float,
61
+ transcript: dict,
62
+ clip_start: float,
63
+ max_seg: float = 4.5,
64
+ ) -> list[dict]:
65
+ """Divide clip into segments at speech pauses, max_seg seconds each."""
66
+ words: list[dict] = []
67
+ for seg in transcript.get("segments", []):
68
+ words.extend(seg.get("words", []))
69
+
70
+ if clip_start > 0:
71
+ words = [
72
+ {**w, "start": max(0.0, w["start"] - clip_start),
73
+ "end": max(0.0, w["end"] - clip_start)}
74
+ for w in words
75
+ ]
76
+ words = [w for w in words if w["end"] > 0 and w["start"] < duration]
77
+
78
+ # Collect pause midpoints as candidate cut times
79
+ cuts = [0.0]
80
+ for i in range(len(words) - 1):
81
+ gap = words[i + 1]["start"] - words[i]["end"]
82
+ if gap > 0.2:
83
+ cuts.append((words[i]["end"] + words[i + 1]["start"]) / 2.0)
84
+ cuts.append(duration)
85
+ cuts = sorted(set(cuts))
86
+
87
+ # Merge short intervals, split long ones
88
+ segs: list[dict] = []
89
+ start = 0.0
90
+ for cut in cuts[1:]:
91
+ seg_len = cut - start
92
+ if seg_len < 1.5 and cut < duration:
93
+ continue # too short — extend to next cut
94
+ if seg_len > max_seg:
95
+ t = start
96
+ while t + max_seg < cut:
97
+ segs.append({"start": t, "end": t + max_seg})
98
+ t += max_seg
99
+ if cut - t > 0.5:
100
+ segs.append({"start": t, "end": cut})
101
+ start = cut
102
+ else:
103
+ segs.append({"start": start, "end": cut})
104
+ start = cut
105
+
106
+ # Fallback: split evenly if not enough segments
107
+ if len(segs) < 2:
108
+ n = max(2, round(duration / 4.0))
109
+ d = duration / n
110
+ segs = [{"start": i * d, "end": min((i + 1) * d, duration)} for i in range(n)]
111
+
112
+ return segs
113
+
114
+
115
+ # ─── Frame extraction ─────────────────────────────────────────────────────────
116
+
117
+ def _extract_frame(video_path: Path, t: float, out_path: Path) -> bool:
118
+ cmd = [
119
+ "ffmpeg", "-y", "-ss", f"{t:.3f}", "-i", str(video_path),
120
+ "-vframes", "1", "-q:v", "3", str(out_path),
121
+ ]
122
+ result = subprocess.run(cmd, capture_output=True, timeout=30)
123
+ return result.returncode == 0 and out_path.exists()
124
+
125
+
126
+ # ─── Per-segment AI analysis ──────────────────────────────────────────────────
127
+
128
def _analyze_segment(
    video_path: Path,
    seg: dict,
    seg_idx: int,
    n_total: int,
    transcript: dict,
    clip_start: float,
    tmp_dir: Path,
) -> dict:
    """Analyze one segment from its middle frame plus the words spoken in it.

    Extracts the frame at the segment midpoint into ``tmp_dir``. If that
    fails, returns the default analysis for this segment index. Otherwise the
    frame and the transcript words overlapping the segment are handed to
    ``analyze_frame_for_hre``.
    """
    from src.analysis.vision import analyze_frame_for_hre, _default_hre_analysis

    seg_from, seg_to = seg["start"], seg["end"]
    still = tmp_dir / f"seg_{seg_idx:03d}.jpg"
    if not _extract_frame(video_path, (seg_from + seg_to) / 2.0, still):
        return _default_hre_analysis(seg_idx, n_total)

    # Word times in the transcript are compared after shifting the
    # clip-relative segment bounds by clip_start.
    window_lo = seg_from + clip_start
    window_hi = seg_to + clip_start

    spoken: list[str] = []
    for block in transcript.get("segments", []):
        for word in block.get("words", []):
            overlaps = (
                word.get("start", 0) < window_hi
                and word.get("end", 0) > window_lo
            )
            if overlaps:
                spoken.append(word.get("word", word.get("text", "")))

    return analyze_frame_for_hre(still, " ".join(spoken).strip(), seg_idx, n_total)
158
+
159
+
160
+ # ─── Zoom expression builders ─────────────────────────────────────────────────
161
+
162
+ def _build_zoom_exprs(
163
+ analysis: dict,
164
+ w: int,
165
+ h: int,
166
+ ) -> tuple[str, str, str]:
167
+ """Return (z_expr, x_expr, y_expr) for ffmpeg zoompan from HRE analysis.
168
+ Note: \\, escapes comma inside ffmpeg filter expressions.
169
+ """
170
+ direction = analysis.get("zoom_direction", "in")
171
+ speed = analysis.get("zoom_speed", "slow")
172
+ face_detected = bool(analysis.get("face_detected", False))
173
+ face_cx = float(analysis.get("face_cx") or 0.5)
174
+ face_cy = float(analysis.get("face_cy") or 0.38)
175
+
176
+ if direction == "in":
177
+ if speed == "fast":
178
+ z_expr, max_zoom = "min(1.2+n*0.0014\\,1.6)", 1.6
179
+ else:
180
+ z_expr, max_zoom = "min(1.05+n*0.0006\\,1.35)", 1.35
181
+ elif direction == "out":
182
+ if speed == "fast":
183
+ z_expr, max_zoom = "max(1.6-n*0.0016\\,1.0)", 1.6
184
+ else:
185
+ z_expr, max_zoom = "max(1.4-n*0.0010\\,1.0)", 1.4
186
+ else: # hold
187
+ z_expr, max_zoom = "1.1", 1.1
188
+
189
+ if face_detected and direction == "in" and max_zoom > 1.05:
190
+ raw_cx = int(face_cx * w - w / (max_zoom * 2))
191
+ raw_cy = int(face_cy * h - h / (max_zoom * 2))
192
+ safe_cx = max(0, min(w - int(w / max_zoom), raw_cx))
193
+ safe_cy = max(0, min(h - int(h / max_zoom), raw_cy))
194
+ ctr_x = w / 2 - w / (max_zoom * 2)
195
+ ctr_y = h / 2 - h / (max_zoom * 2)
196
+ x_expr = (
197
+ f"(iw/2-(iw/zoom/2))+({safe_cx}-{ctr_x:.1f})*(zoom-1)/({max_zoom}-1)"
198
+ )
199
+ y_expr = (
200
+ f"(ih/2-(ih/zoom/2))+({safe_cy}-{ctr_y:.1f})*(zoom-1)/({max_zoom}-1)"
201
+ )
202
+ else:
203
+ x_expr = "iw/2-(iw/zoom/2)"
204
+ if direction == "in":
205
+ y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
206
+ y_expr = f"ih*{y_bias:.2f}-(ih/zoom/2)"
207
+ else:
208
+ y_expr = "ih/2-(ih/zoom/2)"
209
+
210
+ return z_expr, x_expr, y_expr
211
+
212
+
213
+ # ─── Per-segment zoom via filter_complex ──────────────────────────────────────
214
+
215
+ def _apply_per_segment_zoom(
216
+ input_path: Path,
217
+ segments: list[dict],
218
+ analyses: list[dict],
219
+ w: int,
220
+ h: int,
221
+ output_path: Path,
222
+ has_audio: bool = True,
223
+ ) -> Path:
224
+ """Apply different zoompan to each segment, concat into single stream."""
225
+ filter_parts: list[str] = []
226
+ v_labels: list[str] = []
227
+ a_labels: list[str] = []
228
+
229
+ for i, (seg, analysis) in enumerate(zip(segments, analyses)):
230
+ s = f"{seg['start']:.3f}"
231
+ e = f"{seg['end']:.3f}"
232
+ z, x, y = _build_zoom_exprs(analysis, w, h)
233
+ zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30"
234
+ filter_parts.append(f"[0:v]trim={s}:{e},setpts=PTS-STARTPTS,{zp}[v{i}]")
235
+ v_labels.append(f"[v{i}]")
236
+ if has_audio:
237
+ filter_parts.append(f"[0:a]atrim={s}:{e},asetpts=PTS-STARTPTS[a{i}]")
238
+ a_labels.append(f"[a{i}]")
239
+
240
+ n = len(segments)
241
+ filter_parts.append("".join(v_labels) + f"concat=n={n}:v=1:a=0[vout]")
242
+ if has_audio:
243
+ filter_parts.append("".join(a_labels) + f"concat=n={n}:v=0:a=1[aout]")
244
+
245
+ cmd = [
246
+ "ffmpeg", "-y", "-i", str(input_path),
247
+ "-filter_complex", ";".join(filter_parts),
248
+ "-map", "[vout]",
249
+ ]
250
+ if has_audio:
251
+ cmd += ["-map", "[aout]", "-c:a", "aac"]
252
+ cmd += ["-c:v", "libx264", "-movflags", "+faststart", str(output_path)]
253
+
254
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
255
+ if result.returncode == 0 and output_path.exists():
256
+ logger.info(f"Per-segment zoom: {n} segments, {w}x{h}")
257
+ return output_path
258
+ logger.warning(f"Per-segment zoom failed: {result.stderr[-800:]}")
259
+ return input_path
260
+
261
+
262
+ # ─── Per-segment ASS subtitles ────────────────────────────────────────────────
263
+
264
+ _ASS_COLORS = {
265
+ "white": "&H00FFFFFF",
266
+ "yellow": "&H0000FFFF",
267
+ "cyan": "&H00FFFF00",
268
+ "orange": "&H000066FF",
269
+ "green": "&H0000FF00",
270
+ "red": "&H000000FF",
271
+ }
272
+
273
+
274
+ def _ts(t: float) -> str:
275
+ h = int(t // 3600)
276
+ m = int((t % 3600) // 60)
277
+ s = t % 60
278
+ return f"{h}:{m:02d}:{s:06.3f}"
279
+
280
+
281
def _generate_per_segment_subtitles(
    transcript: dict,
    ass_path: Path,
    clip_start: float,
    segments: list[dict],
    analyses: list[dict],
) -> None:
    """Write an ASS file whose events are styled by the segment they start in.

    One event is produced per transcript word. When the transcript has no
    usable words, each sentence is split into 3-word chunks spread evenly
    over the sentence's time span. Every event gets inline overrides
    (alignment, colour, font size, pop animation) taken from the analysis of
    the segment containing the event's start time; events outside every
    segment use the last analysis.
    """
    def to_clip_time(value) -> float:
        # Shift to clip-relative time, never below zero.
        return max(0.0, float(value) - clip_start)

    transcript_segments = transcript.get("segments", [])

    # Word-level events
    events: list[dict] = []
    for block in transcript_segments:
        for word in block.get("words", []):
            begin = to_clip_time(word.get("start", 0))
            finish = to_clip_time(word.get("end", 0))
            label = word.get("word", word.get("text", "")).strip()
            if not label or not finish > 0:
                continue
            # Guarantee a minimum on-screen time of 80ms.
            events.append({"start": begin, "end": max(finish, begin + 0.08), "text": label})

    # Sentence-level fallback (split into 3-word chunks)
    if not events:
        per_chunk = 3
        for block in transcript_segments:
            begin = to_clip_time(block.get("start", 0))
            finish = to_clip_time(block.get("end", 0))
            sentence = block.get("text", "").strip()
            if not sentence or finish <= 0:
                continue
            tokens = sentence.split()
            n_chunks = max(1, (len(tokens) + per_chunk - 1) // per_chunk)
            step = (finish - begin) / n_chunks
            for k in range(n_chunks):
                events.append({
                    "start": begin + k * step,
                    "end": begin + (k + 1) * step,
                    "text": " ".join(tokens[k * per_chunk:(k + 1) * per_chunk]),
                })

    def analysis_at(moment: float) -> dict:
        # First segment whose [start, end) contains the moment wins.
        for candidate, candidate_analysis in zip(segments, analyses):
            if candidate["start"] <= moment < candidate["end"]:
                return candidate_analysis
        return analyses[-1] if analyses else {}

    script_info = [
        "[Script Info]",
        "ScriptType: v4.00+",
        "PlayResX: 1080",
        "PlayResY: 1920",
        "ScaledBorderAndShadow: yes",
        "",
    ]
    styles = [
        "[V4+ Styles]",
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
        "Alignment, MarginL, MarginR, MarginV, Encoding",
        "Style: Default,Impact,90,&H00FFFFFF,&H0000FFFF,&H00000000,&H80000000,"
        "-1,0,0,0,100,100,0,0,1,4,0,2,40,40,200,1",
        "",
    ]
    events_header = [
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
    ]
    lines = script_info + styles + events_header

    # Pop animation: start 130% scale, shrink to 100% in 120ms
    pop = "{\\fscx130\\fscy130\\t(0,120,\\fscx100\\fscy100)}"

    for ev in events:
        an = analysis_at(ev["start"])
        color = _ASS_COLORS.get(an.get("subtitle_color", "white"), "&H00FFFFFF")
        on_top = an.get("subtitle_position", "bottom") == "top"
        energy = an.get("energy_level", "medium")
        moment = an.get("moment_type", "context")

        alignment, margin_v = (8, 120) if on_top else (2, 200)
        if energy == "high" or moment in ("hook", "punchline"):
            fs = 108
        elif energy == "low":
            fs = 80
        else:
            fs = 92

        tag = f"{{\\an{alignment}\\1c{color}&\\fs{fs}\\b1}}{pop}"
        lines.append(
            f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])},"
            f"Default,,0,0,{margin_v},,{tag}{ev['text'].upper()}"
        )

    ass_path.write_text("\n".join(lines), encoding="utf-8")
    logger.debug(f"ASS: {len(events)} events across {len(segments)} segments")
367
+
368
+
369
+ # ─── Emoji ─────────────────────────────────────────────────────────────────────
370
+
371
+ def _get_emoji(clip_data: dict, analyses: list[dict] | None = None) -> str:
372
+ if analyses:
373
+ energy_rank = {"high": 3, "medium": 2, "low": 1}
374
+ best = max(analyses, key=lambda a: energy_rank.get(a.get("energy_level", "low"), 1))
375
+ moment_emoji = {
376
+ "hook": "🔥", "punchline": "😂", "reaction": "😲",
377
+ "context": "💡", "transition": "✨",
378
+ }
379
+ if emoji := moment_emoji.get(best.get("moment_type", "")):
380
+ return emoji
381
+
382
+ a = clip_data.get("vision_analysis", {})
383
+ emotion = a.get("emotion", "excited")
384
+ action = a.get("action_type", "entertainment")
385
+ transcript_text = clip_data.get("transcript_text", "")
386
+ if transcript_text:
387
+ try:
388
+ from src.analysis.vision import get_emoji_for_scene
389
+ return get_emoji_for_scene(transcript_text, emotion, action)
390
+ except Exception:
391
+ pass
392
+
393
+ fb = {"happy": "😄", "excited": "🔥", "funny": "😂", "surprised": "😲",
394
+ "gaming": "🎮", "tutorial": "📚", "angry": "😤", "sad": "😢"}
395
+ return fb.get(emotion, fb.get(action, "⚡"))
396
+
397
+
398
+ # ─── Final render ─────────────────────────────────────────────────────────────
399
+
400
def _render_final(
    video_path: Path,
    ass_path: Path,
    emoji: str,
    output_path: Path,
) -> None:
    """Burn the ASS subtitles (and, if possible, an emoji) into the video.

    Two attempts are made with the same encoder settings: first subtitles
    plus a drawtext emoji shown for the first 3 seconds, then subtitles only.
    A non-zero ffmpeg exit or a 300s timeout on one attempt moves on to the
    next. If both fail the error is logged and the function returns without
    raising; ``output_path`` may then not exist.
    """
    # Forward slashes and an escaped ':' so the path survives ffmpeg's
    # filter-argument parsing.
    ass_str = str(ass_path).replace("\\", "/").replace(":", "\\:")
    emoji_filter = (
        f"drawtext=text='{emoji}':fontsize=80:x=w-100:y=50"
        f":enable='between(t\\,0\\,3)'"
    )
    attempts = [
        f"ass='{ass_str}',{emoji_filter}",
        f"ass='{ass_str}'",  # fallback: subtitles only
    ]

    last_error = ""
    for vf in attempts:
        cmd = [
            "ffmpeg", "-y", "-i", str(video_path),
            "-vf", vf, "-c:v", "libx264", "-c:a", "copy",
            "-movflags", "+faststart", str(output_path),
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
        except subprocess.TimeoutExpired:
            last_error = "ffmpeg timed out after 300s"
            continue
        if result.returncode == 0:
            logger.info(f"HRE render complete → {output_path.name}")
            return
        last_error = result.stderr[-300:]

    logger.error(f"HRE render failed: {last_error}")
430
+
431
+
432
+ # ─── Main pipeline ────────────────────────────────────────────────────────────
433
+
434
def apply_hre(
    clip_path: Path,
    clip_data: dict,
    transcript: dict,
    output_path: Path,
) -> Path:
    """Run the per-segment HRE pipeline on one clip and return ``output_path``.

    Steps: probe the clip, cut it into segments at speech pauses, analyze
    each segment, apply per-segment zoom, write a per-segment styled ``.ass``
    file next to the output, choose an emoji, and render the final video.
    Intermediate video lives in a temporary directory that is removed on exit.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    clip_start = clip_data.get("start", 0.0)

    with tempfile.TemporaryDirectory() as scratch:
        tmp_dir = Path(scratch)

        w, h = _probe_dimensions(clip_path)
        duration = _probe_duration(clip_path)
        if duration <= 0:
            # Probe failed: derive the length from clip metadata (30s default).
            duration = float(clip_data.get("end", clip_start + 30)) - clip_start
        has_audio = _has_audio_stream(clip_path)

        # 1. Segment at speech pauses
        segments = _segment_clip(duration, transcript, clip_start)
        n = len(segments)
        logger.info(
            f"HRE clip {clip_data.get('index', '?')}: "
            f"{duration:.1f}s → {n} segments (AI analyzing each)"
        )

        # 2. Qwen2.5-VL analyzes each segment
        analyses = []
        for idx, seg in enumerate(segments):
            analyses.append(
                _analyze_segment(clip_path, seg, idx, n, transcript, clip_start, tmp_dir)
            )

        for seg, an in zip(segments, analyses):
            logger.info(
                f"  [{seg['start']:.1f}s-{seg['end']:.1f}s] "
                f"zoom={an.get('zoom_direction')}({an.get('zoom_speed')}) "
                f"sub={an.get('subtitle_position')}/{an.get('subtitle_color')} "
                f"type={an.get('moment_type')} energy={an.get('energy_level')}"
            )

        # 3. Per-segment zoom via filter_complex
        zoomed = _apply_per_segment_zoom(
            clip_path, segments, analyses, w, h,
            tmp_dir / "zoomed.mp4", has_audio=has_audio,
        )

        # 4. Per-segment ASS subtitles
        ass_path = output_path.with_suffix(".ass")
        _generate_per_segment_subtitles(transcript, ass_path, clip_start, segments, analyses)

        # 5. Emoji from highest-energy segment, then 6. render
        _render_final(zoomed, ass_path, _get_emoji(clip_data, analyses), output_path)

    return output_path
backend/src/processing/subtitle.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate ASS subtitles using pysubs2.
2
+
3
+ Supports: word-by-word, sentence, karaoke, fade, pop, typewriter animations.
4
+ Full ASS spec: font, size, 4-color layers, border, shadow, position, alignment.
5
+ Handles Thai/Chinese character-level splitting.
6
+ """
7
+ from pathlib import Path
8
+ from typing import Optional
9
+ import pysubs2
10
+ from pysubs2 import SSAFile, SSAEvent, SSAStyle
11
+ from loguru import logger
12
+
13
+ # Languages that split by character rather than word
14
# Languages that split by character rather than word (no spaces between
# words). Kept in step with the set of the same name in the transcription
# module, which also lists "my".
CHAR_LEVEL_LANGUAGES = {"th", "zh", "ja", "km", "lo", "my"}

# Default font per subtitle language code; "default" is used for any
# language without its own entry.
DEFAULT_FONTS = {
    "th": "Noto Sans Thai",
    "zh": "Noto Sans SC",
    "zh-tw": "Noto Sans TC",
    "ja": "Noto Sans JP",
    "ko": "Noto Sans KR",
    "en": "Montserrat",
    "default": "Noto Sans",
}
26
+
27
+ # Animation presets (ASS override tags)
28
+ def _fade_tags(fade_in_ms: int = 200, fade_out_ms: int = 200) -> str:
29
+ return f"{{\\fade({fade_in_ms},{fade_out_ms})}}"
30
+
31
+ def _pop_tags() -> str:
32
+ return "{\\t(0,100,\\fscx120\\fscy120)\\t(100,200,\\fscx100\\fscy100)}"
33
+
34
+ def _typewriter_per_char(char: str, delay_ms: int) -> str:
35
+ return f"{{\\alpha&HFF&\\t({delay_ms},{delay_ms+80},\\alpha&H00&)}}{char}"
36
+
37
+ def _bounce_tags() -> str:
38
+ return "{\\t(0,150,\\frz-5)\\t(150,300,\\frz5)\\t(300,400,\\frz0)}"
39
+
40
+
41
+ def _color_to_ass(hex_color: str, alpha: int = 0) -> str:
42
+ """Convert #RRGGBB hex to ASS &HAABBGGRR format."""
43
+ hex_color = hex_color.lstrip("#")
44
+ if len(hex_color) == 6:
45
+ r, g, b = hex_color[0:2], hex_color[2:4], hex_color[4:6]
46
+ else:
47
+ r, g, b = "FF", "FF", "FF"
48
+ aa = f"{alpha:02X}"
49
+ return f"&H{aa}{b}{g}{r}"
50
+
51
+
52
def build_style(
    font_family: str = "Noto Sans",
    font_size: int = 72,
    primary_color: str = "#FFFFFF",
    secondary_color: str = "#FFFF00",
    outline_color: str = "#000000",
    shadow_color: str = "#000000",
    primary_alpha: int = 0,
    outline_alpha: int = 0,
    shadow_alpha: int = 80,
    bold: bool = True,
    italic: bool = False,
    underline: bool = False,
    outline_size: float = 4.0,
    shadow_size: float = 2.0,
    alignment: int = 2,   # 2=bottom-center, 8=top-center
    margin_l: int = 40,
    margin_r: int = 40,
    margin_v: int = 250,
    scale_x: int = 100,
    scale_y: int = 100,
    spacing: float = 0.0,
    angle: float = 0.0,
) -> SSAStyle:
    """Create an ``SSAStyle`` from flat, frontend-friendly settings.

    Colours are ``#RRGGBB`` strings converted through ``_hex_to_rgba``; the
    secondary colour always uses alpha 0 and the shadow colour is stored as
    the style's back colour. The border style is fixed to 1.
    """
    def as_color(hex_value: str, alpha: int):
        return pysubs2.Color(*_hex_to_rgba(hex_value, alpha))

    attributes = {
        "fontname": font_family,
        "fontsize": font_size,
        "primarycolor": as_color(primary_color, primary_alpha),
        "secondarycolor": as_color(secondary_color, 0),
        "outlinecolor": as_color(outline_color, outline_alpha),
        "backcolor": as_color(shadow_color, shadow_alpha),
        "bold": bold,
        "italic": italic,
        "underline": underline,
        "outline": outline_size,
        "shadow": shadow_size,
        "alignment": alignment,
        "marginl": margin_l,
        "marginr": margin_r,
        "marginv": margin_v,
        "scalex": scale_x,
        "scaley": scale_y,
        "spacing": spacing,
        "angle": angle,
        "borderstyle": 1,  # outline + shadow
    }

    style = SSAStyle()
    for attr_name, attr_value in attributes.items():
        setattr(style, attr_name, attr_value)
    return style
98
+
99
+
100
+ def _hex_to_rgba(hex_color: str, alpha_0_255: int = 0):
101
+ """Convert #RRGGBB to (R, G, B, A) where A=0 is opaque."""
102
+ hex_color = hex_color.lstrip("#")
103
+ if len(hex_color) == 6:
104
+ r = int(hex_color[0:2], 16)
105
+ g = int(hex_color[2:4], 16)
106
+ b = int(hex_color[4:6], 16)
107
+ else:
108
+ r, g, b = 255, 255, 255
109
+ return r, g, b, alpha_0_255
110
+
111
+
112
def generate_subtitles(
    transcript: dict,
    output_path: Path,
    style_config: dict,
    clip_start_offset: float = 0.0,
) -> Path:
    """Generate an .ass subtitle file from a transcript.

    Args:
        transcript: Dict with a ``"segments"`` list (each with ``start``,
            ``end``, ``text`` and optionally ``words``) and an optional
            ``"char_level"`` flag.
        output_path: Where to save the .ass file (parent dirs are created).
        style_config: Font/colour/animation settings; missing keys fall back
            to the defaults listed below.
        clip_start_offset: Seconds subtracted from every timestamp (for
            sub-clips cut from a longer video).

    Returns:
        ``output_path``.
    """
    subs = SSAFile()
    subs.info.update({
        "PlayResX": "1080",
        "PlayResY": "1920",
        "ScaledBorderAndShadow": "yes",
        "WrapStyle": "0",
    })

    display_mode = style_config.get("display_mode", "word")      # "word" or "sentence"
    animation = style_config.get("animation", "none")            # none|fade|karaoke|pop|typewriter|bounce
    subtitle_lang = style_config.get("subtitle_language", "en")
    char_level = transcript.get("char_level", False) or subtitle_lang in CHAR_LEVEL_LANGUAGES

    font_family = style_config.get("font_family") or DEFAULT_FONTS.get(subtitle_lang, DEFAULT_FONTS["default"])

    # Config key == build_style keyword; value is the default when absent.
    style_defaults = {
        "font_size": 72,
        "primary_color": "#FFFFFF",
        "secondary_color": "#FFFF00",
        "outline_color": "#000000",
        "shadow_color": "#000000",
        "primary_alpha": 0,
        "outline_alpha": 0,
        "shadow_alpha": 80,
        "bold": True,
        "italic": False,
        "underline": False,
        "outline_size": 4.0,
        "shadow_size": 2.0,
        "alignment": 2,
        "margin_l": 40,
        "margin_r": 40,
        "margin_v": 250,
        "scale_x": 100,
        "scale_y": 100,
        "spacing": 0.0,
        "angle": 0.0,
    }
    resolved = {key: style_config.get(key, fallback) for key, fallback in style_defaults.items()}
    subs.styles["Default"] = build_style(font_family=font_family, **resolved)

    for seg in transcript.get("segments", []):
        words = seg.get("words", [])
        seg_end = seg["end"] - clip_start_offset
        if seg_end <= 0:
            continue  # segment ends before clip starts — skip entirely

        seg_start = max(0.0, seg["start"] - clip_start_offset)

        if display_mode == "sentence" or not words:
            _add_sentence_event(subs, seg["text"], seg_start, seg_end, animation, style_config)
        elif animation == "karaoke":
            _add_karaoke_line(subs, words, seg_start, seg_end, clip_start_offset, char_level)
        else:
            _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_start_offset)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    subs.save(str(output_path), encoding="utf-8")
    logger.info(f"Generated {len(subs)} subtitle events → {output_path.name}")
    return output_path
187
+
188
+
189
def _add_sentence_event(subs, text, start, end, animation, style_config):
    """Append one event showing the whole sentence from ``start`` to ``end`` (seconds).

    "fade", "pop" and "bounce" add their override tags in front of the text;
    any other animation value adds none.
    """
    tag_builders = {
        "fade": lambda: _fade_tags(
            style_config.get("fade_in_ms", 200),
            style_config.get("fade_out_ms", 200),
        ),
        "pop": _pop_tags,
        "bounce": _bounce_tags,
    }
    builder = tag_builders.get(animation)
    prefix = builder() if builder is not None else ""

    subs.append(
        SSAEvent(
            start=pysubs2.make_time(s=start),
            end=pysubs2.make_time(s=end),
            text=prefix + text.strip(),
        )
    )
206
+
207
+
208
def _add_word_events(subs, words, seg_start, seg_end, animation, char_level, style_config, clip_offset=0.0):
    """Add one SSAEvent per word (word-by-word mode).

    Args:
        subs: Subtitle file the events are appended to.
        words: Word dicts with ``"word"``, ``"start"``, ``"end"`` (seconds,
            same time base as ``clip_offset``).
        seg_start, seg_end: Clip-relative bounds of the enclosing segment
            (kept for signature compatibility; not needed for timing).
        animation: "fade", "pop" or "typewriter"; anything else adds no tags.
        char_level: When true, each word is split into characters and the
            word's time span is shared out evenly between them, so the
            characters appear one after another instead of as overlapping
            events with identical timing.
        style_config: Optional ``fade_in_ms`` / ``fade_out_ms`` overrides.
        clip_offset: Seconds subtracted from word times.

    Units that would start before the clip (negative start) are skipped.
    """
    unit_list = []
    for w in words:
        if not char_level:
            unit_list.append(w)
            continue
        chars = [ch for ch in w["word"] if not ch.isspace()]
        if not chars:
            continue
        w_start, w_end = w["start"], w["end"]
        if w_end <= w_start:
            w_end = w_start + 0.3  # same fallback length as zero-length words
        step = (w_end - w_start) / len(chars)
        for idx, ch in enumerate(chars):
            unit_list.append({
                "word": ch,
                "start": w_start + idx * step,
                "end": w_start + (idx + 1) * step,
            })

    for unit in unit_list:
        label = unit["word"].strip()
        if not label:
            continue  # nothing to draw
        start = unit["start"] - clip_offset
        end = (unit["end"] - clip_offset) if unit["end"] > unit["start"] else start + 0.3
        if start < 0:
            continue

        tags = ""
        if animation == "fade":
            fi = style_config.get("fade_in_ms", 150)
            fo = style_config.get("fade_out_ms", 100)
            tags = _fade_tags(fi, fo)
        elif animation == "pop":
            tags = _pop_tags()
        elif animation == "typewriter":
            # \t times are measured from the start of this event, and the
            # event already starts when the word is spoken. A delay measured
            # from the segment start would keep later words transparent for
            # their whole lifetime, so reveal from 0.
            tags = _typewriter_per_char("", 0)

        event = SSAEvent(
            start=pysubs2.make_time(s=start),
            end=pysubs2.make_time(s=end),
            text=tags + label,
        )
        subs.append(event)
241
+
242
+
243
def _add_karaoke_line(subs, words, seg_start, seg_end, clip_offset, char_level):
    """Add karaoke-style line: full line visible, words highlight in sequence.

    Args:
        subs: Subtitle file the event is appended to.
        words: Word dicts with ``"word"``, ``"start"``, ``"end"`` (seconds,
            same time base as ``clip_offset``).
        seg_start, seg_end: Clip-relative event bounds in seconds.
        clip_offset: Seconds subtracted from word times.
        char_level: Emit one ``\\kf`` per character instead of per word.

    Karaoke durations are centiseconds counted from the event start and run
    back to back, so silences must be spent explicitly: each pause before a
    word is emitted as an empty ``\\k`` syllable. Without that the highlight
    drifts ahead of the audio by the sum of all earlier pauses.
    """
    pieces = []
    cursor_cs = int(round(seg_start * 100))  # time already accounted for
    for w in words:
        word_text = w["word"].strip()
        if not word_text:
            continue
        # Clamp to the cursor so words that begin before the clip (or
        # overlap the previous word) never produce negative durations.
        start_cs = max(cursor_cs, int(round((w["start"] - clip_offset) * 100)))
        end_cs = max(start_cs, int(round((w["end"] - clip_offset) * 100)))

        pause_cs = start_cs - cursor_cs
        if pause_cs > 0:
            pieces.append(f"{{\\k{pause_cs}}}")

        duration_cs = end_cs - start_cs
        if char_level:
            # Spread the remainder over the first characters so the
            # per-character durations add up to the word's duration.
            base, extra = divmod(duration_cs, len(word_text))
            for idx, ch in enumerate(word_text):
                pieces.append(f"{{\\kf{base + (1 if idx < extra else 0)}}}{ch}")
        else:
            pieces.append(f"{{\\kf{duration_cs}}}{word_text} ")
        cursor_cs = end_cs

    event = SSAEvent(
        start=pysubs2.make_time(s=seg_start),
        end=pysubs2.make_time(s=seg_end),
        text="".join(pieces).strip(),
    )
    subs.append(event)
261
+
262
+
263
def update_subtitle_event(
    ass_path: Path,
    event_index: int,
    updates: dict,
) -> Path:
    """Update a single subtitle event (for editor patches).

    Args:
        ass_path: Subtitle file to modify in place.
        event_index: Zero-based index of the event to change.
        updates: Any of ``"text"`` (str), ``"start"``, ``"end"`` (seconds).

    Returns:
        ``ass_path``.

    Raises:
        IndexError: If ``event_index`` is negative or past the last event.
            Negative values are rejected explicitly; otherwise Python's
            from-the-end indexing would silently edit a different event.
    """
    if event_index < 0:
        raise IndexError(f"Event index {event_index} out of range")

    subs = SSAFile.load(str(ass_path))
    if event_index >= len(subs):
        raise IndexError(f"Event index {event_index} out of range")

    evt = subs[event_index]
    if "text" in updates:
        evt.text = updates["text"]
    if "start" in updates:
        evt.start = pysubs2.make_time(s=updates["start"])
    if "end" in updates:
        evt.end = pysubs2.make_time(s=updates["end"])

    subs.save(str(ass_path), encoding="utf-8")
    return ass_path
283
+
284
+
285
def apply_global_style_override(ass_path: Path, style_config: dict) -> Path:
    """Replace the file's "Default" style with one built from ``style_config``.

    Only keys that are actual ``build_style`` parameters are forwarded;
    everything else in the config (animation, display mode, ...) is ignored.
    The file is rewritten in place and its path returned.
    """
    subs = SSAFile.load(str(ass_path))
    code = build_style.__code__
    # co_varnames lists parameters first, then locals. Slice to the
    # parameters so a config key that happens to match a local variable
    # name is not passed as an unexpected keyword argument.
    accepted = set(code.co_varnames[:code.co_argcount])
    new_style = build_style(**{k: v for k, v in style_config.items() if k in accepted})
    subs.styles["Default"] = new_style
    subs.save(str(ass_path), encoding="utf-8")
    return ass_path
backend/src/transcription/__init__.py ADDED
File without changes
backend/src/transcription/whisper.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Speech-to-text using insanely-fast-whisper with ROCm support.
2
+
3
+ Word-level timestamps for subtitle generation.
4
+ Supports transcription (same language) and translation (→ English then to target).
5
+ """
6
+ import asyncio
7
+ import subprocess
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Optional
12
+ from loguru import logger
13
+
14
+ # Language codes supported by Whisper
15
# Lower-case language name -> two-letter code handed to Whisper.
WHISPER_LANGUAGES = dict(
    thai="th",
    english="en",
    chinese="zh",
    japanese="ja",
    korean="ko",
    french="fr",
    german="de",
    spanish="es",
    portuguese="pt",
    russian="ru",
    arabic="ar",
    hindi="hi",
    vietnamese="vi",
    indonesian="id",
    malay="ms",
)

# Languages that need character-level splitting (no word spaces)
CHAR_LEVEL_LANGUAGES = set(("th", "zh", "ja", "km", "lo", "my"))
35
+
36
+
37
def extract_audio(video_path: Path, audio_path: Path) -> Path:
    """Write the video's audio as mono 16 kHz WAV to ``audio_path`` via ffmpeg.

    Returns:
        ``audio_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status; the message
            carries ffmpeg's stderr.
    """
    ffmpeg_args = ["ffmpeg", "-y", "-i", str(video_path)]
    ffmpeg_args.extend(["-ac", "1", "-ar", "16000"])     # mono, 16 kHz
    ffmpeg_args.extend(["-vn", "-f", "wav", str(audio_path)])  # no video, WAV container

    completed = subprocess.run(ffmpeg_args, capture_output=True, text=True)
    if completed.returncode == 0:
        return audio_path
    raise RuntimeError(f"ffmpeg audio extraction failed: {completed.stderr}")
48
+
49
+
50
def transcribe(
    audio_path: Path,
    clip_language: str = "auto",
    subtitle_language: str = "en",
    model_size: str = "large-v3",
    device: str = "cuda",
    batch_size: int = 16,
) -> dict:
    """Transcribe audio and return word-level timestamps.

    Args:
        audio_path: Audio file handed to the Whisper pipeline.
        clip_language: Spoken language, as a name ("thai") or code ("th").
            Anything unrecognised (e.g. "auto") lets Whisper auto-detect.
        subtitle_language: Target subtitle language, name or code. Anything
            unrecognised is treated as "en".
        model_size: Suffix of the ``openai/whisper-*`` model id.
        device: "cuda" to try the GPU first; anything else runs with float32.
        batch_size: Batch size for the GPU pipeline (CPU always uses 1).

    Returns:
        {
            "text": str,
            "segments": [{"start": float, "end": float, "text": str, "words": [...]}],
            "language": str,
            "char_level": bool,
            "task": str,
        }
        When ``transformers``/``torch`` cannot be imported, a fixed stub
        transcript is returned instead.
    """
    def _lang_code(value, default):
        # Accept both language names and codes; the rest of the project
        # passes codes such as "en"/"th", which a name-only lookup would
        # silently turn into the default.
        key = (value or "").strip().lower()
        if key in WHISPER_LANGUAGES:
            return WHISPER_LANGUAGES[key]
        if key in WHISPER_LANGUAGES.values() or key in CHAR_LEVEL_LANGUAGES:
            return key
        return default

    clip_lang_code = _lang_code(clip_language, None)
    sub_lang_code = _lang_code(subtitle_language, "en")

    # Determine whisper task: Whisper's built-in translation only targets
    # English; non-English targets keep transcription in the clip language.
    task = "transcribe"
    if clip_lang_code and clip_lang_code != sub_lang_code and sub_lang_code == "en":
        task = "translate"

    logger.info(f"Whisper: task={task}, clip_lang={clip_lang_code}, sub_lang={sub_lang_code}, model={model_size}")

    try:
        from transformers import pipeline
        import torch

        # AMD ROCm: float16 triggers HIPBLAS_STATUS_INTERNAL_ERROR on some models,
        # so bfloat16 is used when the device looks like AMD (or cannot be
        # identified); float16 otherwise.
        if device == "cuda":
            try:
                name = torch.cuda.get_device_name(0).lower()
                is_amd = any(k in name for k in ("amd", "radeon", "instinct", "mi"))
            except Exception:
                is_amd = True  # default safe
            dtype = torch.bfloat16 if is_amd else torch.float16
        else:
            dtype = torch.float32

        def _run_on_cpu(gk):
            logger.warning("Whisper: running on CPU (GPU unavailable or OOM)")
            pipe_cpu = pipeline(
                "automatic-speech-recognition",
                model=f"openai/whisper-{model_size}",
                torch_dtype=torch.float32,
                device="cpu",
            )
            return pipe_cpu(str(audio_path), batch_size=1,
                            return_timestamps="word", generate_kwargs=gk)

        generate_kwargs = {"task": task}
        if clip_lang_code:
            generate_kwargs["language"] = clip_lang_code

        # Check free VRAM — if GPU is nearly full, go straight to CPU
        use_gpu = device == "cuda"
        if use_gpu:
            try:
                free_bytes = torch.cuda.mem_get_info(0)[0]
                if free_bytes < 8 * 1024 ** 3:  # < 8 GB free
                    logger.warning(f"Whisper: only {free_bytes/1024**3:.1f} GB free — using CPU")
                    use_gpu = False
            except Exception:
                # Could not query memory; try the GPU anyway.
                pass

        pipe = None
        try:
            if not use_gpu:
                result = _run_on_cpu(generate_kwargs)
            else:
                pipe = pipeline(
                    "automatic-speech-recognition",
                    model=f"openai/whisper-{model_size}",
                    torch_dtype=dtype,
                    device=device,
                    model_kwargs={"attn_implementation": "sdpa"},
                )
                result = pipe(str(audio_path), batch_size=batch_size,
                              return_timestamps="word", generate_kwargs=generate_kwargs)
        except Exception as e:
            err = str(e)
            gpu_failure = any(
                k in err for k in ("HIPBLAS", "HIP", "out of memory", "OutOfMemory", "CUDA")
            )
            # Retry on CPU only when the failed attempt actually was the GPU
            # one; repeating an already-failed CPU run would not help.
            if use_gpu and gpu_failure:
                logger.warning(f"GPU error in Whisper ({err[:120]}), retrying on CPU")
                result = _run_on_cpu(generate_kwargs)
            else:
                raise
        finally:
            if pipe is not None:
                del pipe
            try:
                torch.cuda.empty_cache()
            except Exception:
                pass

        segments = _build_segments(result, sub_lang_code)
        char_level = sub_lang_code in CHAR_LEVEL_LANGUAGES

        return {
            "text": result.get("text", ""),
            "segments": segments,
            "language": clip_lang_code or "auto",
            "char_level": char_level,
            "task": task,
        }

    except ImportError:
        logger.warning("transformers not available, using stub transcription")
        return _stub_transcription(str(audio_path))
166
+
167
+
168
+ def _build_segments(whisper_result: dict, target_lang: str) -> list:
169
+ """Convert Whisper output to segment list with word timestamps."""
170
+ segments = []
171
+ chunks = whisper_result.get("chunks", [])
172
+
173
+ if not chunks:
174
+ # Fallback: single segment
175
+ return [{"start": 0.0, "end": 30.0, "text": whisper_result.get("text", ""), "words": []}]
176
+
177
+ current_seg = {"start": None, "end": None, "text": "", "words": []}
178
+ SEGMENT_GAP = 1.5 # seconds gap to split into new segment
179
+
180
+ for chunk in chunks:
181
+ ts = chunk.get("timestamp", [0, 0])
182
+ start, end = (ts[0] or 0.0), (ts[1] or ts[0] or 0.0)
183
+ text = chunk.get("text", "").strip()
184
+
185
+ if not text:
186
+ continue
187
+
188
+ if current_seg["start"] is None:
189
+ current_seg["start"] = start
190
+
191
+ if current_seg["words"] and start - current_seg["end"] > SEGMENT_GAP:
192
+ segments.append(current_seg)
193
+ current_seg = {"start": start, "end": end, "text": text, "words": []}
194
+ else:
195
+ current_seg["text"] += (" " if current_seg["text"] else "") + text
196
+
197
+ current_seg["words"].append({"word": text, "start": start, "end": end})
198
+ current_seg["end"] = end
199
+
200
+ if current_seg["start"] is not None:
201
+ segments.append(current_seg)
202
+
203
+ return segments
204
+
205
+
206
+ def _stub_transcription(audio_path: str) -> dict:
207
+ """Return minimal stub when Whisper is unavailable (dev/CPU mode)."""
208
+ return {
209
+ "text": "[Transcription not available — Whisper model not loaded]",
210
+ "segments": [{"start": 0.0, "end": 5.0, "text": "Sample subtitle", "words": [
211
+ {"word": "Sample", "start": 0.0, "end": 0.5},
212
+ {"word": "subtitle", "start": 0.6, "end": 1.0},
213
+ ]}],
214
+ "language": "en",
215
+ "char_level": False,
216
+ "task": "transcribe",
217
+ }
218
+
219
+
220
async def transcribe_async(
    audio_path: Path,
    clip_language: str = "auto",
    subtitle_language: str = "en",
    model_size: str = "large-v3",
    device: str = "cuda",
) -> dict:
    """Async wrapper for transcribe.

    Runs the blocking ``transcribe`` call in the event loop's default
    executor so the loop stays responsive. The batch size comes from
    ``get_optimal_batch_size("whisper")``. Returns whatever ``transcribe``
    returns.
    """
    # Inside a coroutine the running loop is the one to use;
    # get_event_loop() is deprecated for this purpose.
    loop = asyncio.get_running_loop()
    from src.gpu.rocm_utils import get_optimal_batch_size
    batch_size = get_optimal_batch_size("whisper")
    return await loop.run_in_executor(
        None,
        lambda: transcribe(audio_path, clip_language, subtitle_language, model_size, device, batch_size)
    )
deploy/setup_droplet.sh ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# ElevenClip AI — Full AMD Droplet Setup Script
# Run once after fresh boot: bash /root/setup_droplet.sh
#
# What it does, in order:
#   1. pulls the latest code into /root/ElevenClip-AI
#   2. creates /root/venv if needed and installs backend/requirements.txt
#   3. (re)starts vLLM inside the "rocm" Docker container on port 8000
#   4. (re)starts the FastAPI backend with uvicorn on port 8080
#   5. polls vLLM's /health endpoint until it answers or the wait times out
#
# Optional environment:
#   PUBLIC_HOST  host/IP shown in the final summary (default 129.212.178.101)
#
# Exit status: 0 once vLLM answers the health check; non-zero if any step
# fails or vLLM is still not healthy when the polling window ends.
#
# -u is not enabled because the venv activate script is sourced below and
# may read variables that are unset in a non-interactive shell.
set -eo pipefail

LOG=/tmp/elevnclip_setup.log
# Mirror everything this script prints (stdout and stderr) into $LOG.
exec > >(tee -a "$LOG") 2>&1

readonly REPO_DIR=/root/ElevenClip-AI
readonly WORK_DIR_PATH=/tmp/elevnclip
readonly POLL_INTERVAL=5 # seconds between health checks
readonly MAX_POLLS=180   # 180 * 5s = 15 minutes
readonly PUBLIC_HOST="${PUBLIC_HOST:-129.212.178.101}"

echo "=== ElevenClip AI Droplet Setup $(date) ==="

# ── 1. Update repo ────────────────────────────────────────────────────────────
echo "[1/5] Pulling latest code..."
cd "$REPO_DIR"
git pull origin master

# ── 2. Python venv + pip install ─────────────────────────────────────────────
echo "[2/5] Installing Python dependencies..."
if [[ ! -f /root/venv/bin/activate ]]; then
  python3 -m venv /root/venv
fi
# shellcheck disable=SC1091 — the venv only exists on the droplet
source /root/venv/bin/activate
pip install --upgrade pip -q
pip install -r backend/requirements.txt -q
echo "PACKAGES_DONE"

# ── 3. Start vLLM inside Docker container ────────────────────────────────────
echo "[3/5] Starting vLLM with Qwen2.5-VL-7B-Instruct..."
# Best effort: any error from "docker start" is ignored here; a missing
# container makes the "docker exec" calls below fail instead.
docker start rocm 2>/dev/null || true
sleep 3

# Kill any stale vllm process.
# The pattern is written as "[v]llm serve" so that pkill -f does not match
# (and terminate) the wrapper "bash -c" process, whose own command line
# contains this pattern text. Killing the wrapper would make docker exec
# return non-zero and abort the whole script under set -e.
docker exec rocm bash -c "pkill -f '[v]llm serve' 2>/dev/null || true"
sleep 2

# Start vLLM detached; its output goes to /tmp/vllm.log inside the container.
docker exec -d rocm bash -c '
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
--port 8000 \
--dtype float16 \
--trust-remote-code \
--max-model-len 4096 \
--gpu-memory-utilization 0.7 \
--limit-mm-per-prompt "image=3" \
> /tmp/vllm.log 2>&1
'
echo "vLLM started in background (downloading model, may take 5-15 min)"

# ── 4. Start FastAPI backend on port 8080 ────────────────────────────────────
echo "[4/5] Starting FastAPI backend on :8080..."
# pkill returns non-zero when nothing matched, which is fine on first boot.
pkill -f "uvicorn backend.main" 2>/dev/null || true
sleep 1

# Make sure the directory passed as WORK_DIR exists before the backend starts
# (start_fastapi.sh creates the same directory).
mkdir -p "$WORK_DIR_PATH"

cd "$REPO_DIR"
# NOTE(review): this launches "backend.main:app" from the repo root, while
# deploy/start_fastapi.sh launches "main:app" from backend/. Confirm the
# backend's imports resolve under both layouts.
VLLM_BASE_URL=http://localhost:8000/v1 \
  VLLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct \
  WORK_DIR="$WORK_DIR_PATH" \
  NEXT_PUBLIC_API_URL=http://localhost:8080 \
  nohup /root/venv/bin/uvicorn backend.main:app \
  --host 0.0.0.0 \
  --port 8080 \
  --workers 1 \
  --log-level info \
  > /tmp/fastapi.log 2>&1 &

echo "FastAPI PID: $!"
echo "FASTAPI_STARTED"

# ── 5. Poll vLLM health ───────────────────────────────────────────────────────
echo "[5/5] Waiting for vLLM to load model..."
vllm_ready=0
for ((i = 1; i <= MAX_POLLS; i++)); do
  if curl -sf http://localhost:8000/health > /dev/null 2>&1; then
    echo "vLLM READY after $((i * POLL_INTERVAL))s!"
    echo "VLLM_READY"
    vllm_ready=1
    break
  fi
  # Every 12th attempt (about once a minute) show progress and the log tail.
  if ((i % 12 == 0)); then
    echo "  Still loading... $((i * POLL_INTERVAL))s elapsed"
    # Diagnostic only: a failure here must not abort the setup under set -e.
    docker exec rocm bash -c "tail -3 /tmp/vllm.log 2>/dev/null" || true
  fi
  sleep "$POLL_INTERVAL"
done

# Do not report success when the health check never passed.
if ((vllm_ready == 0)); then
  echo "ERROR: vLLM did not become healthy within $((MAX_POLLS * POLL_INTERVAL))s" >&2
  echo "  Logs: /tmp/fastapi.log | docker exec rocm cat /tmp/vllm.log" >&2
  exit 1
fi

echo ""
echo "=== Setup complete! ==="
echo " FastAPI: http://${PUBLIC_HOST}:8080"
echo " vLLM API: http://${PUBLIC_HOST}:8000/v1"
echo " Logs: /tmp/fastapi.log | docker exec rocm cat /tmp/vllm.log"
deploy/start_fastapi.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Restart the ElevenClip FastAPI backend on port 8080.
#
# Stops any process whose command line contains "uvicorn", then launches
# uvicorn from the backend/ directory in the background with nohup.
# Server output is written to /tmp/fastapi.log; the new PID is printed.

# No running server is a normal situation, so a non-zero pkill is ignored.
pkill -f uvicorn 2>/dev/null || true
sleep 1

# "main:app" is resolved relative to the current directory, so refuse to
# continue if the backend directory is missing instead of starting uvicorn
# from the wrong place.
cd /root/ElevenClip-AI/backend || {
  echo "ERROR: cannot cd to /root/ElevenClip-AI/backend" >&2
  exit 1
}

export VLLM_BASE_URL=http://localhost:8000/v1
export VLLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct
export WORK_DIR=/tmp/elevnclip
# Create the directory from the exported value so the two cannot drift apart.
mkdir -p "$WORK_DIR"

nohup /root/venv/bin/uvicorn main:app --host 0.0.0.0 --port 8080 --workers 1 > /tmp/fastapi.log 2>&1 &
echo "FastAPI PID: $!"