| """ |
| Caption Renderer V4 - Gradio App |
| Converts JSON transcripts to WebM video with styled captions |
| """ |
|
|
| import os |
| import json |
| import uuid |
| import tempfile |
| from typing import List, Dict, Optional |
| import gradio as gr |
| import cloudinary |
| import cloudinary.uploader |
|
|
| from canvas_renderer import render_frame, WIDTH, HEIGHT |
| from video_encoder import encode_frames_pipe |
|
|
| |
# Cloudinary target for uploads. Each value is read from the environment and
# falls back to the built-in default when the variable is not set.
CLOUD_NAME = os.getenv("CLOUDINARY_CLOUD_NAME", "dgfhhszx8")
UPLOAD_PRESET = os.getenv("CLOUDINARY_UPLOAD_PRESET", "testing")

# Frame rate shared by frame generation and video encoding.
FPS = 24
|
|
|
|
def parse_transcript(transcript_json: str) -> List[Dict]:
    """Parse the transcript JSON input into a list of entry dicts.

    Two layouts are accepted: a bare JSON array of entries, or a JSON object
    whose ``"fullTranscript"`` key holds that array.

    Args:
        transcript_json: JSON text describing the transcript.

    Returns:
        The decoded list of entries (possibly empty).

    Raises:
        ValueError: If the input is not text, is not valid JSON, is not in
            one of the accepted layouts, or contains an entry that is not a
            JSON object.
    """
    # json.loads raises TypeError (not ValueError) for None and other
    # non-text input, which callers catching ValueError would miss.
    if not isinstance(transcript_json, (str, bytes, bytearray)):
        raise ValueError("Transcript must be provided as JSON text")

    try:
        data = json.loads(transcript_json)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON: {e}") from e

    if isinstance(data, dict) and "fullTranscript" in data:
        data = data["fullTranscript"]

    # Covers both an unrecognised top-level value and a "fullTranscript"
    # key that holds something other than an array (e.g. a string or null).
    if not isinstance(data, list):
        raise ValueError("Invalid transcript format")

    for position, entry in enumerate(data):
        if not isinstance(entry, dict):
            raise ValueError(
                f"Invalid transcript format: entry {position} is not an object"
            )

    return data
|
|
|
|
def group_words_into_phrases(transcript: List[Dict], max_words: int = 4, max_duration: float = 2.0) -> List[Dict]:
    """
    Bundle consecutive transcript words into caption phrases.

    Words are appended to the current phrase in order. After each word is
    added, the phrase is closed once it holds at least ``max_words`` words or
    spans at least ``max_duration`` (last word end minus first word start).
    Any words left over at the end form a final phrase.

    Each entry must provide ``'text'``, ``'start'`` and ``'end'``; the two
    timings are converted with ``float()``.

    Returns a list of dicts shaped like:
    {
        'words': ['word1', 'word2', ...],
        'timings': [{'start': 0.0, 'end': 0.5}, ...],
        'start': phrase_start,
        'end': phrase_end
    }
    """
    grouped: List[Dict] = []
    if not transcript:
        return grouped

    texts: list = []
    spans: list = []

    for entry in transcript:
        text = entry['text']
        begin = float(entry['start'])
        finish = float(entry['end'])

        texts.append(text)
        spans.append({'start': begin, 'end': finish})

        # The phrase starts where its first word starts.
        phrase_start = spans[0]['start']

        if len(texts) >= max_words or finish - phrase_start >= max_duration:
            grouped.append({
                'words': texts,
                'timings': spans,
                'start': phrase_start,
                'end': finish,
            })
            texts, spans = [], []

    # Flush a trailing, not-yet-full phrase.
    if texts:
        grouped.append({
            'words': texts,
            'timings': spans,
            'start': spans[0]['start'],
            'end': spans[-1]['end'],
        })

    return grouped
|
|
|
|
def _phrase_for_time(phrases: List[Dict], current_time: float) -> tuple:
    """Choose the phrase to display at ``current_time``.

    Returns ``(phrase, is_active)``. ``is_active`` is True when
    ``current_time`` lies inside the phrase's ``start``..``end`` window
    (first match in list order). Otherwise the phrase that started most
    recently before ``current_time`` is returned with ``is_active`` False,
    or ``(None, False)`` when no phrase has started yet.
    """
    latest_started = None
    for phrase in phrases:
        if phrase['start'] <= current_time <= phrase['end']:
            return phrase, True
        if phrase['start'] <= current_time and (
            latest_started is None or phrase['start'] >= latest_started['start']
        ):
            latest_started = phrase
    return latest_started, False


def _active_word_index(timings: List[Dict], current_time: float) -> int:
    """Return the index of the word whose window contains ``current_time``, or -1."""
    for index, timing in enumerate(timings):
        if timing['start'] <= current_time <= timing['end']:
            return index
    return -1


def generate_video(transcript_json: str, style: str, progress=gr.Progress()) -> tuple:
    """
    Main video generation function.

    Parses the transcript, groups its words into phrases, renders one frame
    per 1/FPS seconds (covering the last phrase end plus 0.5 s), encodes the
    frames to a WebM file in a fresh temporary directory and then attempts an
    unsigned Cloudinary upload.

    Args:
        transcript_json: JSON string with transcript data
        style: Caption style (hormozi, cinematic, netflix, neon)
        progress: Gradio progress tracker

    Returns:
        Tuple of (video_path, cloudinary_url). When the upload fails the
        second element is an "Upload failed: ..." message instead of a URL,
        so the locally encoded video is still returned.

    Raises:
        gr.Error: If the transcript cannot be parsed, is empty or malformed,
            has unusable timings, or if video encoding fails.
    """
    import shutil

    progress(0, desc="Parsing transcript...")

    try:
        transcript = parse_transcript(transcript_json)
    except ValueError as e:
        raise gr.Error(f"Failed to parse transcript: {e}") from e

    if not transcript:
        raise gr.Error("Empty transcript provided")

    progress(0.1, desc="Grouping words into phrases...")

    # Entries with missing keys or non-numeric timings would otherwise
    # surface as a raw KeyError/TypeError/ValueError.
    try:
        phrases = group_words_into_phrases(transcript)
    except (KeyError, TypeError, ValueError) as e:
        raise gr.Error(f"Malformed transcript entry: {e!r}") from e

    if not phrases:
        raise gr.Error("No phrases generated from transcript")

    # Keep the final caption on screen for half a second after it ends.
    total_duration = max(p['end'] for p in phrases) + 0.5
    try:
        total_frames = int(total_duration * FPS)
    except (ValueError, OverflowError) as e:
        # int() rejects NaN and infinity, which json.loads can produce.
        raise gr.Error("Transcript timings must be finite numbers") from e
    if total_frames <= 0:
        raise gr.Error("Transcript timings produce an empty video")

    progress(0.2, desc=f"Generating {total_frames} frames...")

    # All frames are collected before encoding because encode_frames_pipe is
    # called with the complete list.
    frames = []
    progress_step = max(1, total_frames // 10)
    for frame_idx in range(total_frames):
        current_time = frame_idx / FPS

        phrase, is_active = _phrase_for_time(phrases, current_time)

        if phrase is None:
            # Nothing has been spoken yet: plain green frame.
            # NOTE(review): assumes render_frame returns a compatible PIL
            # image of the same size - confirm against canvas_renderer.
            from PIL import Image
            frame = Image.new('RGB', (WIDTH, HEIGHT), (0, 255, 0))
        else:
            # In a gap between phrases (or after the last one) hold the most
            # recent phrase and pass -1 as the active word index.
            word_idx = (
                _active_word_index(phrase['timings'], current_time)
                if is_active
                else -1
            )
            frame = render_frame(phrase['words'], word_idx, style)

        frames.append(frame)

        # Report progress roughly ten times over the rendering stage.
        if frame_idx % progress_step == 0:
            pct = 0.2 + (frame_idx / total_frames) * 0.5
            progress(pct, desc=f"Rendering frame {frame_idx}/{total_frames}...")

    progress(0.7, desc="Encoding video...")

    output_dir = tempfile.mkdtemp(prefix="caption_video_")
    output_path = os.path.join(output_dir, f"caption_{uuid.uuid4().hex[:8]}.webm")

    try:
        encode_frames_pipe(frames, output_path, fps=FPS)
    except RuntimeError as e:
        # Do not leave the temporary directory behind on failure.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise gr.Error(f"Video encoding failed: {e}") from e

    if not os.path.exists(output_path) or os.path.getsize(output_path) < 1000:
        shutil.rmtree(output_dir, ignore_errors=True)
        raise gr.Error("Video encoding produced empty or invalid file")

    progress(0.85, desc="Uploading to Cloudinary...")

    # Upload is best-effort: a failure is reported in the URL field rather
    # than discarding the video that was already encoded.
    try:
        result = cloudinary.uploader.unsigned_upload(
            output_path,
            UPLOAD_PRESET,
            cloud_name=CLOUD_NAME,
            resource_type="video"
        )
        cloudinary_url = result.get("secure_url", "")
    except Exception as e:
        cloudinary_url = f"Upload failed: {e}"

    progress(1.0, desc="Done!")

    return output_path, cloudinary_url
|
|
|
|
| |
# Example transcript pre-filled in the UI: (text, start, end) per word,
# serialised as a JSON array of objects with 2-space indentation.
SAMPLE_TRANSCRIPT = json.dumps(
    [
        {"text": text, "start": start, "end": end}
        for text, start, end in (
            ("WATCH", 0.0, 0.5),
            ("THIS", 0.5, 1.0),
            ("AMAZING", 1.0, 1.6),
            ("VIDEO", 1.6, 2.2),
        )
    ],
    indent=2,
)
|
|
| |
# Gradio UI definition. Components are created in this order inside the
# Blocks/Row/Column context managers.
with gr.Blocks(title="Caption Renderer V4", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 Caption Renderer V4")
    gr.Markdown("Convert JSON transcripts to WebM videos with animated captions (green screen)")

    with gr.Row():
        # First column: transcript text, style choice and the trigger button.
        with gr.Column():
            transcript_input = gr.Textbox(
                label="Transcript JSON",
                placeholder='[{"text": "HELLO", "start": 0.0, "end": 0.5}, ...]',
                lines=12,
                value=SAMPLE_TRANSCRIPT
            )

            style_dropdown = gr.Dropdown(
                choices=["hormozi", "cinematic", "netflix", "neon"],
                value="hormozi",
                label="Caption Style"
            )

            generate_btn = gr.Button("🎥 Generate Video", variant="primary")

        # Second column: the generated video and the read-only upload URL.
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
            cloudinary_url = gr.Textbox(label="Cloudinary URL", interactive=False)

    # generate_video returns (video_path, cloudinary_url), matching the two
    # output components in order.
    generate_btn.click(
        fn=generate_video,
        inputs=[transcript_input, style_dropdown],
        outputs=[video_output, cloudinary_url]
    )

    # Static help text describing the available caption styles.
    gr.Markdown("---")
    gr.Markdown("### Supported Styles")
    gr.Markdown("""
- **Hormozi**: Gold highlighted word, white inactive, pop animation
- **Cinematic**: Premium white/gray with cyan glow
- **Netflix**: Netflix red active, white inactive
- **Neon**: Magenta/Cyan neon glow effect
""")
|
|
# Start the Gradio server only when this file is run as a script:
# listens on all interfaces at port 7860, without a public share link.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
|
|