# advanced_video_transcreator_v3.4.py

import gradio as gr
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import google.generativeai as genai
import arabic_reshaper
from bidi.algorithm import get_display  # reorders reshaped RTL text into visual order for PIL's LTR renderer
import os
import time
import ffmpeg
import json
import easyocr
import requests

# --- CONFIGURATION ---
API_KEY = os.getenv("GEMINI_API_KEY", "")  # Set via environment variable; never commit a real key to source control.
ONE_API_KEY = os.getenv("ONE_API_KEY", "")  # Token for the One-API Instagram download service.
PERSIAN_FONT_PATH = "Vazir.ttf"
FADE_IN_DURATION_SECONDS = 1.0

# --- GLOBAL INITIALIZATION ---
reader = None
def initialize_easyocr_reader():
    """Initializes the EasyOCR reader if it hasn't been already."""
    global reader
    if reader is None:
        print("Loading EasyOCR model...")
        reader = easyocr.Reader(['en'], gpu=False, verbose=False)
        print("EasyOCR model loaded successfully!")
    return reader
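
# Illustrative usage (hypothetical variable names): the shared reader can be queried directly, e.g.
#   boxes = initialize_easyocr_reader().readtext(np.array(pil_frame))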

# --- CORE AI AND VIDEO FUNCTIONS ---

def analyze_and_transcreate_with_gemini(video_path: str, english_caption: str, progress: gr.Progress):
    """
    Analyzes a video using the new comprehensive "Transcreation" prompt and extracts the result.
    This single call performs analysis, translation, and caption generation, incorporating the user-provided English caption.
    """
    if not API_KEY:
        raise gr.Error("GEMINI_API_KEY is not set. Export it as an environment variable before launching.")

    try:
        genai.configure(api_key=API_KEY)
        model = genai.GenerativeModel('gemini-2.5-flash')

        progress(0.2, desc="[1/4] Performing deep analysis & transcreation with Gemini...")

        ### MODIFIED PROMPT (Requirements 1, 2, 3: Author Name, Category Definitions, English Caption) ###
        prompt_template = f"""
        Objective: Analyze the provided video (containing text) across all modalities (visuals, audio, existing text) and the user-provided English caption to generate a superior Persian translation and a suitable Instagram caption. The translation must be contextually perfect, stylistically appropriate, and culturally resonant, avoiding the feel of a literal or AI-driven translation. The caption should be concise, engaging, and aligned with the video's mood, content, and the provided English caption, without hashtags.

        User-Provided English Caption: "{english_caption if english_caption else 'No caption provided.'}"

        Instructions:

        1.  **Multi-Modal Analysis**: Perform a deep analysis of the video. Synthesize information from all three channels: visual, audio, and textual. Additionally, incorporate the user-provided English caption to inform the tone, context, and intent of the Instagram caption.
        2.  **Isolate Essential Text**: Use OCR to find all text, but identify only the **core, persistent message** intended for the audience. **You MUST INCLUDE any author, poet, or famous person's name (e.g., '- Rumi') in the essential text if present.** **You MUST IGNORE temporary text such as usernames that flash on screen, watermarks, or English subtitles at the bottom of the frame.** The essential text is typically the main quote or statement that stays on screen.
        3.  **Category Selection**: Choose the most appropriate content category based on the video's text, audio, and visuals. Use the following definitions:
            - **MEME_HUMOR**: Videos with a white text box at the top, often containing phrases like "POV", "Me when...", or similar humorous, casual text, typically with playful or comedic intent.
            - **COLD_MOTIVATIONAL**: Videos with dark themes (visuals or mood) and intense, strong music that evokes motivation or a driven mindset.
            - **WISE_QUOTE**: Videos with peaceful, calm music and literary, poetic grammar, often quoting famous figures.
            - **TWITTER_JOKE**: Videos with casual, friendly, simple text tone, accompanied by funny or lighthearted music.
        4.  **Synthesize and Guide**: Use the visual, audio, and textual analysis, plus the English caption (if provided), to define the exact emotional and stylistic parameters for the translation and Instagram caption.
        5.  **Instagram Caption**: Generate a concise, engaging Instagram caption in Persian that reflects the video's mood, content, cultural context, and the tone of the English caption (if provided). The caption should be standalone (not a direct translation of the text or English caption) and suitable for posting without hashtags.
        6.  **Format Output**: Respond ONLY with a single, raw JSON object as specified below. Do not include any explanatory text before or after the JSON.
        7.  **Author Formatting**: If an author's name is present (e.g., "- Rumi"), format the final translation so the author's name (in Persian) is on its own, separate line at the very end.

        JSON Structure:
        {{
          "asset_id": "video_frame_01",
          "content_category": "CHOOSE ONE: [MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE]",
          "source_language": "en",
          "target_language": "fa",
          "comprehensive_analysis": {{
            "visual_context": {{
              "mood_and_aesthetics": "Describe the emotional mood conveyed by the visuals. (e.g., 'Somber and melancholic, uses slow zooms and a desaturated color palette to evoke a sense of loneliness.')",
              "cinematic_style": "Describe the filming style. (e.g., 'UGC-style phone recording, shaky cam, feels raw and authentic.')",
              "subject_matter": "Briefly describe what is happening visually, independent of the text. (e.g., 'A person is walking alone on a rainy street at night.')"
            }},
            "audio_context": {{
              "music_analysis": "Describe the music's genre, tempo, and emotional impact. (e.g., 'Slow, ambient piano music, creates a feeling of introspection and sadness.')",
              "sfx_analysis": "Describe any relevant sound effects. (e.g., 'The sound of rain and distant city ambiance is prominent, enhancing the feeling of isolation.')"
            }},
            "textual_context": {{
              "full_text_detected": "The complete text from OCR, including ALL parts.",
              "essential_text": "The core message INCLUDING author attribution if present (e.g., 'The wound is the place where the light enters you - Rumi'). THIS IS THE MOST IMPORTANT FIELD. Remember to exclude temporary usernames and subtitles."
            }}
          }},
          "transcreation_directive": {{
            "target_emotional_impact": "Synthesize the analysis above to define the precise emotion the Persian translation should evoke. (e.g., 'The translation should feel like a quiet, personal realization; a mix of sadness and acceptance, not dramatic grief.')",
            "stylistic_guidance": {{
              "formality": "CHOOSE ONE: [FORMAL_LITERARY, MODERN_POETIC, COLLOQUIAL_CASUAL, PROFESSIONAL_INFORMATIVE]",
              "register": "Describe the linguistic 'flavor'. (e.g., 'Use sophisticated but natural vocabulary. Avoid slang but don't be overly academic. It should sound like a thoughtful, well-spoken friend.')"
            }},
            "cultural_adaptation_notes": "Provide guidance on adapting cultural nuances for a Persian audience. (e.g., 'The English concept of 'just being okay with it' can be translated to a more poetic Persian concept of resignation, like «کنار آمدن» or «پذیرفتن».')"
          }},
          "final_output": {{
            "recommended_translation": "ONLY the final, high-quality Persian translation goes here. It should be the direct result of following the transcreation_directive.",
            "translation_rationale": "Briefly explain WHY this translation was chosen, referencing the analysis.",
            "instagram_caption": "A concise, engaging Persian caption for the Instagram post, without hashtags, reflecting the video's mood, content, and the English caption (if provided)."
          }}
        }}
        """

        video_file = genai.upload_file(path=video_path)
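        # Poll until the uploaded file finishes server-side processing; Gemini rejects files still in the PROCESSING state.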
        while video_file.state.name == "PROCESSING":
            time.sleep(2)
            video_file = genai.get_file(video_file.name)

        if video_file.state.name == "FAILED":
            raise gr.Error("Gemini file upload failed.")

        response = model.generate_content([prompt_template, video_file], request_options={"timeout": 180})
        genai.delete_file(video_file.name)

        analysis_json_text = response.text.strip()
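        # Gemini sometimes wraps its JSON answer in a Markdown code fence; strip it before parsing.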
        if analysis_json_text.startswith("```json"):
            analysis_json_text = analysis_json_text[7:-3].strip()

        analysis_data = json.loads(analysis_json_text)

        essential_text = analysis_data.get("comprehensive_analysis", {}).get("textual_context", {}).get("essential_text", "")
        final_translation = analysis_data.get("final_output", {}).get("recommended_translation", "")
        instagram_caption = analysis_data.get("final_output", {}).get("instagram_caption", "")

        if not essential_text or not final_translation or not instagram_caption:
            raise gr.Error("Gemini analysis did not return the essential text, final translation, or Instagram caption.")

        return analysis_data, essential_text, final_translation, instagram_caption

    except json.JSONDecodeError:
        error_message = f"Gemini returned invalid JSON. The response was:\n{response.text.strip()}"
        raise gr.Error(error_message)
    except gr.Error:
        raise  # Let errors raised above propagate unchanged instead of re-wrapping them.
    except Exception as e:
        raise gr.Error(f"An error occurred with the Gemini API: {str(e)}")

def detect_white_header_box(image: Image.Image, progress: gr.Progress):
    """
    Detects if a prominent white header box exists at the top of the video.
    Returns the bounding box of this header if found, otherwise returns None.
    """
    progress(0.35, desc="[2/4] Checking for white header box...")
    img_array = np.array(image.convert('L'))  # Convert to grayscale
    frame_width, frame_height = image.size

    # Analyze the top 25% of the image
    scan_height = int(frame_height * 0.25)
    top_section = img_array[0:scan_height, :]

    # Threshold the image to find very light areas (potential white box)
    _, thresh = cv2.threshold(top_section, 230, 255, cv2.THRESH_BINARY)
    
    # Find contours
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Check if the contour is a large, wide rectangle typical of a header
        if w > frame_width * 0.8 and h > frame_height * 0.05:
            print(f"Detected potential white header box of size {w}x{h}.")
            # Give it a little padding
            padding_x = int(frame_width * 0.02)
            padding_y = int(frame_height * 0.02)
            final_bbox = (
                max(0, x - padding_x), max(0, y - padding_y),
                min(frame_width, x + w + padding_x), min(frame_height, y + h + padding_y)
            )
            print(f"Using white header as final bounding box: {final_bbox}")
            return final_bbox
            
    print("No dominant white header box found. Proceeding with standard text detection.")
    return None

def get_bbox_for_essential_text(image: Image.Image, essential_text: str, progress: gr.Progress):
    """
    Uses EasyOCR to find the precise bounding box for the essential text identified by Gemini.
    """
    progress(0.4, desc="[2/4] Locating text with EasyOCR...")
    ocr_reader = initialize_easyocr_reader()
    img_array = np.array(image)
    results = ocr_reader.readtext(img_array)
    if not results: raise gr.Error("EasyOCR could not detect any text on the frame.")

    def normalize_words(text: str) -> set:
        # Compare whole words (lowercased, punctuation stripped) rather than individual
        # characters, so unrelated text that merely shares letters does not match.
        return {''.join(ch for ch in word.lower() if ch.isalnum()) for word in text.split()} - {''}

    essential_words = normalize_words(essential_text)
    min_x, min_y = float('inf'), float('inf')
    max_x, max_y = float('-inf'), float('-inf')
    found_match = False

    print(f"Gemini's essential text: '{essential_text}'")
    print("EasyOCR Results:")
    for (bbox, text, prob) in results:
        print(f"- Detected: '{text}'")
        text_words = normalize_words(text)
        if len(essential_words.intersection(text_words)) > 0:
            found_match = True
            (tl, tr, br, bl) = bbox
            min_x = min(min_x, tl[0], bl[0])
            min_y = min(min_y, tl[1], tr[1])
            max_x = max(max_x, tr[0], br[0])
            max_y = max(max_y, bl[1], br[1])
            print(f"  ^-- Matched! Updating consolidated bbox.")

    if not found_match: raise gr.Error(f"EasyOCR ran but could not locate the essential text '{essential_text}' on the video frame.")

    original_height = max_y - min_y
    height_reduction = original_height * 0.10
    min_y += height_reduction / 2
    max_y -= height_reduction / 2
    print(f"Bbox height adjusted: Reduced by {height_reduction:.2f} pixels for a tighter fit.")

    frame_width, frame_height = image.size
    padding_x = int(frame_width * 0.02)
    padding_y = int(frame_height * 0.02)
    final_bbox = (
        max(0, int(min_x) - padding_x), max(0, int(min_y) - padding_y),
        min(frame_width, int(max_x) + padding_x), min(frame_height, int(max_y) + padding_y)
    )
    print(f"Final consolidated bbox (x1, y1, x2, y2): {final_bbox}")
    return final_bbox

def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> tuple[Image.Image, tuple]:
    """
    Creates an overlay with adaptive color and robust, auto-fitting wrapped Persian text.
    """
    overlay_width = bbox[2] - bbox[0]
    overlay_height = bbox[3] - bbox[1]

    try:
        sample_x = max(0, int(bbox[0]) - 5)
        sample_y = int((bbox[1] + bbox[3]) / 2)
        bg_color = original_image.getpixel((sample_x, sample_y))
    except (ValueError, IndexError): bg_color = (25, 25, 25)

    overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
    draw = ImageDraw.Draw(overlay_layer)

    luminance = (0.299 * bg_color[0] + 0.587 * bg_color[1] + 0.114 * bg_color[2])
    if luminance > 128:
        text_color, shadow_color = (0, 0, 0, 255), (200, 200, 200, 100)
        print("Light background detected. Using BLACK text.")
    else:
        text_color, shadow_color = (255, 255, 255, 255), (0, 0, 0, 180)
        print("Dark background detected. Using WHITE text.")

    if not os.path.exists(PERSIAN_FONT_PATH):
        raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it.")

    target_width = overlay_width * 0.90
    target_height = overlay_height * 0.90
    font_size = 100
    final_wrapped_lines = []
    raw_lines = text_to_overlay.split('\n')

    while font_size > 10:
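        # Accept the current size only if the widest reshaped line and the total block height both fit.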
        font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
        max_line_width = 0
        # reshape() joins Persian letters into contextual forms; get_display() applies the
        # Unicode bidi algorithm so PIL (a left-to-right renderer) draws them in the right order.
        reshaped_lines_for_calc = [get_display(arabic_reshaper.reshape(l)) for l in raw_lines]
        for line in reshaped_lines_for_calc:
            max_line_width = max(max_line_width, font.getlength(line))
        line_heights = [font.getbbox(l)[3] for l in reshaped_lines_for_calc if l]
        total_height = sum(line_heights) + (len(raw_lines) - 1) * (font_size * 0.3)
        if total_height <= target_height and max_line_width <= target_width:
            final_wrapped_lines = raw_lines
            break
        else:
            font_size -= 2

    if not final_wrapped_lines:
        font_size = 10
        final_wrapped_lines = raw_lines
        print("Warning: Text was too long to fit perfectly. Using minimum font size.")

    final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
    print(f"Final font size: {font_size}px")
    final_reshaped_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
    line_heights_render = [final_font.getbbox(l)[3] for l in final_reshaped_lines]
    total_text_height = sum(line_heights_render) + (len(final_reshaped_lines) - 1) * (font_size * 0.3)
    y_start = (overlay_height - total_text_height) / 2
    current_y = y_start
    for i, reshaped_line in enumerate(final_reshaped_lines):
        line_width = final_font.getlength(reshaped_line)
        x_position = (overlay_width - line_width) / 2
        draw.text((x_position + 1, current_y + 1), reshaped_line, font=final_font, fill=shadow_color)
        draw.text((x_position, current_y), reshaped_line, font=final_font, fill=text_color)
        current_y += line_heights_render[i] + (font_size * 0.3)
    return overlay_layer, bbox

# --- MAIN VIDEO PROCESSING PIPELINE ---
def process_video(video_path, english_caption, progress=gr.Progress()):
    if video_path is None: raise gr.Error("Please upload or download a video file first.")

    progress(0, desc="Starting process...")
    analysis_data, essential_text, translated_text, instagram_caption = analyze_and_transcreate_with_gemini(video_path, english_caption, progress)

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened(): raise gr.Error("Could not open video file.")
    frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps, total_frames = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
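    # Sample the middle frame: the persistent on-screen text is most likely fully visible there.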
    cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
    ret, middle_frame_bgr = cap.read()
    if not ret: raise gr.Error("Could not read middle frame.")
    middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

    # Prioritize white header box detection
    bbox = detect_white_header_box(middle_frame_rgb_pil, progress)
    if bbox is None:
        # Fallback to the original EasyOCR method if no header is found
        bbox = get_bbox_for_essential_text(middle_frame_rgb_pil, essential_text, progress)

    progress(0.5, desc="[3/4] Rendering translated text overlay...")
    overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
    overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

    timestamp = int(time.time())
    temp_silent_path = f"temp_silent_{timestamp}.mp4"
    final_output_path = f"translated_video_{timestamp}.mp4"

    progress(0.6, desc="[4/4] Composing video with overlay...")
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    frame_idx = 0
    x_min, y_min, x_max, y_max = overlay_position_box
    while True:
        ret, frame = cap.read()
        if not ret: break
        roi = frame[y_min:y_max, x_min:x_max]
        if roi.shape[:2] != (overlay_stamp_cv.shape[0], overlay_stamp_cv.shape[1]):
            h, w = roi.shape[:2]
            resized_overlay = cv2.resize(overlay_stamp_cv, (w, h))
        else: resized_overlay = overlay_stamp_cv
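        # Per-pixel alpha composite: out = overlay * alpha + frame * (1 - alpha).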
        alpha = resized_overlay[:, :, 3] / 255.0
        alpha_mask = cv2.merge([alpha, alpha, alpha])
        blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + resized_overlay[:, :, :3].astype(float) * alpha_mask)
        frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
        out.write(frame)
        frame_idx += 1
        progress(0.6 + (0.35 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
    cap.release(); out.release()

    progress(0.95, desc="Merging Audio and Applying Fade...")
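    # Re-encode the silent video with a fade-in filter, then mux the original audio back in (stream-copied).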
    try:
        input_video = ffmpeg.input(temp_silent_path)
        input_audio = ffmpeg.input(video_path).audio
        (ffmpeg.output(
            input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
            input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
        ).run(overwrite_output=True, quiet=True))

    except ffmpeg.Error as e:
        print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
        print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8', errors='ignore')}")
    finally:
        if os.path.exists(temp_silent_path): os.remove(temp_silent_path)

    progress(1, desc="Done!")
    return final_output_path, analysis_data, instagram_caption

# --- INSTAGRAM DOWNLOADER FUNCTION ---
def download_instagram_video(ig_url: str, progress: gr.Progress = None):
    """Fetch video from Instagram post using One-API and save it locally."""
    if not ig_url:
        raise gr.Error("Please provide an Instagram URL.")
    if not ONE_API_KEY:
        raise gr.Error("ONE_API_KEY is not set for Instagram downloads.")

    if progress is not None:
        progress(0, desc="Downloading from Instagram...")
    try:
        # Extract the shortcode robustly: drop any query string and trailing slash first,
        # e.g. https://www.instagram.com/p/C1a2b3Y4deF/?igsh=... -> C1a2b3Y4deF
        shortcode = ig_url.split("?")[0].rstrip("/").split("/")[-1]
        url_one = "https://api.one-api.ir/instagram/v1/post/?shortcode=" + shortcode
        headers = {
            "accept": "application/json",
            "one-api-token": ONE_API_KEY,
            "Content-Type": "application/json"
        }
        response = requests.get(url_one, headers=headers, timeout=30)
        response.raise_for_status()
        
        result = response.json().get("result", {})
        media_list = result.get('media', [])
        
        if not media_list:
            raise ValueError("No media found in the API response.")

        # Find the first video URL in the media list
        video_url = None
        for media_item in media_list:
            if media_item.get("type") == "video":
                video_url = media_item.get("url")
                break
        
        if not video_url:
            raise ValueError("API response did not contain a direct video URL.")

        if progress is not None:
            progress(0.5, desc="Found video link. Downloading content...")
        video_response = requests.get(video_url, stream=True, timeout=60)
        video_response.raise_for_status()

        # Save the video to a temporary file
        timestamp = int(time.time())
        local_filename = f"ig_download_{timestamp}.mp4"
        with open(local_filename, 'wb') as f:
            for chunk in video_response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        print(f"Instagram video successfully downloaded to {local_filename}")
        if progress is not None:
            progress(1, desc="Download complete!")
        return local_filename

    except requests.exceptions.RequestException as e:
        raise gr.Error(f"Network error while downloading from Instagram: {str(e)}")
    except (ValueError, KeyError) as e:
        print(f"API parsing error: {response.text}")
        raise gr.Error(f"Could not process the Instagram API response: {str(e)}")
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred during Instagram download: {str(e)}")

# --- GRADIO INTERFACE (Updated) ---
with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Video Transcreator") as demo:
    gr.Markdown("# 🎬 Advanced Video Transcreator v3.4")
    gr.Markdown(
        "**This version uses a powerful multi-modal prompt for superior, context-aware 'Transcreation'.**\n\n"
        "Upload a short video with English text, or provide an Instagram URL and an optional English caption. Clicking 'Download from URL' will download and automatically process the video. The app will analyze the video's mood, style, and caption to generate a perfectly integrated Persian translation and an Instagram caption. Author names (e.g., '- Rumi') are included in the translation and overlaid on a separate line."
    )
    
    with gr.Row():
        with gr.Column(scale=2):
            video_input = gr.Video(label="Upload Video or Use URL Below")
            with gr.Row():
                ig_url_input = gr.Textbox(label="Instagram Post URL", placeholder="e.g., https://www.instagram.com/p/C1a2b3Y4deF/")
                english_caption_input = gr.Textbox(label="English Caption (Optional)", placeholder="e.g., A moment of reflection with Rumi's wisdom")
                download_button = gr.Button("Download from URL")
        with gr.Column(scale=3):
            video_output = gr.Video(label="Translated Video Output")
            caption_output = gr.Textbox(label="Instagram Caption (No Hashtags)", lines=3, interactive=False)
            json_output = gr.JSON(label="Gemini Transcreation Analysis")
            
    translate_button = gr.Button("Analyze and Transcreate Video", variant="primary")

    # Define the logic flow
    def chain_download_and_process(ig_url, english_caption):
        """Chains Instagram download with video processing."""
        video_path = download_instagram_video(ig_url)
        return process_video(video_path, english_caption)

    download_button.click(
        fn=chain_download_and_process,
        inputs=[ig_url_input, english_caption_input],
        outputs=[video_output, json_output, caption_output]
    )
    
    translate_button.click(
        fn=process_video,
        inputs=[video_input, english_caption_input],
        outputs=[video_output, json_output, caption_output]
    )
    
    gr.Markdown("---")
    gr.Markdown(
        "### How it works:\n"
        "1. **Gemini Transcreation:** The video and optional English caption are sent to Gemini for a deep, multi-modal analysis. Gemini is specifically instructed to **include author names** (e.g., '- Rumi') in the essential text, **ignore temporary text** (like usernames or subtitles), and generate a Persian Instagram caption based on the video and caption input.\n"
        "2. **Category Classification:** The app selects a category (MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE) based on text, audio, and visuals, using clear definitions for accurate translation.\n"
        "3. **Smart BBox Detection:** The app first checks for a **prominent white header box**. If found, it uses that for a clean overlay. If not, it falls back to `EasyOCR` to find the *exact pixel location* of the essential text Gemini identified.\n"
        "4. **Render & Composite:** The Persian text, including author names on a separate line, is rendered with **adaptive color** inside the detected bounding box, with a font size that's **guaranteed to fit**, and placed precisely over the original.\n"
        "5. **Finalize with Fade-In:** The original audio is merged back into the new video, and a **1-second fade-in** is applied using `ffmpeg`.\n"
        "6. **Instagram Caption:** A concise, culturally appropriate caption is generated, incorporating the English caption (if provided), and displayed for use with the translated video."
    )

if __name__ == "__main__":
    if not os.path.exists(PERSIAN_FONT_PATH):
        print(f"WARNING: Font file '{PERSIAN_FONT_PATH}' not found. The app will likely fail. Please ensure it's in the same directory.")
    demo.launch(debug=True)