File size: 14,912 Bytes
91d209c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
"""
Whisper-based Video Trimming and Frame Extraction
Uses OpenAI's Whisper to detect last spoken word and find optimal transition frames
"""

import re
from typing import List, Tuple, Optional
import tempfile
import os

# Backend detection, run once at import time.
#   WHISPER_AVAILABLE  - True when some way of running Whisper was found.
#   USE_SYSTEM_WHISPER - True when Whisper must be run through the external
#                        interpreter at SYSTEM_PYTHON instead of in-process.
try:
    import whisper
    from moviepy.editor import VideoFileClip
    WHISPER_AVAILABLE = True
    USE_SYSTEM_WHISPER = False
except ImportError:
    # whisper or moviepy could not be imported in this interpreter.
    # Try to use system Python's Whisper
    import subprocess
    import sys
    SYSTEM_PYTHON = "/opt/anaconda3/bin/python"

    # Pessimistic defaults; only a successful probe below flips them.
    WHISPER_AVAILABLE = False
    USE_SYSTEM_WHISPER = False

    if os.path.exists(SYSTEM_PYTHON):
        try:
            # Test if system Python has whisper (exit status 0 == import worked)
            result = subprocess.run(
                [SYSTEM_PYTHON, "-c", "import whisper; print('OK')"],
                capture_output=True,
                timeout=5
            )
        except Exception:
            # Best-effort probe: a timeout or a launch failure just means
            # "not available". Unlike a bare ``except:``, this lets
            # KeyboardInterrupt / SystemExit propagate.
            WHISPER_AVAILABLE = False
        else:
            WHISPER_AVAILABLE = result.returncode == 0
        USE_SYSTEM_WHISPER = WHISPER_AVAILABLE

    if not WHISPER_AVAILABLE:
        print("⚠️  Whisper not available. Install with: pip install openai-whisper moviepy")


def normalize_text(text: str) -> str:
    """Return ``text`` lowercased, without punctuation or outer whitespace.

    Every character that is neither a regex word character nor whitespace is
    deleted. Whitespace is stripped *after* that deletion, so punctuation that
    is separated from a word by a space (``"done ."``) does not leave a
    trailing blank behind.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
    return without_punctuation.strip()


def transcribe_video(
    video_path: str,
    model_size: str = "base"
) -> Tuple[str, Optional[float]]:
    """
    Run Whisper on a video's audio track and locate where speech ends.

    Delegates to the system-Python implementation when the module is
    configured to use it; otherwise loads the model in-process.

    Args:
        video_path: Path to video file
        model_size: Whisper model size (tiny, base, small, medium, large)

    Returns:
        Tuple of (full_transcription, last_word_end_time); the end time is
        None when no segment carries word-level timing.

    Raises:
        ImportError: If no Whisper backend is available.
    """
    if not WHISPER_AVAILABLE:
        raise ImportError("Whisper not installed. Run: pip install openai-whisper moviepy")

    # Hand off to the external interpreter when Whisper is not importable here
    if USE_SYSTEM_WHISPER:
        return _transcribe_video_system(video_path, model_size)

    print(f"🎀 Loading Whisper model ({model_size})...")
    asr_model = whisper.load_model(model_size)

    print("🎀 Transcribing audio...")
    outcome = asr_model.transcribe(video_path, word_timestamps=True)

    transcript = outcome.get("text", "").strip()

    # Walk the segments from the end; the first one that has word timings
    # supplies the end time of the final spoken word.
    final_word_end = None
    for segment in reversed(outcome.get("segments", []) or []):
        timed_words = segment.get("words", [])
        if not timed_words:
            continue
        final_word_end = timed_words[-1].get("end")
        break

    # Long transcripts are previewed with their first 100 characters only
    if len(transcript) > 100:
        print(f"πŸ“ Transcribed: \"{transcript[:100]}...\"")
    else:
        print(f"πŸ“ Transcribed: \"{transcript}\"")

    if final_word_end:
        print(f"βœ… Last word ends at {final_word_end:.2f} seconds")

    return transcript, final_word_end


def _transcribe_video_system(
    video_path: str,
    model_size: str = "base"
) -> Tuple[str, Optional[float]]:
    """Transcribe video using system Python's Whisper.

    A small helper script is written to a temporary file and executed with
    the interpreter at ``/opt/anaconda3/bin/python``. The video path and the
    model size are passed as command-line arguments (not embedded into the
    generated source), and the helper reports its result as one JSON line on
    stdout.

    Args:
        video_path: Path to video file
        model_size: Whisper model size (tiny, base, small, medium, large)

    Returns:
        Tuple of (full_transcription, last_word_end_time); the end time is
        None when the helper found no word-level timing.

    Raises:
        Exception: If the helper times out (5 minutes), exits non-zero,
            prints output that is not valid JSON, or cannot be launched.
            The triggering exception is attached as ``__cause__``.
    """
    import subprocess
    import json

    SYSTEM_PYTHON = "/opt/anaconda3/bin/python"

    print(f"🎀 Using system Whisper (model: {model_size})...")

    # Constant helper source: inputs arrive through sys.argv, so no quoting or
    # escaping of the path inside Python source is ever needed.
    helper_source = '''
import json
import sys

import whisper

video_path, model_size = sys.argv[1], sys.argv[2]

model = whisper.load_model(model_size)
result = model.transcribe(video_path, word_timestamps=True)

# Extract transcription and last word timestamp
full_text = result.get("text", "").strip()
last_time = None
for seg in reversed(result.get("segments", [])):
    words = seg.get("words", [])
    if words:
        last_time = words[-1].get("end")
        break

print(json.dumps({"text": full_text, "last_time": last_time}))
'''

    script_path = None
    try:
        # The context manager closes the handle even if the write fails;
        # delete=False keeps the file on disk for the child process.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.py', delete=False, encoding='utf-8'
        ) as script_file:
            script_path = script_file.name
            script_file.write(helper_source)

        result = subprocess.run(
            [SYSTEM_PYTHON, script_path, video_path, model_size],
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout
        )

        if result.returncode != 0:
            raise Exception(f"Whisper transcription failed: {result.stderr}")

        # The helper prints its JSON last; parse only the final non-empty
        # line so that anything written to stdout before it is ignored.
        stdout_lines = [line for line in result.stdout.splitlines() if line.strip()]
        payload = stdout_lines[-1] if stdout_lines else ""
        output = json.loads(payload)
        full_text = output.get("text", "").strip()
        last_time = output.get("last_time")

        if len(full_text) > 100:
            print(f"πŸ“ Transcribed: \"{full_text[:100]}...\"")
        else:
            print(f"πŸ“ Transcribed: \"{full_text}\"")
        # Compare against None so a legitimate 0.0 timestamp is still reported
        if last_time is not None:
            print(f"βœ… Last word ends at {last_time:.2f} seconds")

        return full_text, last_time

    except subprocess.TimeoutExpired as e:
        raise Exception("Whisper transcription timed out") from e
    except json.JSONDecodeError as e:
        raise Exception(f"Failed to parse Whisper output: {str(e)}") from e
    except Exception as e:
        # Also wraps the non-zero-exit error raised above, which keeps the
        # "System Whisper error: Whisper transcription failed: ..." message.
        raise Exception(f"System Whisper error: {str(e)}") from e
    finally:
        # Clean up script file; a file that is already gone is not an error
        if script_path is not None:
            try:
                os.remove(script_path)
            except OSError:
                pass


def find_last_word_timestamp(
    video_path: str,
    script: str,
    model_size: str = "base"
) -> Optional[float]:
    """
    Find the timestamp of the last spoken word in the script

    The video is transcribed with word-level timestamps and the end time of
    the final occurrence of the script's last word is returned. If that word
    was not transcribed, up to four preceding script words are tried, nearest
    to the end first, again using each word's final occurrence.

    Args:
        video_path: Path to video file
        script: Expected script/dialogue
        model_size: Whisper model size (tiny, base, small, medium, large)

    Returns:
        Timestamp (seconds) of last word, or None if not found

    Raises:
        ImportError: If no Whisper backend is available.
        ValueError: If the script contains no words, or no speech segments
            were detected (in-process backend only).
    """
    if not WHISPER_AVAILABLE:
        raise ImportError("Whisper not installed. Run: pip install openai-whisper moviepy")

    # Use system Python if needed
    if USE_SYSTEM_WHISPER:
        return _find_last_word_timestamp_system(video_path, script, model_size)

    # Validate the script first: no point loading a model for an empty script
    script_words = normalize_text(script).split()
    if not script_words:
        raise ValueError("Script is empty")

    print(f"🎀 Loading Whisper model ({model_size})...")
    model = whisper.load_model(model_size)

    print("🎀 Transcribing audio...")
    result = model.transcribe(video_path, word_timestamps=True)

    segments = result.get("segments", [])
    if not segments:
        raise ValueError("No speech segments detected in video")

    # One pass over the transcript: later occurrences overwrite earlier ones,
    # so each normalized word maps to the end time of its LAST occurrence.
    last_end_by_word = {}
    for seg in segments:
        for word_info in seg.get("words", []):
            end_time = word_info.get("end")
            if end_time is None:
                continue  # no timing for this word; it cannot anchor a trim
            last_end_by_word[normalize_text(word_info.get("word", ""))] = end_time

    last_word = script_words[-1]
    print(f"πŸ” Looking for last word: '{last_word}'")
    last_time = last_end_by_word.get(last_word)

    if last_time is None:
        # Fallback: try to find any word from the end of script
        print(f"⚠️  Last word '{last_word}' not found, trying other words...")
        # script_words[-5:-1] are the (up to) four words before the last one;
        # reversed() tries the one closest to the end of the script first.
        for word_to_find in reversed(script_words[-5:-1]):
            last_time = last_end_by_word.get(word_to_find)
            if last_time is not None:
                print(f"βœ… Found '{word_to_find}' at {last_time:.2f}s instead")
                break

    # ``is not None`` rather than truthiness: 0.0 is a valid timestamp
    if last_time is not None:
        print(f"βœ… Last spoken word ends at {last_time:.2f} seconds")

    return last_time


def extract_post_speech_frames(
    video_path: str,
    script: str,
    buffer_time: float = 0.3,
    num_frames: int = 3,
    model_size: str = "base"
) -> List[Tuple[float, str, str]]:
    """
    Extract frames from the post-speech zone (after last spoken word)

    Args:
        video_path: Path to video file
        script: Expected script/dialogue
        buffer_time: Time after last word to start extracting (seconds)
        num_frames: Number of frames to extract; values below 1 yield an
            empty list
        model_size: Whisper model size

    Returns:
        List of (timestamp, frame_data, label) tuples. ``frame_data`` is
        whatever ``extract_frame(..., return_base64=True)`` returns
        (presumably a base64 data URL - confirm in utils.video_processor);
        ``label`` is "Right After Speech", "Final Frame" or "Frame N".

    Raises:
        ImportError: If no Whisper backend is available.
        ValueError: If the last spoken word cannot be located, or the video
            duration cannot be determined.
    """
    if not WHISPER_AVAILABLE:
        raise ImportError("Whisper not installed. Run: pip install openai-whisper moviepy")

    # Find last word timestamp
    last_word_time = find_last_word_timestamp(video_path, script, model_size)

    if last_word_time is None:
        raise ValueError("Could not find last spoken word in video")

    # Get video duration
    duration = _probe_video_duration(video_path)

    # Calculate post-speech zone. The start is kept at least 0.5s before the
    # end of the video and never below 0 (very short clips).
    post_speech_start = max(min(last_word_time + buffer_time, duration - 0.5), 0.0)
    post_speech_end = duration

    print(f"πŸ“ Post-speech zone: {post_speech_start:.2f}s to {post_speech_end:.2f}s")

    # Latest timestamp actually sampled: the same 0.1s end margin the
    # single-frame case uses, because a seek to exactly `duration` is usually
    # past the last frame. NOTE(review): confirm against extract_frame.
    last_safe_time = max(post_speech_end - 0.1, post_speech_start)

    # Calculate frame timestamps
    if num_frames < 1:
        timestamps = []
    elif num_frames == 1 or last_safe_time - post_speech_start < 0.1:
        # One frame requested, or very little time: just use the end
        timestamps = [last_safe_time]
    else:
        # Distribute frames evenly in post-speech zone
        step = (last_safe_time - post_speech_start) / (num_frames - 1)
        timestamps = [
            min(post_speech_start + i * step, last_safe_time)  # guard float drift
            for i in range(num_frames)
        ]

    # Extract frames
    from utils.video_processor import extract_frame

    frames = []
    final_index = len(timestamps) - 1
    for i, timestamp in enumerate(timestamps):
        print(f"πŸ“Έ Extracting frame at {timestamp:.2f}s...")
        frame_data = extract_frame(video_path, timestamp, return_base64=True)

        # Create label based on position
        if i == 0 and len(timestamps) > 1:
            label = "Right After Speech"
        elif i == final_index:
            label = "Final Frame"
        else:
            label = f"Frame {i+1}"

        frames.append((timestamp, frame_data, label))

    return frames


def _probe_video_duration(video_path: str) -> float:
    """Return the duration of ``video_path`` in seconds.

    Uses moviepy when it can be imported; otherwise asks ``ffprobe``. The
    fallback matters when this module runs Whisper through the system Python,
    because in that configuration moviepy was not imported at module load.
    """
    try:
        from moviepy.editor import VideoFileClip
    except ImportError:
        VideoFileClip = None

    if VideoFileClip is not None:
        clip = VideoFileClip(video_path)
        try:
            return clip.duration
        finally:
            # Always release the reader, even if reading the duration fails
            clip.close()

    import subprocess

    probe = subprocess.run(
        [
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            video_path,
        ],
        capture_output=True,
        text=True,
    )
    try:
        return float(probe.stdout.strip())
    except ValueError:
        raise ValueError(
            f"Could not determine duration of {video_path}: {probe.stderr.strip()}"
        ) from None


def trim_video_to_last_word(
    video_path: str,
    script: str,
    output_path: str,
    padding: float = 0.5,
    model_size: str = "base"
) -> str:
    """
    Cut a video so that it stops shortly after the script's final word.

    The cut is attempted as a stream copy first; if FFmpeg reports a failure
    the video is re-encoded (libx264 / aac) instead.

    Args:
        video_path: Input video path
        script: Expected script/dialogue
        output_path: Output video path (overwritten if it exists)
        padding: Time to keep after last word (seconds)
        model_size: Whisper model size

    Returns:
        Path to trimmed video (the same value as ``output_path``)

    Raises:
        ImportError: If no Whisper backend is available.
        ValueError: If the last spoken word cannot be located.
        Exception: If both FFmpeg attempts exit with a non-zero status.
    """
    if not WHISPER_AVAILABLE:
        raise ImportError("Whisper not installed. Run: pip install openai-whisper moviepy")

    speech_end = find_last_word_timestamp(video_path, script, model_size)
    if speech_end is None:
        raise ValueError("Could not find last spoken word in video")

    # Keep `padding` seconds of footage after the final word
    cut_point = speech_end + padding

    print(f"βœ‚οΈ  Trimming video to {cut_point:.2f} seconds...")

    # FFmpeg does the cutting (more reliable than moviepy, especially with system Python)
    import subprocess

    # Both attempts share the input/duration prefix and the overwrite/output suffix
    head_args = ["ffmpeg", "-i", video_path, "-t", str(cut_point)]
    tail_args = ["-y", output_path]

    # Attempt 1: copy the streams as-is (fast, no re-encoding)
    copy_cmd = head_args + ["-c", "copy", "-avoid_negative_ts", "make_zero"] + tail_args
    completed = subprocess.run(copy_cmd, capture_output=True, text=True)

    if completed.returncode != 0:
        # Attempt 2: re-encode video and audio
        print("⚠️  Copy codec failed, re-encoding...")
        encode_cmd = head_args + ["-c:v", "libx264", "-c:a", "aac"] + tail_args
        completed = subprocess.run(encode_cmd, capture_output=True, text=True)

        if completed.returncode != 0:
            raise Exception(f"FFmpeg trimming failed: {completed.stderr}")

    print(f"βœ… Trimmed video saved to: {output_path}")

    return output_path


def _find_last_word_timestamp_system(
    video_path: str,
    script: str,
    model_size: str = "base"
) -> Optional[float]:
    """Find last word timestamp using system Python.

    Runs the Whisper command-line interface under
    ``/opt/anaconda3/bin/python`` with JSON output and word timestamps, then
    looks up the end time of the final occurrence of the script's last word.
    If that word was not transcribed, up to four preceding script words are
    tried, nearest to the end of the script first.

    Args:
        video_path: Path to video file
        script: Expected script/dialogue
        model_size: Whisper model size (tiny, base, small, medium, large)

    Returns:
        Timestamp (seconds) of the matched word's end, or None when the
        script has no words or none of the candidate words was transcribed.

    Raises:
        Exception: If the Whisper process exits non-zero or its JSON output
            file is missing.
        subprocess.TimeoutExpired: If transcription exceeds 5 minutes.
    """
    import subprocess
    import json
    import tempfile

    SYSTEM_PYTHON = "/opt/anaconda3/bin/python"

    # Nothing to search for: skip the expensive transcription entirely
    script_words = normalize_text(script).split()
    if not script_words:
        return None

    print(f"🎀 Using system Whisper (model: {model_size})...")

    # A private directory keeps the "<video name>.json" output from colliding
    # with other runs or unrelated files, and is removed on every exit path.
    with tempfile.TemporaryDirectory() as output_dir:
        # Run whisper via system Python
        cmd = [
            SYSTEM_PYTHON, "-m", "whisper",
            video_path,
            "--model", model_size,
            "--output_format", "json",
            "--output_dir", output_dir,
            "--word_timestamps", "True"
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode != 0:
            raise Exception(f"Whisper transcription failed: {result.stderr}")

        # Find JSON file: the code expects it to be named after the input video
        base_name = os.path.splitext(os.path.basename(video_path))[0]
        json_path = os.path.join(output_dir, f"{base_name}.json")

        if not os.path.exists(json_path):
            raise Exception(f"JSON output not found: {json_path}")

        # JSON is UTF-8; do not depend on the platform's default encoding
        with open(json_path, 'r', encoding='utf-8') as f:
            transcription_data = json.load(f)

    # Map each normalized word to the end time of its LAST occurrence
    last_end_by_word = {}
    for seg in transcription_data.get("segments", []):
        for word_info in seg.get("words", []):
            end_time = word_info.get("end")
            if end_time is None:
                continue  # a word without timing must not become a 0s trim point
            last_end_by_word[normalize_text(word_info.get("word", ""))] = end_time

    last_word = script_words[-1]
    last_time = last_end_by_word.get(last_word)

    if last_time is None:
        # Fallback: the (up to) four words before the last one, nearest first
        for candidate in reversed(script_words[-5:-1]):
            last_time = last_end_by_word.get(candidate)
            if last_time is not None:
                print(f"⚠️  Last word '{last_word}' not found, using '{candidate}' instead")
                break

    return last_time

def is_whisper_available() -> bool:
    """Report whether a Whisper backend was detected when this module loaded."""
    backend_found = WHISPER_AVAILABLE
    return backend_found