File size: 17,790 Bytes
b79a002
 
bc7d51f
b79a002
 
 
bc7d51f
 
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c6d5dc
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55c9b3c
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55c9b3c
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2133b69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b79a002
2133b69
b79a002
 
 
2133b69
b79a002
2133b69
 
 
 
b79a002
2133b69
 
 
 
 
 
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7d51f
 
b79a002
 
 
 
 
 
bc7d51f
 
 
 
 
 
 
 
 
 
 
 
 
 
b79a002
 
 
 
 
 
 
 
 
bc7d51f
b79a002
 
bc7d51f
 
b79a002
 
bc7d51f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b79a002
 
 
bc7d51f
b79a002
 
 
 
 
 
 
 
 
bc7d51f
b79a002
 
 
 
bc7d51f
b79a002
 
 
 
 
 
 
bc7d51f
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7d51f
 
 
 
 
 
 
 
 
 
 
 
 
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7d51f
 
 
 
 
 
 
b79a002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7d51f
b79a002
63a4b52
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
#!/usr/bin/env python3
import os
import sys
import json
import re
import asyncio
import threading
import time
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from typing import List, Dict

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
import uvicorn

try:
    from huggingface_hub import list_repo_files, hf_hub_download, upload_file
    from openai import OpenAI
except ImportError as e:
    print(f"Missing dependency: {e}")
    exit(1)

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
DASHSCOPE_ENDPOINT = os.getenv("DASHSCOPE_ENDPOINT", "https://ws-wd3lvtd5jarz79e5.ap-southeast-1.maas.aliyuncs.com/compatible-mode/v1")
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "qwen3.6-plus")

# Refuse to start without credentials. An empty string counts as missing, so
# DASHSCOPE_ENDPOINT is checked even though it has a default.
if not HF_TOKEN or not DASHSCOPE_ENDPOINT or not DASHSCOPE_API_KEY:
    print("Error: Missing HF_TOKEN, DASHSCOPE_ENDPOINT, or DASHSCOPE_API_KEY in .env")
    # sys.exit() instead of the bare exit() helper: exit() is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist.
    sys.exit(1)

# Where transcripts are read from and where highlight JSON is written to
HF_DATASET_REPO = "factorstudios/movs"
TRANSCRIPTION_FOLDER = "transcriptions"
HIGHLIGHTS_FOLDER = "hooks"

# Progress record mutated by the scan worker and read by the HTTP endpoints
processing_state = dict(
    is_running=False,
    total_processed=0,
    current_file=None,
    error_count=0,
    last_error=None,
    processed_files=[],
)

app = FastAPI(title="Movie Highlight Extraction Service")

def parse_segment_timestamp(time_str: str) -> str:
    """Normalise an ``H:M:S`` timestamp to zero-padded ``HH:MM:SS``.

    Surrounding whitespace is ignored. A value that is not three
    colon-separated integers with hours >= 0 and minutes/seconds in 0-59
    (including non-string input) is reported on stdout and replaced by
    ``"00:00:00"``.
    """
    try:
        time_str = time_str.strip()
        fields = time_str.split(":")
        if len(fields) != 3:
            raise ValueError(f"Invalid format: {time_str}")
        hours, minutes, seconds = (int(field) for field in fields)
        within_range = hours >= 0 and 0 <= minutes <= 59 and 0 <= seconds <= 59
        if not within_range:
            raise ValueError(f"Invalid time values: {time_str}")
    except Exception as e:
        print(f"Error parsing timestamp '{time_str}': {e}")
        return "00:00:00"
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

def extract_segments_from_response(response_text: str) -> List[Dict]:
    """Turn an LLM reply into at most 10 highlight-segment dicts.

    Two strategies are tried in order:

    1. Locate a JSON array of objects inside the reply and map each object to
       a segment (missing fields get defaults).
    2. If that yields nothing, split the reply on "Segment N" / "Video N" /
       "Scene N" headings and pull an ``HH:MM:SS - HH:MM:SS`` range out of
       each chunk.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A list of up to 10 dicts with the keys ``segment_number``, ``title``,
        ``start_time``, ``end_time``, ``description``, ``engagement_level``
        and ``reason``. Empty when neither strategy finds a segment.
    """
    segments = _segments_from_json(response_text)
    if not segments:
        segments = _segments_from_text(response_text)
    return segments[:10]  # Return max 10 segments


# Heading that introduces one segment in a free-text reply, e.g. "Segment 3: ".
_SEGMENT_HEADING_RE = re.compile(r'(?:Segment|Video|Scene)\s+\d+[:\s]+')
# "HH:MM:SS - HH:MM:SS" range (hyphen or en dash between the two stamps).
_TIME_RANGE_RE = re.compile(r'(\d{1,2}):(\d{2}):(\d{2})\s*[-–]\s*(\d{1,2}):(\d{2}):(\d{2})')
# Title = everything up to the first period or newline of a chunk.
_TITLE_RE = re.compile(r'([^.\n]+)')


def _segment_number(value, fallback: int) -> int:
    """Return *value* as an int, or *fallback* when it is not integer-like."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return fallback


def _segments_from_json(response_text: str) -> List[Dict]:
    """Build segments from the last JSON array of objects found in the reply."""
    decoder = json.JSONDecoder()
    chosen_items: List[Dict] = []

    # Try to decode a JSON value at every '[' instead of regex-matching from
    # the first '[' to the last ']': bracketed prose such as "[note]" or a
    # second array elsewhere in the reply would otherwise make the whole span
    # unparseable.
    pos = response_text.find("[")
    while pos != -1:
        try:
            parsed, end = decoder.raw_decode(response_text, pos)
        except json.JSONDecodeError:
            pos = response_text.find("[", pos + 1)
            continue
        if isinstance(parsed, list):
            items = [item for item in parsed if isinstance(item, dict)]
            if items:
                # Keep overwriting so the last usable array wins.
                chosen_items = items
        # Skip past the decoded value so its nested arrays are not rescanned.
        pos = response_text.find("[", end)

    segments = []
    for position, item in enumerate(chosen_items[:10], 1):
        segments.append({
            # Coerced to int: callers format this with ":02d", which rejects
            # strings and floats.
            "segment_number": _segment_number(item.get("segment_number"), position),
            "title": item.get("title", f"Segment {position}"),
            "start_time": parse_segment_timestamp(item.get("start_time", "00:00:00")),
            "end_time": parse_segment_timestamp(item.get("end_time", "00:10:00")),  # 10 minutes
            "description": item.get("description", ""),
            "engagement_level": item.get("engagement_level", "high"),
            "reason": item.get("reason", "")
        })
    return segments


def _segments_from_text(response_text: str) -> List[Dict]:
    """Build segments from "Segment N:" / "Video N:" / "Scene N:" sections."""
    # Element 0 is whatever precedes the first heading, so it is dropped.
    chunks = _SEGMENT_HEADING_RE.split(response_text)[1:]

    segments = []
    for idx, chunk in enumerate(chunks[:10], 1):
        time_match = _TIME_RANGE_RE.search(chunk)
        if time_match:
            start_h, start_m, start_s, end_h, end_m, end_s = time_match.groups()
            start_time = f"{int(start_h):02d}:{start_m}:{start_s}"
            end_time = f"{int(end_h):02d}:{end_m}:{end_s}"
        else:
            start_time = "00:00:00"
            end_time = "00:10:00"  # 10 minutes default

        body = chunk.strip()
        title_match = _TITLE_RE.match(body)
        title = title_match.group(1)[:100] if title_match else f"Segment {idx}"

        segments.append({
            "segment_number": idx,
            "title": title,
            "start_time": start_time,
            "end_time": end_time,
            "description": body[:500],
            "engagement_level": "high",
            "reason": "Engaging scene"
        })
    return segments

async def process_transcription_for_highlights(
    repo_id: str,
    transcript_filename: str,
    transcript_content: str
) -> bool:
    """Ask the LLM for highlight segments of one transcript and upload them.

    The movie name is the transcript filename without its ".transcript.txt" /
    ".txt" suffix. Each extracted segment is uploaded to the dataset repo as
    ``hooks/<movie_name>/segment-NN.json``.

    Args:
        repo_id: Hugging Face dataset repo that receives the segment files.
        transcript_filename: Base name of the transcript file.
        transcript_content: Full transcript text, embedded in the prompt.

    Returns:
        True when at least one segment was extracted and every upload
        succeeded; False otherwise. Exceptions are caught, not raised.

    Side effects:
        Updates ``processing_state``: ``current_file`` at start;
        ``processed_files`` / ``total_processed`` on success;
        ``error_count`` / ``last_error`` on any failure.
    """
    # Bound before the try block so the error handler below can always name
    # the file, even if deriving the movie name itself fails.
    movie_name = transcript_filename
    try:
        # Extract movie name from filename
        movie_name = transcript_filename.replace(".transcript.txt", "").replace(".txt", "")
        processing_state["current_file"] = movie_name
        
        print(f"\n{'='*80}")
        print(f"Processing: {movie_name}")
        print(f"{'='*80}")
        
        # Create LLM client
        client = OpenAI(
            api_key=DASHSCOPE_API_KEY,
            base_url=DASHSCOPE_ENDPOINT
        )
        
        # Create structured prompt for segment extraction
        system_prompt = """You are an expert film editor and marketing strategist who identifies KEY MOMENTS from movies that make viewers WANT to watch the entire film.

Your task: Analyze the full movie transcript and identify exactly 10 HIGH-IMPACT moments that would serve as compelling hooks to attract audiences. These should be:
- Plot turning points / major story beats
- Action sequences / high tension moments
- Emotional climaxes / character breakthroughs
- Shocking reveals / plot twists
- Character introductions / pivotal interactions
- Climactic confrontations or resolutions

CRITICAL REQUIREMENTS:
- You MUST respond with ONLY a valid JSON array. NO text before or after.
- DO NOT arrange segments by time order - arrange them by ENGAGEMENT and IMPACT level
- Segments can start ANYWHERE in the movie (5 min, 40 min, 90 min - doesn't matter)
- Each segment should be 8-15 MINUTES LONG to capture the complete moment with context
- Include exact timestamps from the transcript
- Prioritize QUALITY over chronology - pick genuinely attention-grabbing moments
- Segment 1 = MOST IMPACTFUL, Segment 10 = still engaging but slightly less

Each segment must include:
- segment_number: (1-10, ranked by engagement level, NOT by time)
- title: (compelling, hooks viewers immediately)
- start_time: (HH:MM:SS - exact start from transcript, can be anywhere)
- end_time: (HH:MM:SS - exact end from transcript, captures full moment)
- description: (2-3 sentences explaining why this moment is essential)
- engagement_level: (high/medium - high for critical moments)
- reason: (one line hook - what makes viewers want MORE)

JSON format:
[
    {"segment_number": 1, "title": "...", "start_time": "HH:MM:SS", "end_time": "HH:MM:SS", "description": "...", "engagement_level": "high", "reason": "..."}
]
"""
        
        user_message = f"""Analyze this COMPLETE movie transcript and extract exactly 10 KEY MOMENTS that would make someone want to watch the full film.

IMPORTANT: Do NOT arrange segments by when they appear in the movie. Arrange them by IMPACT and ENGAGEMENT.
- A key scene at 40 minutes into the movie can be segment 1 if it's the most engaging
- A scene at 10 minutes can be segment 8 if it's less impactful
- Order by viewer attention level, NOT by timeline

Find the 10 best moments regardless of where they fall in the runtime.

FULL TRANSCRIPT:
{transcript_content}

Return ONLY the JSON array with exactly 10 segments ranked by engagement level."""

        print("Sending transcript to LLM for highlight extraction...")
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            temperature=0.7,
            max_tokens=4000
        )
        
        # Guard against a reply with no text content so it is reported as
        # "no segments" rather than an AttributeError on None.
        response_text = (response.choices[0].message.content or "").strip()
        print(f"LLM Response length: {len(response_text)} characters")
        
        # Extract segments from response
        segments = extract_segments_from_response(response_text)
        
        if not segments:
            print(f"Warning: No segments extracted from LLM response")
            # Record the failure; the caller ignores the return value, so
            # otherwise it would never show up in the status counters.
            processing_state["error_count"] += 1
            processing_state["last_error"] = f"No segments extracted for {movie_name}"
            return False
        
        print(f"Extracted {len(segments)} segments")
        
        # Prepare upload directory structure: hooks/movie-name/
        movie_highlights_folder = f"{HIGHLIGHTS_FOLDER}/{movie_name}"
        
        # Upload each segment as a JSON file
        for position, segment in enumerate(segments, 1):
            # The ":02d" format below rejects non-int values; fall back to the
            # segment's position if the model returned e.g. "1" or 1.0.
            number = segment["segment_number"]
            if not isinstance(number, int):
                number = position
            segment_filename = f"segment-{number:02d}.json"
            segment_path = f"{movie_highlights_folder}/{segment_filename}"
            
            print(f"Uploading {segment_path}...")
            # upload_file accepts raw bytes, so no temporary file is needed.
            upload_file(
                path_or_fileobj=json.dumps(segment, indent=2).encode("utf-8"),
                path_in_repo=segment_path,
                repo_id=repo_id,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"Add highlight segment {number} for {movie_name}"
            )
            print(f"✓ Uploaded {segment_path}")
        
        processing_state["processed_files"].append(movie_name)
        processing_state["total_processed"] += 1
        print(f"✓ Successfully processed {movie_name} ({len(segments)} segments)")
        return True
        
    except Exception as e:
        processing_state["error_count"] += 1
        processing_state["last_error"] = str(e)
        print(f"✗ Error processing {movie_name}: {e}")
        return False

async def scan_and_process_highlights():
    """Scan the dataset repo and extract highlights for unprocessed transcripts.

    Lists every file in ``HF_DATASET_REPO``, treats each sub-folder of
    ``hooks/`` as an already-processed movie, and runs
    ``process_transcription_for_highlights`` on every ``.txt`` file under
    ``transcriptions/`` whose movie name has no such folder.

    Returns immediately if ``processing_state["is_running"]`` is already set.
    Otherwise the flag is held for the duration of the scan and always cleared
    on exit. Failures are recorded in ``processing_state`` and printed; no
    exception propagates to the caller.
    """
    if processing_state["is_running"]:
        print("Highlight processing already running, skipping...")
        return
    
    processing_state["is_running"] = True
    print("\n" + "="*80)
    print("STARTING HIGHLIGHT EXTRACTION SERVICE")
    print("="*80)
    
    try:
        # List all files in repository
        print(f"Connecting to {HF_DATASET_REPO}...")
        files = list_repo_files(
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
        
        # Get existing hook folders
        print("Scanning for existing hook folders...")
        hooks_prefix = f"{HIGHLIGHTS_FOLDER}/"
        existing_hooks = set()
        for f in files:
            if f.startswith(hooks_prefix):
                parts = f.split("/")
                # Require hooks/<movie>/<file>: a loose file directly under
                # hooks/ is not a movie folder and must not mark one as done.
                if len(parts) >= 3:
                    existing_hooks.add(parts[1])
        
        print(f"✓ Found {len(existing_hooks)} movie folders in hooks/: {existing_hooks}")
        
        # Get all transcription files
        transcript_files = [
            f for f in files 
            if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
        ]
        
        print(f"Found {len(transcript_files)} transcription files")
        
        if not transcript_files:
            print("No transcription files found to process")
            return  # is_running is cleared by the finally block
        
        # Filter transcriptions to only those not yet processed
        unprocessed_transcripts = []
        for transcript_file in transcript_files:
            just_filename = os.path.basename(transcript_file)
            movie_name = just_filename.replace(".transcript.txt", "").replace(".txt", "")
            
            # Skip if hooks already exist for this movie
            if movie_name in existing_hooks:
                print(f"  ⊘ {movie_name} (already has hooks)")
                continue
            
            unprocessed_transcripts.append({
                "path": transcript_file,
                "filename": just_filename,
                "movie_name": movie_name
            })
        
        print(f"\n✓ Found {len(unprocessed_transcripts)} unprocessed movies")
        
        if not unprocessed_transcripts:
            print("✓ All transcriptions already have hooks!")
            return  # is_running is cleared by the finally block
        
        # Process each unprocessed transcription
        for transcript_info in unprocessed_transcripts:
            try:
                print(f"\nDownloading: {transcript_info['path']}")
                # Download transcript
                local_path = hf_hub_download(
                    repo_id=HF_DATASET_REPO,
                    filename=transcript_info["path"],
                    repo_type="dataset",
                    token=HF_TOKEN,
                    cache_dir="/tmp/highlight_transcripts"
                )
                
                # Read transcript content
                with open(local_path, 'r', encoding='utf-8') as f:
                    transcript_content = f.read()
                
                print(f"✓ Downloaded: {transcript_info['filename']}")
                
                # Process for highlights
                await process_transcription_for_highlights(
                    HF_DATASET_REPO,
                    transcript_info["filename"],
                    transcript_content
                )
                
                # Small delay between requests to avoid rate limiting
                await asyncio.sleep(2)
                
            except Exception as e:
                # One bad transcript must not stop the rest of the batch.
                print(f"Error processing {transcript_info['path']}: {e}")
                processing_state["error_count"] += 1
                processing_state["last_error"] = str(e)
                continue
        
        print("\n" + "="*80)
        print("HIGHLIGHT EXTRACTION COMPLETE")
        print(f"Processed: {processing_state['total_processed']}")
        print(f"Errors: {processing_state['error_count']}")
        print("="*80 + "\n")
        
    except Exception as e:
        print(f"Critical error in scan_and_process: {e}")
        processing_state["last_error"] = str(e)
    finally:
        processing_state["is_running"] = False

@app.on_event("startup")
async def startup_event():
    """Kick off one highlight-extraction scan when the server starts.

    The scan is handed to a daemon thread with its own event loop
    (``asyncio.run``), so this handler returns as soon as the thread has
    been started.
    """
    print("\n" + "="*80)
    print("STARTUP EVENT TRIGGERED - Highlight Extraction Service")
    print("="*80)
    
    # Schedule scan in a background thread (more reliable for deployment)
    def run_scan():
        print("Starting highlight extraction scan...")
        asyncio.run(scan_and_process_highlights())
    
    # daemon=True: an in-flight scan does not keep the process alive at exit
    scan_thread = threading.Thread(target=run_scan, daemon=True)
    scan_thread.start()
    print("✓ Background scan thread scheduled")

@app.get("/")
async def health():
    """Liveness probe that also reports the current scan progress."""
    progress = {
        key: processing_state[key]
        for key in (
            "total_processed",
            "error_count",
            "current_file",
            "last_error",
            "processed_files",
        )
    }
    payload = {
        "status": "running",
        "service": "Movie Highlight Extraction Service",
        "is_processing": processing_state["is_running"],
        **progress,
    }
    return JSONResponse(payload)

@app.post("/trigger-extraction")
async def trigger_extraction():
    """Start a highlight-extraction scan on demand unless one is in progress."""
    if not processing_state["is_running"]:
        # Same approach as the startup hook: daemon thread + private event loop
        worker = threading.Thread(
            target=lambda: asyncio.run(scan_and_process_highlights()),
            daemon=True,
        )
        worker.start()
        return JSONResponse({
            "status": "started",
            "message": "Highlight extraction scan started"
        })

    return JSONResponse({
        "status": "already_running",
        "message": "Highlight extraction is already in progress"
    })

@app.get("/status")
async def get_status():
    """Return the scan worker's running flag, counters and latest error."""
    reported_keys = (
        "is_running",
        "total_processed",
        "error_count",
        "current_file",
        "last_error",
        "processed_files",
    )
    return JSONResponse({key: processing_state[key] for key in reported_keys})

if __name__ == "__main__":
    # Script entry point: announce, then serve on all interfaces, port 7860
    for banner_line in (
        "Starting Movie Highlight Extraction Service on port 7860...",
        "Will automatically scan and process transcriptions on startup",
    ):
        print(banner_line)
    uvicorn.run(app, port=7860, host="0.0.0.0")