factorstudios committed on
Commit
b79a002
·
verified ·
1 Parent(s): 0557312

Create server.py

Browse files
Files changed (1) hide show
  1. server.py +369 -0
server.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import json
4
+ import re
5
+ import asyncio
6
+ from pathlib import Path
7
+ from datetime import datetime
8
+ from dotenv import load_dotenv
9
+ from typing import List, Dict
10
+
11
+ from fastapi import FastAPI, HTTPException
12
+ from fastapi.responses import JSONResponse
13
+ import uvicorn
14
+
15
+ try:
16
+ from huggingface_hub import list_repo_files, hf_hub_download, upload_file
17
+ from openai import OpenAI
18
+ except ImportError as e:
19
+ print(f"Missing dependency: {e}")
20
+ exit(1)
21
+
22
# Load configuration from the process environment (plus a local .env file).
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
DASHSCOPE_ENDPOINT = os.getenv("DASHSCOPE_ENDPOINT")
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "qwen3.6-plus")

# Fail fast at import time: the Hub token and the LLM endpoint/key are
# required by the processing functions in this module.
if not HF_TOKEN or not DASHSCOPE_ENDPOINT or not DASHSCOPE_API_KEY:
    print("Error: Missing HF_TOKEN, DASHSCOPE_ENDPOINT, or DASHSCOPE_API_KEY in .env")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the `site` module for interactive sessions and is not
    # available when the interpreter runs without it (e.g. `python -S`).
    raise SystemExit(1)

app = FastAPI(title="Movie Highlight Extraction Service")

# Global state for processing. Mutated by the background scan and reported
# verbatim by the HTTP endpoints.
processing_state = {
    "is_running": False,      # True while a scan is in progress
    "total_processed": 0,     # transcripts processed successfully
    "current_file": None,     # movie name currently being processed
    "error_count": 0,         # failures recorded so far
    "last_error": None,       # text of the most recent error
    "processed_files": [],    # movie names processed successfully
}

HF_DATASET_REPO = "factorstudios/movs"
TRANSCRIPTION_FOLDER = "transcriptions"
HIGHLIGHTS_FOLDER = "hooks"
48
+
49
def parse_segment_timestamp(time_str: str) -> str:
    """Normalise a timestamp to zero-padded ``HH:MM:SS``.

    Accepts ``H:MM:SS`` / ``HH:MM:SS``. Because timestamps come from free-form
    LLM output, two common variants are tolerated as well instead of being
    collapsed to the fallback value:

    * ``MM:SS`` - hours are assumed to be zero;
    * a fractional-seconds suffix such as ``00:15:32.500`` or the
      subtitle-style ``00:15:32,500`` - the fraction is dropped.

    Args:
        time_str: Timestamp text. Surrounding whitespace is ignored; any
            non-string value is treated as invalid.

    Returns:
        The normalised timestamp, or ``"00:00:00"`` when the value cannot be
        parsed or has negative/out-of-range fields (the reason is printed).
    """
    try:
        if not isinstance(time_str, str):
            raise ValueError(f"Expected a string, got {type(time_str).__name__}")

        time_str = time_str.strip()
        parts = time_str.split(":")
        if len(parts) == 2:
            # MM:SS -> 0:MM:SS
            parts.insert(0, "0")
        if len(parts) != 3:
            raise ValueError(f"Invalid format: {time_str}")

        # Keep only the whole seconds ("32.500" / "32,500" -> "32").
        parts[2] = parts[2].replace(",", ".").partition(".")[0]

        h, m, s = (int(part) for part in parts)
        if h < 0 or not 0 <= m <= 59 or not 0 <= s <= 59:
            raise ValueError(f"Invalid time values: {time_str}")
        return f"{h:02d}:{m:02d}:{s:02d}"
    except ValueError as e:
        # Best effort by design: a bad timestamp must not abort the whole
        # segment, so report it and fall back to the start of the movie.
        print(f"Error parsing timestamp '{time_str}': {e}")
        return "00:00:00"
64
+
65
def _find_segment_array(response_text: str) -> List[Dict]:
    """Return the dict items of the first JSON array in the text that has any."""
    decoder = json.JSONDecoder()
    cursor = response_text.find("[")
    while cursor != -1:
        try:
            # raw_decode parses one JSON value starting at `cursor` and ignores
            # whatever follows it, so prose or code fences around the array
            # (and stray brackets such as "[note]") do not break parsing.
            candidate, _ = decoder.raw_decode(response_text, cursor)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, list):
            items = [entry for entry in candidate if isinstance(entry, dict)]
            if items:
                return items
        cursor = response_text.find("[", cursor + 1)
    return []


def _coerce_segment_number(value, fallback: int) -> int:
    """Convert a model-supplied segment number to int, else use *fallback*."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return fallback


def extract_segments_from_response(response_text: str) -> List[Dict]:
    """Parse an LLM reply into at most 10 movie segment dicts.

    Two strategies are tried in order:

    1. Locate a JSON array of objects anywhere in the reply and map each
       object onto the segment schema, filling defaults for missing keys.
    2. If that yields nothing, split the text on ``Segment N`` / ``Video N`` /
       ``Scene N`` headings and pull a ``HH:MM:SS - HH:MM:SS`` range, a title
       (text up to the first ``.`` or newline) and a description out of each
       chunk.

    Args:
        response_text: Raw text returned by the model. ``None`` or blank text
            yields an empty list.

    Returns:
        Up to 10 dicts with the keys ``segment_number`` (int), ``title``,
        ``start_time``, ``end_time``, ``description``, ``engagement_level``
        and ``reason``. Empty when nothing could be extracted.
    """
    if not isinstance(response_text, str) or not response_text.strip():
        return []

    segments = []

    # Strategy 1: structured JSON output.
    for item in _find_segment_array(response_text)[:10]:
        position = len(segments) + 1
        segments.append({
            # Forced to int: the uploader formats this value with ":02d".
            "segment_number": _coerce_segment_number(item.get("segment_number"), position),
            "title": item.get("title", f"Segment {position}"),
            "start_time": parse_segment_timestamp(item.get("start_time", "00:00:00")),
            "end_time": parse_segment_timestamp(item.get("end_time", "00:01:00")),
            "description": item.get("description", ""),
            "engagement_level": item.get("engagement_level", "high"),
            "reason": item.get("reason", ""),
        })

    # Strategy 2: free-text fallback when no usable JSON was found.
    if not segments:
        heading_pattern = r'(?:Segment|Video|Scene)\s+\d+[:\s]+'
        time_range = re.compile(
            r'(\d{1,2}):(\d{2}):(\d{2})\s*[-–]\s*(\d{1,2}):(\d{2}):(\d{2})'
        )
        # Element 0 is whatever precedes the first heading, so it is skipped.
        chunks = re.split(heading_pattern, response_text)[1:]

        for idx, chunk in enumerate(chunks[:10], 1):
            body = chunk.strip()

            found = time_range.search(chunk)
            if found:
                sh, sm, ss, eh, em, es = found.groups()
                start_time = f"{int(sh):02d}:{sm}:{ss}"
                end_time = f"{int(eh):02d}:{em}:{es}"
            else:
                start_time = "00:00:00"
                end_time = "00:01:00"

            # First sentence (up to "." or newline) doubles as the title.
            title_match = re.match(r'([^.\n]+)', body)
            title = title_match.group(1)[:100] if title_match else f"Segment {idx}"

            segments.append({
                "segment_number": idx,
                "title": title,
                "start_time": start_time,
                "end_time": end_time,
                "description": body[:500],
                "engagement_level": "high",
                "reason": "Engaging scene",
            })

    return segments[:10]  # Return max 10 segments
129
+
130
async def process_transcription_for_highlights(
    repo_id: str,
    transcript_filename: str,
    transcript_content: str
) -> bool:
    """Extract highlight segments for one transcript and upload them.

    Sends the transcript to the configured chat model, parses the reply with
    ``extract_segments_from_response`` and uploads every segment as
    ``hooks/<movie>/segment-NN.json`` (one commit per segment) to the dataset
    repo. Progress and failures are recorded in ``processing_state``.

    Args:
        repo_id: Dataset repository that receives the segment files.
        transcript_filename: Base name of the transcript; the movie name is
            this value with ``.transcript.txt`` / ``.txt`` removed.
        transcript_content: Full transcript text.

    Returns:
        True when segments were extracted and all uploads succeeded; False
        when no segments were found or any step raised (the exception is
        caught, counted and stored as ``last_error``).
    """
    # Resolved before the try block so the error handler can always name it.
    movie_name = transcript_filename.replace(".transcript.txt", "").replace(".txt", "")
    processing_state["current_file"] = movie_name

    try:
        print(f"\n{'='*80}")
        print(f"Processing: {movie_name}")
        print(f"{'='*80}")

        # Create LLM client
        client = OpenAI(
            api_key=DASHSCOPE_API_KEY,
            base_url=DASHSCOPE_ENDPOINT
        )

        # Create structured prompt for segment extraction
        system_prompt = """You are a movie marketing expert who identifies the most engaging and thrilling segments of movies.
You will receive a full movie transcript with timestamps. Your task is to identify exactly 10 of the most compelling moments that would make audiences want to watch the full movie.

IMPORTANT: You MUST respond with a valid JSON array. Do not include any text before or after the JSON array.

Each segment must have:
- segment_number: (1-10)
- title: (engaging, compelling title for this moment)
- start_time: (HH:MM:SS format - when this segment starts)
- end_time: (HH:MM:SS format - when this segment ends)
- description: (brief description of why this is engaging)
- engagement_level: (high/medium)
- reason: (one-line reason this will hook viewers)

Return ONLY the JSON array. Example format:
[
{"segment_number": 1, "title": "Epic Action Scene", "start_time": "00:15:32", "end_time": "00:18:45", "description": "...", "engagement_level": "high", "reason": "..."},
{"segment_number": 2, "title": "Emotional Climax", "start_time": "00:45:12", "end_time": "00:48:30", "description": "...", "engagement_level": "high", "reason": "..."}
]
"""

        # NOTE: only the first 15000 characters of the transcript are sent, so
        # anything beyond that point is never seen by the model.
        user_message = f"""Please extract exactly 10 of the most engaging segments from this movie transcript.

TRANSCRIPT:
{transcript_content[:15000]}

Return a JSON array with exactly 10 segments following the format specified. Each segment must have accurate start and end times from the transcript."""

        # The OpenAI and Hub clients used here are synchronous. Run them on the
        # default executor so the event loop (and the HTTP endpoints it
        # serves) is not blocked for the duration of each network call.
        loop = asyncio.get_running_loop()

        def request_completion():
            return client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.7,
                max_tokens=4000
            )

        def upload_segment(path_in_repo, payload, message):
            upload_file(
                path_or_fileobj=payload,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=message
            )

        print("Sending transcript to LLM for highlight extraction...")
        response = await loop.run_in_executor(None, request_completion)

        # message.content is optional in the OpenAI response model; treat a
        # missing value as an empty reply instead of failing on .strip().
        response_text = (response.choices[0].message.content or "").strip()
        print(f"LLM Response length: {len(response_text)} characters")

        # Extract segments from response
        segments = extract_segments_from_response(response_text)

        if not segments:
            print("Warning: No segments extracted from LLM response")
            return False

        print(f"Extracted {len(segments)} segments")

        # Prepare upload directory structure: hooks/movie-name/
        movie_highlights_folder = f"{HIGHLIGHTS_FOLDER}/{movie_name}"

        # Upload each segment as a JSON file
        for position, segment in enumerate(segments, 1):
            # The number comes from model output; fall back to the position
            # when it is missing or not an integer so ":02d" cannot raise.
            try:
                number = int(segment["segment_number"])
            except (KeyError, TypeError, ValueError):
                number = position

            segment_path = f"{movie_highlights_folder}/segment-{number:02d}.json"

            # upload_file accepts raw bytes, so no temporary file is needed.
            payload = json.dumps(segment, indent=2).encode("utf-8")

            print(f"Uploading {segment_path}...")
            await loop.run_in_executor(
                None,
                upload_segment,
                segment_path,
                payload,
                f"Add highlight segment {number} for {movie_name}",
            )
            print(f"✓ Uploaded {segment_path}")

        processing_state["processed_files"].append(movie_name)
        processing_state["total_processed"] += 1
        print(f"✓ Successfully processed {movie_name} ({len(segments)} segments)")
        return True

    except Exception as e:
        # Deliberate catch-all at the per-file boundary: one bad transcript
        # must not stop the scan. The failure is recorded for the endpoints.
        processing_state["error_count"] += 1
        processing_state["last_error"] = str(e)
        print(f"✗ Error processing {movie_name}: {e}")
        return False
241
+
242
async def scan_and_process_highlights():
    """Scan the transcriptions folder and process each file for highlights.

    Lists the dataset repo, keeps the ``.txt`` files under
    ``TRANSCRIPTION_FOLDER``, downloads each one and hands its text to
    ``process_transcription_for_highlights``. Does nothing if a scan is
    already marked as running. ``processing_state["is_running"]`` is set for
    the duration of the scan and always cleared afterwards.

    Returns:
        None. Results are reported through ``processing_state`` and stdout.
    """
    if processing_state["is_running"]:
        print("Highlight processing already running, skipping...")
        return

    processing_state["is_running"] = True
    print("\n" + "="*80)
    print("STARTING HIGHLIGHT EXTRACTION SERVICE")
    print("="*80)

    # Blocking Hub/file operations, wrapped so they can run off the event loop.
    def list_files():
        return list_repo_files(
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )

    def fetch_transcript(path_in_repo):
        local_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=path_in_repo,
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir="/tmp/highlight_transcripts"
        )
        with open(local_path, 'r', encoding='utf-8') as f:
            return f.read()

    try:
        # huggingface_hub calls are synchronous; running them on the default
        # executor keeps the HTTP endpoints responsive during a scan.
        loop = asyncio.get_running_loop()

        # List all transcription files
        print(f"Scanning {HF_DATASET_REPO}/{TRANSCRIPTION_FOLDER}/ for transcription files...")

        files = await loop.run_in_executor(None, list_files)

        transcript_files = [
            f for f in files
            if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
        ]

        print(f"Found {len(transcript_files)} transcription files")

        if not transcript_files:
            print("No transcription files found to process")
            return

        # Process each transcription
        for transcript_file in transcript_files:
            try:
                # Download and read the transcript
                transcript_content = await loop.run_in_executor(
                    None, fetch_transcript, transcript_file
                )

                # Process for highlights, passing just the file name
                await process_transcription_for_highlights(
                    HF_DATASET_REPO,
                    os.path.basename(transcript_file),
                    transcript_content
                )

                # Small delay between requests to avoid rate limiting
                await asyncio.sleep(2)

            except Exception as e:
                # Per-file boundary: record the failure and move on.
                print(f"Error downloading {transcript_file}: {e}")
                processing_state["error_count"] += 1
                continue

        print("\n" + "="*80)
        print("HIGHLIGHT EXTRACTION COMPLETE")
        print(f"Processed: {processing_state['total_processed']}")
        print(f"Errors: {processing_state['error_count']}")
        print("="*80 + "\n")

    except Exception as e:
        print(f"Critical error in scan_and_process: {e}")
        processing_state["last_error"] = str(e)
    finally:
        processing_state["is_running"] = False
        # Nothing is being processed once the scan ends; do not leave the
        # last movie name showing as "current" in the status endpoints.
        processing_state["current_file"] = None
319
+
320
@app.on_event("startup")
async def startup_event():
    """Start highlight extraction on server startup."""
    # Keep a strong reference to the task: the event loop only holds weak
    # references, so a task nobody references can be garbage-collected
    # before it finishes.
    app.state.extraction_task = asyncio.create_task(scan_and_process_highlights())
324
+
325
@app.get("/")
async def health():
    """Report that the service is up, together with the processing counters."""
    payload = {
        "status": "running",
        "service": "Movie Highlight Extraction Service",
        "is_processing": processing_state["is_running"],
    }
    # The remaining fields are copied from the shared state under the same names.
    for field in (
        "total_processed",
        "error_count",
        "current_file",
        "last_error",
        "processed_files",
    ):
        payload[field] = processing_state[field]
    return JSONResponse(payload)
338
+
339
@app.post("/trigger-extraction")
async def trigger_extraction():
    """Manually trigger a new highlight extraction scan.

    Returns:
        A JSON body with ``status`` set to ``"already_running"`` when a scan
        is in progress (or already scheduled by this endpoint), otherwise
        ``"started"`` after scheduling a new background scan.
    """
    # A task created by a previous call may not have set is_running yet, so
    # also look at the task itself to avoid scheduling a second scan.
    pending = getattr(app.state, "extraction_task", None)
    if processing_state["is_running"] or (pending is not None and not pending.done()):
        return JSONResponse({
            "status": "already_running",
            "message": "Highlight extraction is already in progress"
        })

    # Keep a strong reference: the event loop only holds weak references to
    # tasks, so an unreferenced task can be garbage-collected mid-run.
    app.state.extraction_task = asyncio.create_task(scan_and_process_highlights())
    return JSONResponse({
        "status": "started",
        "message": "Highlight extraction scan started"
    })
353
+
354
@app.get("/status")
async def get_status():
    """Return a snapshot of the shared processing state as JSON."""
    reported_fields = (
        "is_running",
        "total_processed",
        "error_count",
        "current_file",
        "last_error",
        "processed_files",
    )
    snapshot = {field: processing_state[field] for field in reported_fields}
    return JSONResponse(snapshot)
365
+
366
if __name__ == "__main__":
    # Script entry point: announce what is about to happen, then serve the
    # app on all interfaces.
    for banner_line in (
        "Starting Movie Highlight Extraction Service on port 7861...",
        "Will automatically scan and process transcriptions on startup",
    ):
        print(banner_line)
    uvicorn.run(app, host="0.0.0.0", port=7861)