Spaces:

factorstudios
/

lighter

Sleeping

App Files Files Community

lighter / server.py

factorstudios

Update server.py

bc7d51f verified 2 days ago

raw

history blame contribute delete

17.8 kB

	#!/usr/bin/env python3
	import os
	import sys
	import json
	import re
	import asyncio
	import threading
	import time
	from pathlib import Path
	from datetime import datetime
	from dotenv import load_dotenv
	from typing import List, Dict

	from fastapi import FastAPI, HTTPException
	from fastapi.responses import JSONResponse
	import uvicorn

	try:
	from huggingface_hub import list_repo_files, hf_hub_download, upload_file
	from openai import OpenAI
	except ImportError as e:
	print(f"Missing dependency: {e}")
	exit(1)

	# Load configuration from the environment. load_dotenv() runs first so that
	# python-dotenv can populate the environment before the os.getenv lookups.
	load_dotenv()
	HF_TOKEN = os.getenv("HF_TOKEN")
	DASHSCOPE_ENDPOINT = os.getenv("DASHSCOPE_ENDPOINT", "https://ws-wd3lvtd5jarz79e5.ap-southeast-1.maas.aliyuncs.com/compatible-mode/v1")
	DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
	MODEL_NAME = os.getenv("MODEL_NAME", "qwen3.6-plus")

	# Refuse to start without credentials. DASHSCOPE_ENDPOINT has a default, so it
	# only fails this check when it is explicitly set to an empty string.
	if not HF_TOKEN or not DASHSCOPE_ENDPOINT or not DASHSCOPE_API_KEY:
	    print("Error: Missing HF_TOKEN, DASHSCOPE_ENDPOINT, or DASHSCOPE_API_KEY in .env")
	    # sys.exit rather than the bare exit() helper: exit() is injected by the
	    # `site` module for interactive use and is not guaranteed to exist.
	    sys.exit(1)

	# FastAPI application object; the startup hook and routes below attach to it.
	app = FastAPI(title="Movie Highlight Extraction Service")

	# Mutable progress record, written by the scan/processing coroutines and read
	# by the HTTP endpoints.
	processing_state = dict(
	    is_running=False,
	    total_processed=0,
	    current_file=None,
	    error_count=0,
	    last_error=None,
	    processed_files=[],
	)

	# Dataset repository and the folders used inside it: transcripts are read
	# from TRANSCRIPTION_FOLDER, extracted segments are written to HIGHLIGHTS_FOLDER.
	HF_DATASET_REPO = "factorstudios/movs"
	TRANSCRIPTION_FOLDER = "transcriptions"
	HIGHLIGHTS_FOLDER = "hooks"

	def parse_segment_timestamp(time_str: str) -> str:
	    """Normalize an ``HH:MM:SS`` timestamp, falling back to ``"00:00:00"``.

	    Args:
	        time_str: Timestamp text such as ``"1:02:03"``. Surrounding whitespace
	            is ignored. Non-string values (e.g. a JSON ``null`` or number)
	            are treated as invalid.

	    Returns:
	        The timestamp zero-padded to ``HH:MM:SS``. When the input is not three
	        colon-separated integers with hours >= 0 and minutes/seconds in 0-59,
	        the problem is printed and ``"00:00:00"`` is returned; nothing is raised.
	    """
	    fallback = "00:00:00"

	    # Values decoded from LLM-produced JSON are not guaranteed to be strings.
	    if not isinstance(time_str, str):
	        print(f"Error parsing timestamp '{time_str}': expected a string")
	        return fallback

	    cleaned = time_str.strip()
	    try:
	        fields = cleaned.split(":")
	        if len(fields) != 3:
	            raise ValueError(f"Invalid format: {cleaned}")
	        h, m, s = (int(field) for field in fields)
	        if h < 0 or not 0 <= m <= 59 or not 0 <= s <= 59:
	            raise ValueError(f"Invalid time values: {cleaned}")
	    except ValueError as e:
	        # int() and the explicit checks above are the only failure sources.
	        print(f"Error parsing timestamp '{cleaned}': {e}")
	        return fallback

	    return f"{h:02d}:{m:02d}:{s:02d}"

	def _decode_segment_array(response_text: str) -> list:
	    """Return the last JSON array of objects embedded in *response_text*, or []."""
	    decoder = json.JSONDecoder()
	    found: list = []
	    pos = response_text.find("[")
	    while pos != -1:
	        next_pos = pos + 1
	        try:
	            candidate, end = decoder.raw_decode(response_text, pos)
	        except json.JSONDecodeError:
	            pass  # This '[' does not open valid JSON (e.g. "[Note]"); try the next one.
	        else:
	            if isinstance(candidate, list) and any(isinstance(entry, dict) for entry in candidate):
	                found = candidate
	                next_pos = end  # Skip past the array that was just consumed.
	        pos = response_text.find("[", next_pos)
	    return found


	def _coerce_segment_number(value, default: int) -> int:
	    """Return *value* as an int, or *default* when it cannot be converted."""
	    try:
	        return int(value)
	    except (TypeError, ValueError, OverflowError):
	        return default


	def extract_segments_from_response(response_text: str) -> List[Dict]:
	    """Parse an LLM response into at most 10 movie segment dicts.

	    A JSON array of objects is tried first; surrounding prose, code fences or
	    stray brackets around the array are tolerated. If that yields nothing, the
	    text is split on headings such as ``Segment 1:`` / ``Video 2`` / ``Scene 3:``
	    and each part is searched for an ``H:MM:SS - H:MM:SS`` range.

	    Args:
	        response_text: Raw text returned by the model.

	    Returns:
	        Up to 10 dicts with the keys ``segment_number`` (always an int),
	        ``title``, ``start_time``, ``end_time``, ``description``,
	        ``engagement_level`` and ``reason``. Empty when nothing is recognised.
	    """
	    max_segments = 10
	    segments: List[Dict] = []

	    if not response_text:
	        return segments

	    # Preferred path: structured JSON output.
	    for item in _decode_segment_array(response_text):
	        if not isinstance(item, dict):
	            continue
	        position = len(segments) + 1
	        segments.append({
	            # Coerced to int because callers format it with ":02d".
	            "segment_number": _coerce_segment_number(item.get("segment_number"), position),
	            "title": item.get("title", f"Segment {position}"),
	            "start_time": parse_segment_timestamp(item.get("start_time", "00:00:00")),
	            "end_time": parse_segment_timestamp(item.get("end_time", "00:10:00")),  # 10 minutes
	            "description": item.get("description", ""),
	            "engagement_level": item.get("engagement_level", "high"),
	            "reason": item.get("reason", ""),
	        })
	        if len(segments) >= max_segments:
	            break

	    if segments:
	        return segments

	    # Fallback path: free-text headings. The alternation must use bare "|";
	    # an escaped "\|" would only match a literal pipe character.
	    segment_pattern = r'(?:Segment|Video|Scene)\s+\d+[:\s]+'
	    time_pattern = r'(\d{1,2}):(\d{2}):(\d{2})\s*[-–—]\s*(\d{1,2}):(\d{2}):(\d{2})'
	    parts = re.split(segment_pattern, response_text)[1:]  # Skip text before first heading

	    for idx, part in enumerate(parts[:max_segments], 1):
	        body = part.strip()
	        time_match = re.search(time_pattern, part)
	        if time_match:
	            start_time = f"{int(time_match.group(1)):02d}:{time_match.group(2)}:{time_match.group(3)}"
	            end_time = f"{int(time_match.group(4)):02d}:{time_match.group(5)}:{time_match.group(6)}"
	        else:
	            start_time = "00:00:00"
	            end_time = "00:10:00"  # 10 minutes default

	        # Text up to the first period or newline serves as the title.
	        title_match = re.match(r'([^.\n]+)', body)
	        title = title_match.group(1)[:100] if title_match else f"Segment {idx}"

	        segments.append({
	            "segment_number": idx,
	            "title": title,
	            "start_time": start_time,
	            "end_time": end_time,
	            "description": body[:500],
	            "engagement_level": "high",
	            "reason": "Engaging scene",
	        })

	    return segments[:max_segments]

	async def process_transcription_for_highlights(
	    repo_id: str,
	    transcript_filename: str,
	    transcript_content: str
	) -> bool:
	    """Ask the LLM for highlight segments of one transcript and upload them.

	    Each extracted segment is uploaded to the dataset repo as
	    ``hooks/<movie_name>/segment-NN.json`` in its own commit. Progress and
	    failures are recorded in ``processing_state``.

	    Args:
	        repo_id: Dataset repository that receives the segment files.
	        transcript_filename: Base name of the transcript; the movie name is
	            this name with ``.transcript.txt`` / ``.txt`` removed.
	        transcript_content: Full transcript text embedded in the prompt.

	    Returns:
	        True when at least one segment was extracted and every upload
	        succeeded. False when no segments could be extracted or any step
	        raised; in the latter case ``error_count`` and ``last_error`` are updated.
	    """
	    # Derived before the try block so the error handler can always name the movie.
	    # The replace() chain is the same derivation the scan uses to detect
	    # already-processed movies.
	    movie_name = transcript_filename.replace(".transcript.txt", "").replace(".txt", "")
	    processing_state["current_file"] = movie_name

	    try:
	        print(f"\n{'='*80}")
	        print(f"Processing: {movie_name}")
	        print(f"{'='*80}")

	        # Create LLM client
	        client = OpenAI(
	            api_key=DASHSCOPE_API_KEY,
	            base_url=DASHSCOPE_ENDPOINT
	        )

	        # Create structured prompt for segment extraction
	        system_prompt = """You are an expert film editor and marketing strategist who identifies KEY MOMENTS from movies that make viewers WANT to watch the entire film.

	Your task: Analyze the full movie transcript and identify exactly 10 HIGH-IMPACT moments that would serve as compelling hooks to attract audiences. These should be:
	- Plot turning points / major story beats
	- Action sequences / high tension moments
	- Emotional climaxes / character breakthroughs
	- Shocking reveals / plot twists
	- Character introductions / pivotal interactions
	- Climactic confrontations or resolutions

	CRITICAL REQUIREMENTS:
	- You MUST respond with ONLY a valid JSON array. NO text before or after.
	- DO NOT arrange segments by time order - arrange them by ENGAGEMENT and IMPACT level
	- Segments can start ANYWHERE in the movie (5 min, 40 min, 90 min - doesn't matter)
	- Each segment should be 8-15 MINUTES LONG to capture the complete moment with context
	- Include exact timestamps from the transcript
	- Prioritize QUALITY over chronology - pick genuinely attention-grabbing moments
	- Segment 1 = MOST IMPACTFUL, Segment 10 = still engaging but slightly less

	Each segment must include:
	- segment_number: (1-10, ranked by engagement level, NOT by time)
	- title: (compelling, hooks viewers immediately)
	- start_time: (HH:MM:SS - exact start from transcript, can be anywhere)
	- end_time: (HH:MM:SS - exact end from transcript, captures full moment)
	- description: (2-3 sentences explaining why this moment is essential)
	- engagement_level: (high/medium - high for critical moments)
	- reason: (one line hook - what makes viewers want MORE)

	JSON format:
	[
	{"segment_number": 1, "title": "...", "start_time": "HH:MM:SS", "end_time": "HH:MM:SS", "description": "...", "engagement_level": "high", "reason": "..."}
	]
	"""

	        user_message = f"""Analyze this COMPLETE movie transcript and extract exactly 10 KEY MOMENTS that would make someone want to watch the full film.

	IMPORTANT: Do NOT arrange segments by when they appear in the movie. Arrange them by IMPACT and ENGAGEMENT.
	- A key scene at 40 minutes into the movie can be segment 1 if it's the most engaging
	- A scene at 10 minutes can be segment 8 if it's less impactful
	- Order by viewer attention level, NOT by timeline

	Find the 10 best moments regardless of where they fall in the runtime.

	FULL TRANSCRIPT:
	{transcript_content}

	Return ONLY the JSON array with exactly 10 segments ranked by engagement level."""

	        print("Sending transcript to LLM for highlight extraction...")
	        response = client.chat.completions.create(
	            model=MODEL_NAME,
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": user_message}
	            ],
	            temperature=0.7,
	            max_tokens=4000
	        )

	        # The SDK types message.content as optional; treat a missing body as
	        # an empty response instead of failing on None.strip().
	        response_text = (response.choices[0].message.content or "").strip()
	        print(f"LLM Response length: {len(response_text)} characters")

	        # Extract segments from response
	        segments = extract_segments_from_response(response_text)

	        if not segments:
	            print(f"Warning: No segments extracted from LLM response")
	            return False

	        print(f"Extracted {len(segments)} segments")

	        # Prepare upload directory structure: hooks/movie-name/
	        movie_highlights_folder = f"{HIGHLIGHTS_FOLDER}/{movie_name}"

	        # Upload each segment as a JSON file
	        for position, segment in enumerate(segments, 1):
	            # The ":02d" format needs a real int; model output may carry the
	            # number as a string, so fall back to the list position.
	            try:
	                number = int(segment["segment_number"])
	            except (KeyError, TypeError, ValueError):
	                number = position

	            segment_filename = f"segment-{number:02d}.json"
	            segment_path = f"{movie_highlights_folder}/{segment_filename}"

	            print(f"Uploading {segment_path}...")
	            upload_file(
	                # upload_file accepts raw bytes, so no temporary file is needed.
	                path_or_fileobj=json.dumps(segment, indent=2).encode("utf-8"),
	                path_in_repo=segment_path,
	                repo_id=repo_id,
	                repo_type="dataset",
	                token=HF_TOKEN,
	                commit_message=f"Add highlight segment {number} for {movie_name}"
	            )
	            print(f"✓ Uploaded {segment_path}")

	        processing_state["processed_files"].append(movie_name)
	        processing_state["total_processed"] += 1
	        print(f"✓ Successfully processed {movie_name} ({len(segments)} segments)")
	        return True

	    except Exception as e:
	        # Per-file boundary: record the failure so the caller can move on to
	        # the next transcript.
	        processing_state["error_count"] += 1
	        processing_state["last_error"] = str(e)
	        print(f"✗ Error processing {movie_name}: {e}")
	        return False

	async def scan_and_process_highlights():
	    """Scan the dataset repo and extract highlights for unprocessed transcripts.

	    Lists the files in ``HF_DATASET_REPO``, treats every sub-folder of
	    ``HIGHLIGHTS_FOLDER`` as an already-processed movie, then downloads each
	    remaining ``.txt`` file under ``TRANSCRIPTION_FOLDER`` and hands it to
	    ``process_transcription_for_highlights``. Returns immediately when a scan
	    is already flagged as running. Exceptions are caught and recorded in
	    ``processing_state`` rather than propagated.
	    """
	    # NOTE(review): this check-then-set is not atomic. The startup hook and
	    # /trigger-extraction each start their own thread, so two scans launched
	    # close together could both pass this guard.
	    if processing_state["is_running"]:
	        print("Highlight processing already running, skipping...")
	        return

	    processing_state["is_running"] = True
	    print("\n" + "="*80)
	    print("STARTING HIGHLIGHT EXTRACTION SERVICE")
	    print("="*80)

	    try:
	        # List all files in repository
	        print(f"Connecting to {HF_DATASET_REPO}...")
	        files = list_repo_files(
	            repo_id=HF_DATASET_REPO,
	            repo_type="dataset",
	            token=HF_TOKEN
	        )

	        # Collect existing hook folders. Only paths of the form
	        # hooks/<movie>/<file> count; a file sitting directly in hooks/
	        # is not a movie folder.
	        print("Scanning for existing hook folders...")
	        existing_hooks = set()
	        for repo_path in files:
	            parts = repo_path.split("/")
	            if len(parts) >= 3 and parts[0] == HIGHLIGHTS_FOLDER:
	                existing_hooks.add(parts[1])

	        print(f"✓ Found {len(existing_hooks)} movie folders in hooks/: {existing_hooks}")

	        # Get all transcription files
	        transcript_files = [
	            f for f in files
	            if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
	        ]

	        print(f"Found {len(transcript_files)} transcription files")

	        if not transcript_files:
	            print("No transcription files found to process")
	            return  # the finally clause clears is_running

	        # Filter transcriptions to only those not yet processed
	        unprocessed_transcripts = []
	        for transcript_file in transcript_files:
	            just_filename = os.path.basename(transcript_file)
	            movie_name = just_filename.replace(".transcript.txt", "").replace(".txt", "")

	            # Skip if hooks already exist for this movie
	            if movie_name in existing_hooks:
	                print(f" ⊘ {movie_name} (already has hooks)")
	                continue

	            unprocessed_transcripts.append({
	                "path": transcript_file,
	                "filename": just_filename,
	                "movie_name": movie_name
	            })

	        print(f"\n✓ Found {len(unprocessed_transcripts)} unprocessed movies")

	        if not unprocessed_transcripts:
	            print("✓ All transcriptions already have hooks!")
	            return  # the finally clause clears is_running

	        # Process each unprocessed transcription
	        for transcript_info in unprocessed_transcripts:
	            try:
	                print(f"\nDownloading: {transcript_info['path']}")
	                # Download transcript
	                local_path = hf_hub_download(
	                    repo_id=HF_DATASET_REPO,
	                    filename=transcript_info["path"],
	                    repo_type="dataset",
	                    token=HF_TOKEN,
	                    cache_dir="/tmp/highlight_transcripts"
	                )

	                # Read transcript content
	                with open(local_path, 'r', encoding='utf-8') as f:
	                    transcript_content = f.read()

	                print(f"✓ Downloaded: {transcript_info['filename']}")

	                # Process for highlights
	                await process_transcription_for_highlights(
	                    HF_DATASET_REPO,
	                    transcript_info["filename"],
	                    transcript_content
	                )

	                # Small delay between requests to avoid rate limiting
	                await asyncio.sleep(2)

	            except Exception as e:
	                # Download/read failures land here; keep going with the next
	                # transcript, but record the failure so /status shows it.
	                print(f"Error processing {transcript_info['path']}: {e}")
	                processing_state["error_count"] += 1
	                processing_state["last_error"] = str(e)
	                continue

	        print("\n" + "="*80)
	        print("HIGHLIGHT EXTRACTION COMPLETE")
	        print(f"Processed: {processing_state['total_processed']}")
	        print(f"Errors: {processing_state['error_count']}")
	        print("="*80 + "\n")

	    except Exception as e:
	        print(f"Critical error in scan_and_process: {e}")
	        processing_state["last_error"] = str(e)
	    finally:
	        processing_state["is_running"] = False
	        # Nothing is in flight once the scan ends, so stop reporting a stale file.
	        processing_state["current_file"] = None

	@app.on_event("startup")
	async def startup_event():
	    """Launch the highlight scan in a daemon thread when the app starts."""
	    divider = "=" * 80
	    print("\n" + divider)
	    print("STARTUP EVENT TRIGGERED - Highlight Extraction Service")
	    print(divider)

	    def _scan_worker():
	        # Thread body: announce, then drive the scan coroutine to completion
	        # with asyncio.run inside this thread.
	        print("Starting highlight extraction scan...")
	        asyncio.run(scan_and_process_highlights())

	    # Daemon thread, so a scan in progress does not keep the process alive.
	    worker = threading.Thread(target=_scan_worker, daemon=True)
	    worker.start()
	    print("✓ Background scan thread scheduled")

	@app.get("/")
	async def health():
	    """Report service liveness together with the current processing counters."""
	    payload = {
	        "status": "running",
	        "service": "Movie Highlight Extraction Service",
	        # Exposed as "is_processing" here; the state key is "is_running".
	        "is_processing": processing_state["is_running"],
	    }
	    # The remaining fields are copied from the shared state under the same names.
	    for field in (
	        "total_processed",
	        "error_count",
	        "current_file",
	        "last_error",
	        "processed_files",
	    ):
	        payload[field] = processing_state[field]
	    return JSONResponse(payload)

	@app.post("/trigger-extraction")
	async def trigger_extraction():
	    """Start a highlight extraction scan on demand unless one is flagged as running."""
	    if not processing_state["is_running"]:
	        # Same approach as the startup hook: a daemon thread drives the scan
	        # coroutine with asyncio.run.
	        threading.Thread(
	            target=lambda: asyncio.run(scan_and_process_highlights()),
	            daemon=True,
	        ).start()
	        return JSONResponse({
	            "status": "started",
	            "message": "Highlight extraction scan started"
	        })

	    return JSONResponse({
	        "status": "already_running",
	        "message": "Highlight extraction is already in progress"
	    })

	@app.get("/status")
	async def get_status():
	    """Return a snapshot of the shared processing state as JSON."""
	    reported_fields = (
	        "is_running",
	        "total_processed",
	        "error_count",
	        "current_file",
	        "last_error",
	        "processed_files",
	    )
	    snapshot = {field: processing_state[field] for field in reported_fields}
	    return JSONResponse(snapshot)

	if __name__ == "__main__":
	    # Script entry point: announce, then serve the app on all interfaces.
	    # The scan itself is started by the app's startup hook.
	    print("Starting Movie Highlight Extraction Service on port 7860...")
	    print("Will automatically scan and process transcriptions on startup")
	    uvicorn.run(
	        app,
	        port=7860,
	        host="0.0.0.0",
	    )