Spaces:

factorstudios
/

lighter

Sleeping

App Files Files Community

factorstudios committed 4 days ago

Commit

b79a002

verified ·

1 Parent(s): 0557312

Create server.py

Browse files

Files changed (1) hide show

server.py +369 -0

server.py ADDED Viewed

	@@ -0,0 +1,369 @@

+#!/usr/bin/env python3
+import os
+import json
+import re
+import asyncio
+from pathlib import Path
+from datetime import datetime
+from dotenv import load_dotenv
+from typing import List, Dict
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+import uvicorn
# Third-party clients are imported defensively so that a missing package
# produces a short, readable message instead of a traceback.
try:
    from huggingface_hub import list_repo_files, hf_hub_download, upload_file
    from openai import OpenAI
except ImportError as e:
    print(f"Missing dependency: {e}")
    # `exit` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist; SystemExit is always available.
    raise SystemExit(1) from None

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
DASHSCOPE_ENDPOINT = os.getenv("DASHSCOPE_ENDPOINT")
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "qwen3.6-plus")

# All three credentials are required; refuse to start without them.
if not HF_TOKEN or not DASHSCOPE_ENDPOINT or not DASHSCOPE_API_KEY:
    print("Error: Missing HF_TOKEN, DASHSCOPE_ENDPOINT, or DASHSCOPE_API_KEY in .env")
    raise SystemExit(1)

app = FastAPI(title="Movie Highlight Extraction Service")

# Global state for processing. Mutated by the extraction coroutines and
# read by the HTTP endpoints.
processing_state = {
    "is_running": False,      # True while a scan is in progress
    "total_processed": 0,     # transcripts whose segments were uploaded
    "current_file": None,     # movie name most recently picked up
    "error_count": 0,         # failures counted by the extraction coroutines
    "last_error": None,       # str() of the most recent recorded exception
    "processed_files": []     # movie names processed successfully
}

# Dataset repository and the folders read from / written to inside it.
HF_DATASET_REPO = "factorstudios/movs"
TRANSCRIPTION_FOLDER = "transcriptions"
HIGHLIGHTS_FOLDER = "hooks"
def parse_segment_timestamp(time_str: str) -> str:
    """Normalize a timestamp to zero-padded ``HH:MM:SS``.

    Surrounding whitespace is ignored. Fractional seconds written with
    either ``.`` or ``,`` (e.g. ``00:15:32.500``) are truncated instead of
    causing the whole timestamp to be rejected.

    Args:
        time_str: Timestamp text made of three colon-separated fields.

    Returns:
        The normalized timestamp, or ``"00:00:00"`` when the input is not a
        string, does not have exactly three fields, contains a non-integer
        field, or has a negative hour or minutes/seconds outside 0-59.
        Failures are reported with ``print``; nothing is raised.
    """
    fallback = "00:00:00"

    # LLM-produced JSON may carry null or numeric values here.
    if not isinstance(time_str, str):
        print(f"Error parsing timestamp '{time_str}': expected a string")
        return fallback

    time_str = time_str.strip()
    try:
        parts = time_str.split(":")
        if len(parts) != 3:
            raise ValueError(f"Invalid format: {time_str}")
        # "32.500" or "32,500" -> "32": keep only the whole seconds.
        whole_seconds = parts[2].replace(",", ".").partition(".")[0]
        h, m, s = int(parts[0]), int(parts[1]), int(whole_seconds)
        if h < 0 or not 0 <= m <= 59 or not 0 <= s <= 59:
            raise ValueError(f"Invalid time values: {time_str}")
    except ValueError as e:
        print(f"Error parsing timestamp '{time_str}': {e}")
        return fallback
    return f"{h:02d}:{m:02d}:{s:02d}"
def extract_segments_from_response(response_text: str) -> List[Dict]:
    """Parse an LLM response into at most 10 movie segment dictionaries.

    Two strategies are tried in order:

    1. Structured: every ``[`` in the text is tried as the start of a JSON
       value. The first one that decodes to a list containing at least one
       object is used, so brackets in surrounding prose or code fences do
       not prevent the array from being found.
    2. Text fallback (only when the first strategy yields nothing): the
       text is split on headers such as ``Segment 1:``, ``Video 2`` or
       ``Scene 3:`` and a ``HH:MM:SS - HH:MM:SS`` range is searched for in
       each part.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A list of up to 10 dicts with the keys ``segment_number`` (int),
        ``title``, ``start_time``, ``end_time``, ``description``,
        ``engagement_level`` and ``reason``. Empty when nothing could be
        extracted.
    """
    segments = []

    # --- Strategy 1: locate and decode a JSON array --------------------
    decoder = json.JSONDecoder()
    search_from = 0
    while not segments:
        bracket = response_text.find("[", search_from)
        if bracket == -1:
            break
        search_from = bracket + 1
        try:
            parsed, _ = decoder.raw_decode(response_text, bracket)
        except json.JSONDecodeError:
            # Not the start of valid JSON; try the next bracket.
            continue
        if not isinstance(parsed, list):
            continue
        for item in parsed:
            if not isinstance(item, dict):
                continue
            default_number = len(segments) + 1
            # Callers format this value with ":02d", so it must be an int
            # even when the model emits "3" or null.
            try:
                segment_number = int(item.get("segment_number", default_number))
            except (TypeError, ValueError):
                segment_number = default_number
            segments.append({
                "segment_number": segment_number,
                "title": item.get("title", f"Segment {default_number}"),
                "start_time": parse_segment_timestamp(item.get("start_time", "00:00:00")),
                "end_time": parse_segment_timestamp(item.get("end_time", "00:01:00")),
                "description": item.get("description", ""),
                "engagement_level": item.get("engagement_level", "high"),
                "reason": item.get("reason", "")
            })
            if len(segments) >= 10:
                break

    # --- Strategy 2: split free text on "Segment N" style headers ------
    if not segments:
        header_pattern = r'(?:Segment|Video|Scene)\s+\d+[:\s]+'
        # The two times may be joined by a hyphen, en dash, em dash or "to".
        time_pattern = (
            r'(\d{1,2}):(\d{2}):(\d{2})\s*(?:[-–—]|to)\s*'
            r'(\d{1,2}):(\d{2}):(\d{2})'
        )
        parts = re.split(header_pattern, response_text)[1:]  # Skip text before first header
        for idx, part in enumerate(parts[:10], 1):
            time_match = re.search(time_pattern, part)
            if time_match:
                start_time = f"{int(time_match.group(1)):02d}:{time_match.group(2)}:{time_match.group(3)}"
                end_time = f"{int(time_match.group(4)):02d}:{time_match.group(5)}:{time_match.group(6)}"
            else:
                start_time = "00:00:00"
                end_time = "00:01:00"
            # Text up to the first period or newline serves as the title.
            title_match = re.match(r'([^.\n]+)', part.strip())
            title = title_match.group(1)[:100] if title_match else f"Segment {idx}"
            segments.append({
                "segment_number": idx,
                "title": title,
                "start_time": start_time,
                "end_time": end_time,
                "description": part.strip()[:500],
                "engagement_level": "high",
                "reason": "Engaging scene"
            })

    return segments[:10]  # Return max 10 segments
async def process_transcription_for_highlights(
    repo_id: str,
    transcript_filename: str,
    transcript_content: str
) -> bool:
    """Extract highlight segments for one transcript and upload them.

    The first 15000 characters of the transcript are sent to the chat model
    configured by ``MODEL_NAME``. The reply is parsed with
    ``extract_segments_from_response`` and every segment is uploaded as
    ``hooks/<movie-name>/segment-NN.json`` to the dataset repository, one
    commit per segment.

    The LLM request and the uploads use blocking clients, so they are run in
    the default executor to keep the event loop (and the HTTP endpoints)
    responsive while a movie is being processed.

    Args:
        repo_id: Dataset repository that receives the segment files.
        transcript_filename: Base name of the transcript; the
            ``.transcript.txt`` / ``.txt`` suffix is removed to obtain the
            movie name.
        transcript_content: Full transcript text.

    Returns:
        True when at least one segment was extracted and all uploads
        succeeded. False when no segments were extracted or when any
        exception occurred; exceptions are counted in
        ``processing_state`` and are not re-raised.
    """
    # Derived before the try block so the error handler can always name the file.
    movie_name = transcript_filename.replace(".transcript.txt", "").replace(".txt", "")
    processing_state["current_file"] = movie_name
    loop = asyncio.get_running_loop()
    try:
        print(f"\n{'='*80}")
        print(f"Processing: {movie_name}")
        print(f"{'='*80}")
        # Create LLM client
        client = OpenAI(
            api_key=DASHSCOPE_API_KEY,
            base_url=DASHSCOPE_ENDPOINT
        )
        # Create structured prompt for segment extraction
        system_prompt = """You are a movie marketing expert who identifies the most engaging and thrilling segments of movies.
You will receive a full movie transcript with timestamps. Your task is to identify exactly 10 of the most compelling moments that would make audiences want to watch the full movie.
IMPORTANT: You MUST respond with a valid JSON array. Do not include any text before or after the JSON array.
Each segment must have:
- segment_number: (1-10)
- title: (engaging, compelling title for this moment)
- start_time: (HH:MM:SS format - when this segment starts)
- end_time: (HH:MM:SS format - when this segment ends)
- description: (brief description of why this is engaging)
- engagement_level: (high/medium)
- reason: (one-line reason this will hook viewers)
Return ONLY the JSON array. Example format:
[
    {"segment_number": 1, "title": "Epic Action Scene", "start_time": "00:15:32", "end_time": "00:18:45", "description": "...", "engagement_level": "high", "reason": "..."},
    {"segment_number": 2, "title": "Emotional Climax", "start_time": "00:45:12", "end_time": "00:48:30", "description": "...", "engagement_level": "high", "reason": "..."}
]
"""
        # NOTE: only the first 15000 characters of the transcript are sent.
        user_message = f"""Please extract exactly 10 of the most engaging segments from this movie transcript.
TRANSCRIPT:
{transcript_content[:15000]}
Return a JSON array with exactly 10 segments following the format specified. Each segment must have accurate start and end times from the transcript."""
        print("Sending transcript to LLM for highlight extraction...")
        # The OpenAI client used here is synchronous; run it off the event loop.
        response = await loop.run_in_executor(
            None,
            lambda: client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.7,
                max_tokens=4000
            ),
        )
        content = response.choices[0].message.content
        if not content:
            # Surface a clear message instead of an AttributeError on None.
            raise ValueError("LLM returned an empty response")
        response_text = content.strip()
        print(f"LLM Response length: {len(response_text)} characters")
        # Extract segments from response
        segments = extract_segments_from_response(response_text)
        if not segments:
            print("Warning: No segments extracted from LLM response")
            return False
        print(f"Extracted {len(segments)} segments")
        # Prepare upload directory structure: hooks/movie-name/
        movie_highlights_folder = f"{HIGHLIGHTS_FOLDER}/{movie_name}"
        # Upload each segment as a JSON file
        for position, segment in enumerate(segments, 1):
            # The file name needs an integer; fall back to the list position
            # when the model supplied something else.
            try:
                segment_number = int(segment["segment_number"])
            except (KeyError, TypeError, ValueError):
                segment_number = position
            segment_filename = f"segment-{segment_number:02d}.json"
            segment_path = f"{movie_highlights_folder}/{segment_filename}"
            # upload_file accepts raw bytes, so no temporary file is needed.
            payload = json.dumps(segment, indent=2).encode("utf-8")
            print(f"Uploading {segment_path}...")
            await loop.run_in_executor(
                None,
                lambda data=payload, path=segment_path, number=segment_number: upload_file(
                    path_or_fileobj=data,
                    path_in_repo=path,
                    repo_id=repo_id,
                    repo_type="dataset",
                    token=HF_TOKEN,
                    commit_message=f"Add highlight segment {number} for {movie_name}"
                ),
            )
            print(f"✓ Uploaded {segment_path}")
        processing_state["processed_files"].append(movie_name)
        processing_state["total_processed"] += 1
        print(f"✓ Successfully processed {movie_name} ({len(segments)} segments)")
        return True
    except Exception as e:
        # Top-level boundary for one movie: record the failure and let the
        # caller continue with the next transcript.
        processing_state["error_count"] += 1
        processing_state["last_error"] = str(e)
        print(f"✗ Error processing {movie_name}: {e}")
        return False
async def scan_and_process_highlights():
    """Scan the transcriptions folder and extract highlights for each file.

    Lists the files of ``HF_DATASET_REPO``, keeps the ``.txt`` files under
    ``TRANSCRIPTION_FOLDER/``, downloads each one and passes its content to
    ``process_transcription_for_highlights``, pausing two seconds between
    files. Returns immediately when a scan is already in progress.

    The Hub listing and download calls are blocking, so they are run in the
    default executor to keep the event loop free for HTTP requests.

    Progress and failures are recorded in ``processing_state``;
    ``is_running`` is always reset when the scan ends. Exceptions are
    reported with ``print`` and not propagated.
    """
    if processing_state["is_running"]:
        print("Highlight processing already running, skipping...")
        return
    loop = asyncio.get_running_loop()
    processing_state["is_running"] = True
    print("\n" + "="*80)
    print("STARTING HIGHLIGHT EXTRACTION SERVICE")
    print("="*80)
    try:
        # List all transcription files
        print(f"Scanning {HF_DATASET_REPO}/{TRANSCRIPTION_FOLDER}/ for transcription files...")
        files = await loop.run_in_executor(
            None,
            lambda: list_repo_files(
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            ),
        )
        transcript_files = [
            f for f in files
            if f.startswith(f"{TRANSCRIPTION_FOLDER}/") and f.endswith(".txt")
        ]
        print(f"Found {len(transcript_files)} transcription files")
        if not transcript_files:
            print("No transcription files found to process")
            return
        # Process each transcription
        for transcript_file in transcript_files:
            try:
                # Download transcript (blocking call, run off the event loop)
                local_path = await loop.run_in_executor(
                    None,
                    lambda name=transcript_file: hf_hub_download(
                        repo_id=HF_DATASET_REPO,
                        filename=name,
                        repo_type="dataset",
                        token=HF_TOKEN,
                        cache_dir="/tmp/highlight_transcripts"
                    ),
                )
                # Read transcript content
                with open(local_path, 'r', encoding='utf-8') as f:
                    transcript_content = f.read()
                # Extract just the filename
                just_filename = os.path.basename(transcript_file)
                # Process for highlights
                await process_transcription_for_highlights(
                    HF_DATASET_REPO,
                    just_filename,
                    transcript_content
                )
                # Small delay between requests to avoid rate limiting
                await asyncio.sleep(2)
            except Exception as e:
                # One bad file must not stop the scan; record it so that the
                # status endpoints show what went wrong.
                print(f"Error downloading {transcript_file}: {e}")
                processing_state["error_count"] += 1
                processing_state["last_error"] = str(e)
                continue
        print("\n" + "="*80)
        print("HIGHLIGHT EXTRACTION COMPLETE")
        print(f"Processed: {processing_state['total_processed']}")
        print(f"Errors: {processing_state['error_count']}")
        print("="*80 + "\n")
    except Exception as e:
        print(f"Critical error in scan_and_process: {e}")
        processing_state["last_error"] = str(e)
    finally:
        processing_state["is_running"] = False
@app.on_event("startup")
async def startup_event():
    """Start highlight extraction in the background on server startup."""
    # The event loop keeps only a weak reference to tasks, so an unreferenced
    # task can be garbage-collected before it finishes. Keeping it on the
    # application state holds a strong reference for the life of the scan.
    app.state.highlight_task = asyncio.create_task(scan_and_process_highlights())
@app.get("/")
async def health():
    """Report that the service is up, together with the processing counters."""
    payload = {
        "status": "running",
        "service": "Movie Highlight Extraction Service",
        "is_processing": processing_state["is_running"],
    }
    # Remaining fields are copied from the shared state under the same key.
    for field in (
        "total_processed",
        "error_count",
        "current_file",
        "last_error",
        "processed_files",
    ):
        payload[field] = processing_state[field]
    return JSONResponse(payload)
@app.post("/trigger-extraction")
async def trigger_extraction():
    """Manually trigger a new highlight extraction scan.

    Returns a JSON body whose ``status`` is ``"already_running"`` when a
    scan is in progress, otherwise ``"started"`` after the scan has been
    scheduled as a background task.
    """
    if processing_state["is_running"]:
        return JSONResponse({
            "status": "already_running",
            "message": "Highlight extraction is already in progress"
        })
    # The event loop keeps only a weak reference to tasks; store the task on
    # the application state so it cannot be garbage-collected mid-run.
    app.state.highlight_task = asyncio.create_task(scan_and_process_highlights())
    return JSONResponse({
        "status": "started",
        "message": "Highlight extraction scan started"
    })
@app.get("/status")
async def get_status():
    """Return the current values of the shared processing state as JSON."""
    reported_fields = (
        "is_running",
        "total_processed",
        "error_count",
        "current_file",
        "last_error",
        "processed_files",
    )
    snapshot = {field: processing_state[field] for field in reported_fields}
    return JSONResponse(snapshot)
if __name__ == "__main__":
    # Announce the port and the automatic startup scan, then serve on all
    # interfaces.
    for banner_line in (
        "Starting Movie Highlight Extraction Service on port 7861...",
        "Will automatically scan and process transcriptions on startup",
    ):
        print(banner_line)
    uvicorn.run(app, port=7861, host="0.0.0.0")