# Page header captured with this file: Spaces: Elvoro / Tools (Running). File size: 9,608 Bytes

"""
ScriptProcessor - Orchestrates VO script processing using Gemini prompts.

Flow:
1. split() - Uses vo_segment_splitter.md to break VO into segments
2. get_metadata() - Uses vo_segment_enricher.md to enrich each segment
3. match_video() - Uses vo_video_matcher.md to find matching library video
4. generate_prompt() - Uses vo_video_generator.md to create video gen spec
"""

import json
import os
from typing import List, Dict, Optional
from pathlib import Path

from src.logger_config import logger
from google_src import ai_studio_sdk


class ScriptProcessor:
    """Processes voice-over scripts through prompt-based pipeline.

    Prompt templates are read once, at construction time, from the
    ``prompt`` directory one level above this file's directory. Each
    pipeline step fills the placeholders of its template, sends the result
    to Gemini through ``ai_studio_sdk.generate`` and parses the reply as JSON.
    """

    # Maps the internal prompt key to the markdown template file name.
    _PROMPT_FILES = {
        "split": "vo_segment_splitter.md",
        "metadata": "vo_segment_enricher.md",
        "match": "vo_video_matcher.md",
        "prompt_gen": "vo_video_generator.md",
    }

    def __init__(self):
        self._prompt_dir = Path(__file__).parent.parent / "prompt"
        self._prompts: Dict[str, str] = {}
        self._load_prompts()

    def _load_prompts(self):
        """Load all prompt templates from markdown files.

        A missing file is logged as a warning and stored as an empty
        template, so a missing file alone does not abort construction.
        """
        for key, filename in self._PROMPT_FILES.items():
            filepath = self._prompt_dir / filename
            if filepath.exists():
                # Explicit UTF-8: the platform default encoding can fail on
                # non-ASCII characters in the templates.
                self._prompts[key] = filepath.read_text(encoding="utf-8")
                logger.debug(f"Loaded prompt: {filename}")
            else:
                logger.warning(f"Prompt file not found: {filepath}")
                self._prompts[key] = ""

    def _call_gemini(self, prompt: str) -> str:
        """Call Gemini and return the stripped response text.

        Args:
            prompt: Fully rendered prompt text.

        Returns:
            The response with surrounding whitespace removed.

        Raises:
            ValueError: If the response is empty or whitespace-only.
        """
        # Assumes generate() returns a str (or a falsy value on no output).
        response = ai_studio_sdk.generate(prompt)
        text = response.strip() if response else ""
        if not text:
            # Whitespace-only replies are rejected here rather than surfacing
            # later as an unrelated JSON decoding error.
            raise ValueError("Gemini returned empty response")
        return text

    @staticmethod
    def _split_csv(value) -> List[str]:
        """Split a comma-separated cell into stripped, non-empty tokens."""
        if value is None:
            # str(None) would otherwise produce the bogus token "None".
            return []
        return [token.strip() for token in str(value).split(",") if token.strip()]

    # The return annotation is quoted so it is not evaluated at definition
    # time (the ``X | Y`` form needs Python 3.10+ when evaluated).
    def _parse_json(self, text: str) -> "dict | list":
        """Parse JSON from Gemini response, handling markdown code blocks.

        Args:
            text: Raw response text, optionally wrapped in a ``` fence with
                an optional ``json`` language tag (any letter case).

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: If the remaining text is not valid JSON
                (a subclass of ValueError).
        """
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
            # Drop the language tag regardless of case ("json", "JSON", ...).
            if text[:4].lower() == "json":
                text = text[4:]
        if text.endswith("```"):
            text = text[:-3]

        return json.loads(text.strip())

    def split(self, vo_script: str) -> List[str]:
        """
        Split voice-over script into segments.

        Uses vo_segment_splitter.md prompt.

        Args:
            vo_script: Full voice-over script text.

        Returns:
            List of segment strings.

        Raises:
            ValueError: If the response is empty, is not valid JSON, or does
                not decode to a list.
        """
        logger.debug("ScriptProcessor: Splitting VO script into segments")

        prompt = self._prompts["split"].replace("{VO_SCRIPT}", vo_script)
        response = self._call_gemini(prompt)

        segments = self._parse_json(response)

        if not isinstance(segments, list):
            raise ValueError(f"Expected list of segments, got: {type(segments)}")

        logger.debug(f"ScriptProcessor: Split into {len(segments)} segments")
        return segments

    def get_metadata(self, segment: str) -> Dict:
        """
        Get video metadata for a segment.

        Uses vo_segment_enricher.md prompt.

        Args:
            segment: Single segment text.

        Returns:
            Metadata dict with visual_intent, category, subjects, etc.

        Raises:
            ValueError: If the response is empty, is not valid JSON, or does
                not decode to a dict.
        """
        logger.debug(f"ScriptProcessor: Getting metadata for: {segment[:50]}...")

        prompt = self._prompts["metadata"].replace("{SEGMENT_TEXT}", segment)
        response = self._call_gemini(prompt)

        metadata = self._parse_json(response)

        if not isinstance(metadata, dict):
            raise ValueError(f"Expected metadata dict, got: {type(metadata)}")

        logger.debug(f"ScriptProcessor: Metadata category={metadata.get('category')}")
        return metadata

    def match_video(self, metadata: Dict, library_items: List[Dict]) -> Dict:
        """
        Try to match segment metadata against video library.

        Uses vo_video_matcher.md prompt.

        Args:
            metadata: Segment metadata from get_metadata().
            library_items: List of video library items with match_keys.

        Returns:
            Match result dict with decision, matched_video_id, confidence, reason.

        Raises:
            ValueError: If the response is empty, is not valid JSON, or does
                not decode to a dict.
        """
        logger.debug(f"ScriptProcessor: Matching against {len(library_items)} library items")

        prompt = self._prompts["match"]
        prompt = prompt.replace("{SEGMENT_METADATA_JSON}", json.dumps(metadata, indent=2))
        prompt = prompt.replace("{VIDEO_LIBRARY_MATCH_KEYS_ARRAY_JSON}", json.dumps(library_items, indent=2))

        response = self._call_gemini(prompt)
        result = self._parse_json(response)

        if not isinstance(result, dict):
            raise ValueError(f"Expected match result dict, got: {type(result)}")

        decision = result.get("decision", "no_match")
        confidence = result.get("confidence", 0)
        logger.debug(f"ScriptProcessor: Match decision={decision}, confidence={confidence}")

        return result

    def generate_prompt(self, metadata: Dict) -> Dict:
        """
        Generate video generation specification.

        Uses vo_video_generator.md prompt.

        Args:
            metadata: Segment metadata from get_metadata().

        Returns:
            Video generation spec with scene_prompt, match_keys, video_parameters, etc.

        Raises:
            ValueError: If the response is empty, is not valid JSON, or does
                not decode to a dict.
        """
        # Logging must never break the pipeline: tolerate a missing, null or
        # non-string segment_text.
        preview = str(metadata.get("segment_text") or "")[:50]
        logger.debug(f"ScriptProcessor: Generating video prompt for: {preview}...")

        prompt = self._prompts["prompt_gen"].replace("{SEGMENT_METADATA_JSON}", json.dumps(metadata, indent=2))
        response = self._call_gemini(prompt)

        spec = self._parse_json(response)

        if not isinstance(spec, dict):
            raise ValueError(f"Expected spec dict, got: {type(spec)}")

        # video_parameters may be null or a non-dict in the model output;
        # only read "style" when it is really a dict.
        video_parameters = spec.get("video_parameters")
        style = video_parameters.get("style") if isinstance(video_parameters, dict) else None
        logger.debug(f"ScriptProcessor: Generated prompt for scene_type={style}")
        return spec

    def extract_match_keys(self, video_rows: List[Dict]) -> List[Dict]:
        """
        Extract match keys from raw video library rows.

        Args:
            video_rows: List of dicts (raw rows from Google Sheet)

        Returns:
            List of dicts with video_id and match_keys. video_id is the
            row's VIDEO_LINK, or "vid_<row index>" when the link is missing
            or empty. SUBJECTS, ENVIRONMENT and TONE are split on commas
            into lists; the other keys are copied as-is (default "").
        """
        result = []
        for idx, row in enumerate(video_rows):
            # An empty cell must fall back to the synthetic id as well,
            # otherwise several rows could share the id "".
            video_id = row.get("VIDEO_LINK") or f"vid_{idx}"

            match_keys = {
                "abstract_level": row.get("ABSTRACT_LEVEL", ""),
                "scene_prompt": row.get("SCENE_PROMPT", ""),
                "system_prompt": row.get("SYSTEM_PROMPT", ""),
                "negative_prompt": row.get("NEGATIVE_PROMPT", ""),
                "segment_text": row.get("SEGMENT_TEXT", ""),
                "visual_intent": row.get("VISUAL_INTENT", ""),
                "category": row.get("CATEGORY", ""),
                "sub_category": row.get("SUB_CATEGORY", ""),
                "scene_type": row.get("SCENE_TYPE", ""),
                # Parse comma-separated fields back to lists
                "subjects": self._split_csv(row.get("SUBJECTS")),
                "environment": self._split_csv(row.get("ENVIRONMENT")),
                "tone": self._split_csv(row.get("TONE")),
            }

            result.append({
                "video_id": video_id,
                "match_keys": match_keys,
            })

        return result

    def process_segment(self, segment: str, library_items: Optional[List[Dict]] = None) -> Dict:
        """
        Process a single segment end-to-end.

        Args:
            segment: Segment text.
            library_items: Optional list of library items for matching.

        Returns:
            Dict with:
                - segment: The input segment text
                - metadata: Segment metadata
                - match: Match result (or None if library empty)
                - prompt_spec: Video gen spec (or None if the match
                  decision is "reuse")
        """
        result = {"segment": segment, "metadata": None, "match": None, "prompt_spec": None}

        # Get metadata
        result["metadata"] = self.get_metadata(segment)

        # Try matching if library not empty
        if library_items:
            result["match"] = self.match_video(result["metadata"], library_items)

            if result["match"].get("decision") == "reuse":
                logger.debug(f"ScriptProcessor: Reusing video {result['match'].get('matched_video_id')}")
                return result

        # Generate prompt for new video
        result["prompt_spec"] = self.generate_prompt(result["metadata"])
        return result


# Module-level singleton: holds the shared ScriptProcessor, or None until
# get_script_processor() creates it (and again after reset_script_processor()).
_script_processor: Optional[ScriptProcessor] = None


def get_script_processor() -> ScriptProcessor:
    """Return the shared ScriptProcessor, creating it on first use.

    Returns:
        The module-wide instance. Repeated calls return the same object
        until the cached instance is cleared.
    """
    global _script_processor
    processor = _script_processor
    if processor is None:
        # Nothing cached yet: build the instance and remember it.
        processor = _script_processor = ScriptProcessor()
    return processor


def reset_script_processor() -> None:
    """Reset singleton (useful for testing).

    Clears the cached module-level instance, so the next call to
    get_script_processor() constructs a fresh ScriptProcessor.
    """
    global _script_processor
    _script_processor = None