| """ | |
| GPT-4o Prompt Generation API | |
| Structured, validated segment generation for video prompts | |
| """ | |
| from fastapi import APIRouter, HTTPException, UploadFile, File, Form | |
| from fastapi.responses import JSONResponse | |
| from pydantic import BaseModel | |
| from typing import Optional | |
| import base64 | |
| from utils.prompt_generator import ( | |
| VeoInputs, | |
| generate_segments_payload, | |
| split_script_into_segments | |
| ) | |
| from openai import OpenAI | |
| import os | |
| import json | |
| router = APIRouter() | |
class PromptGenerationRequest(BaseModel):
    """Request for prompt generation"""
    script: str
    style: str = "clean, lifestyle UGC"
    jsonFormat: str = "standard"
    continuationMode: bool = True
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = "single"
    cameraStyle: Optional[str] = "handheld steadicam"
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = "direct address"
    accentRegion: Optional[str] = None
    model: str = "gpt-4o"
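
# Illustrative sketch (not wired into any route): building a PromptGenerationRequest
# in Python. The script text and option values below are placeholders; only `script`
# is required, everything else falls back to its default.
def _example_prompt_generation_request() -> PromptGenerationRequest:
    return PromptGenerationRequest(
        script="Hey! Here's the one kitchen gadget I can't live without...",
        style="clean, lifestyle UGC",
        continuationMode=True,
        cameraStyle="handheld steadicam",
    )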

@router.post("/generate-prompts")  # route path here is illustrative; adjust to the app's URL scheme
async def generate_prompts_api(
    script: str = Form(...),
    style: str = Form("clean, lifestyle UGC"),
    jsonFormat: str = Form("standard"),
    continuationMode: str = Form("true"),
    voiceType: Optional[str] = Form(None),
    energyLevel: Optional[str] = Form(None),
    settingMode: str = Form("single"),
    cameraStyle: Optional[str] = Form("handheld steadicam"),
    energyArc: Optional[str] = Form(None),
    narrativeStyle: Optional[str] = Form("direct address"),
    accentRegion: Optional[str] = Form(None),
    model: str = Form("gpt-4o"),
    image: UploadFile = File(...)
):
| """ | |
| Generate structured video prompts using GPT-4o | |
| This endpoint: | |
| 1. Splits the script into 8-second segments | |
| 2. Generates detailed production prompts using GPT-4o | |
| 3. Validates the output against strict rules | |
| 4. Returns structured JSON for video generation | |
| Accepts multipart/form-data with: | |
| - script: The video script text | |
| - style: Visual style description | |
| - image: Character reference image (required) | |
| - Other optional parameters for fine-tuning | |
| Returns: | |
| Validated segments payload ready for video generation | |
| """ | |
    try:
        # Read the character reference image
        image_bytes = await image.read()
        print(f"📷 Received reference image: {len(image_bytes)} bytes")

        # Convert the continuationMode form string to a boolean
        continuation_mode = continuationMode.lower() == "true"

        # Create inputs from form data
        inputs = VeoInputs(
            script=script,
            style=style,
            jsonFormat=jsonFormat,
            continuationMode=continuation_mode,
            voiceType=voiceType if voiceType else None,
            energyLevel=energyLevel if energyLevel else None,
            settingMode=settingMode,
            cameraStyle=cameraStyle if cameraStyle else None,
            energyArc=energyArc if energyArc else None,
            narrativeStyle=narrativeStyle if narrativeStyle else None,
            accentRegion=accentRegion if accentRegion else None
        )

        # Check environment mode
        environment = os.getenv('ENVIRONMENT', 'dev').lower()
        is_dev_mode = environment in ('dev', 'development')

        # Generate the segments payload
        payload = generate_segments_payload(
            inputs=inputs,
            image_bytes=image_bytes,
            model=model
        )

        # Add environment mode to the response
        payload['environment'] = environment
        payload['is_dev_mode'] = is_dev_mode
        payload['max_segments'] = 2 if is_dev_mode else None

        # Validation warnings (if any) are logged to the console but don't block the response
        return JSONResponse(content=payload)

    except Exception as e:
        # API/network errors only (validation is non-blocking)
        raise HTTPException(
            status_code=500,
            detail=f"Prompt generation failed: {str(e)}"
        )
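
# Illustrative client sketch: calling the generation endpoint with multipart/form-data.
# The base URL, route path, and file name are assumptions; adjust them to wherever this
# router is actually mounted. Assumes the `requests` package on the caller's side.
def _example_generate_prompts_call():
    import requests  # assumed available in the calling environment

    with open("reference.jpg", "rb") as f:
        resp = requests.post(
            "http://localhost:8000/generate-prompts",  # assumed mount point
            data={
                "script": "Hey! Quick tip for busy mornings...",
                "style": "clean, lifestyle UGC",
                "continuationMode": "true",
            },
            files={"image": ("reference.jpg", f, "image/jpeg")},
            timeout=300,
        )
    resp.raise_for_status()
    return resp.json()  # validated segments payload plus environment metadata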

@router.post("/split-script")  # route path here is illustrative
async def split_script_api(
    script: str = Form(...),
    seconds_per_segment: int = Form(8),
    words_per_second: float = Form(2.2)
):
    """
    Split a script into segments for preview.

    Useful for checking how the script will be divided before generation.
    """
    try:
        segments = split_script_into_segments(
            script,
            seconds_per_segment=seconds_per_segment,
            words_per_second=words_per_second
        )
        return {
            "segments": segments,
            "count": len(segments),
            "total_words": sum(len(s.split()) for s in segments)
        }
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Script splitting failed: {str(e)}"
        )
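
# Worked example of the default pacing: at 2.2 words per second, an 8-second segment
# holds roughly 8 * 2.2 ≈ 17-18 words, so a ~70-word script would typically preview as
# about 4 segments (exact counts depend on how split_script_into_segments rounds).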

@router.post("/validate-payload")  # route path here is illustrative
async def validate_payload_api(payload: dict):
    """
    Validate a segments payload against strict rules.

    Use this to check whether a manually created or modified payload is valid.
    """
    try:
        from utils.prompt_generator import validate_segments_payload

        expected_segments = len(payload.get("segments", []))
        errors = validate_segments_payload(payload, expected_segments)

        if errors:
            return {
                "valid": False,
                "errors": errors
            }
        return {
            "valid": True,
            "message": "Payload is valid"
        }
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Validation failed: {str(e)}"
        )
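
# Illustrative client sketch: posting a payload to the validation endpoint as JSON.
# The route path is an assumption, and the inner structure of each segment is defined
# by utils.prompt_generator, so only the top-level "segments" key is shown here.
def _example_validate_payload_call(payload: dict):
    import requests  # assumed available in the calling environment

    resp = requests.post(
        "http://localhost:8000/validate-payload",  # assumed mount point
        json=payload,  # e.g. {"segments": [...generated segment objects...]}
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()  # {"valid": True, ...} or {"valid": False, "errors": [...]}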

@router.get("/status")  # route path here is illustrative
async def prompt_status():
    """
    Check whether GPT-4o prompt generation is available.
    """
    openai_key = os.getenv('OPENAI_API_KEY')
    return {
        "available": bool(openai_key),
        "message": "GPT-4o is configured" if openai_key
        else "Add OPENAI_API_KEY to .env.local"
    }

@router.post("/refine-continuity")  # route path here is illustrative
async def refine_prompt_for_continuity(
    segmentPrompt: str = Form(...),               # JSON string of the next segment
    lastFrame: UploadFile = File(...),            # last frame image from the previous video
    transcribedDialogue: str = Form(default=""),  # Whisper transcription of the previous segment
    expectedDialogue: str = Form(default="")      # expected dialogue for the previous segment
):
    """
    Refine a segment prompt to match the actual visual AND audio from the previous segment.

    This ensures perfect continuity by having GPT-4o analyze:
    1. The last frame (visual consistency)
    2. The transcribed dialogue (audio consistency - what was actually said)
    """
    try:
        # Read the last-frame image and encode it for the vision API
        image_bytes = await lastFrame.read()
        encoded_image = base64.b64encode(image_bytes).decode('utf-8')

        # Parse the segment prompt
        try:
            segment_data = json.loads(segmentPrompt)
        except json.JSONDecodeError:
            raise HTTPException(
                status_code=400,
                detail="Invalid JSON in segmentPrompt"
            )

        # Initialize the OpenAI client
        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

        # Build audio context if available
        audio_context = ""
        if transcribedDialogue.strip():
            audio_context = f"""
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
AUDIO CONTINUITY CONTEXT (WHAT WAS ACTUALLY SPOKEN)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Previous segment's dialogue (from Whisper transcription):
"{transcribedDialogue.strip()}"

Expected dialogue was:
"{expectedDialogue.strip() if expectedDialogue.strip() else 'Not provided'}"

IMPORTANT: The next segment should continue naturally from what was ACTUALLY said.
If there are differences between the expected and transcribed dialogue, use the TRANSCRIBED version
as the ground truth for continuity (it's what the viewer actually heard).
"""

        # Build the refinement prompt
        refinement_instructions = f"""
You are a video continuity expert. Your task is to UPDATE the provided segment prompt to ensure PERFECT VISUAL AND AUDIO CONTINUITY with the previous video segment.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
VISUAL CONTINUITY (from attached image)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Analyze the image carefully - this is the ACTUAL last frame from the previous video.

1. Update the character_description to match the ACTUAL person in the image:
   - Physical appearance (EXACT age, hair color/style, facial features, skin tone)
   - Clothing (EXACTLY what they're wearing - color, style, pattern)
   - Current state (their actual expression and posture at this moment)
   - Voice matching (adjust to match their appearance)

2. Update the scene_continuity to match the ACTUAL environment:
   - Environment (describe what you see - bedroom, office, outdoor, etc.)
   - Camera position (maintain the SAME angle/framing)
   - Lighting state (match the EXACT lighting conditions in the image)
   - Props and background elements (describe what's actually visible)
   - Spatial relationships (match the actual layout)
{audio_context}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
ORIGINAL PROMPT TO UPDATE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{json.dumps(segment_data, indent=2)}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
CRITICAL RULES
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
- Be EXTREMELY specific about what you see in the image
- If the image shows a young woman with red hair, describe EXACTLY that
- If it's a sunset beach scene, describe EXACTLY that setting
- If they're wearing a beige blazer, describe EXACTLY that clothing
- Match colors, styles, and details PRECISELY to what's visible
- Maintain the SAME camera angle and distance
- Keep the action_timeline.dialogue EXACTLY as provided (this is the NEXT segment's dialogue)
- Update segment_info.continuity_markers to reflect the visual state
- Adjust synchronized_actions to fit the actual character appearance

🚨 CRITICAL: NO BLUR TRANSITIONS AT SEGMENT START 🚨
- The video MUST start immediately at 0:00 with a SHARP, CLEAR, IN-FOCUS frame
- NO fade-in, NO blur transition, NO gradual focus effect at the start
- The first frame (0:00) must be as clear and sharp as any other frame
- camera_movement MUST describe movement that starts from a clear, sharp state

The goal is SEAMLESS video extension with ZERO visual or audio discontinuity.

Return ONLY the updated JSON segment object with the same structure. No explanation, just the corrected JSON.
"""
| print(f"π Refining prompt for visual continuity...") | |
| # Call GPT-4o with vision | |
| response = client.chat.completions.create( | |
| model="gpt-4o", | |
| messages=[ | |
| { | |
| "role": "user", | |
| "content": [ | |
| { | |
| "type": "text", | |
| "text": refinement_instructions | |
| }, | |
| { | |
| "type": "image_url", | |
| "image_url": { | |
| "url": f"data:image/jpeg;base64,{encoded_image}" | |
| } | |
| } | |
| ] | |
| } | |
| ], | |
| response_format={"type": "json_object"}, | |
| temperature=0.3, # Lower temperature for precise matching | |
| ) | |
| # Parse the response | |
| refined_prompt = json.loads(response.choices[0].message.content) | |
| print(f"β Prompt refined for visual continuity") | |
| return JSONResponse(content={ | |
| "refined_prompt": refined_prompt, | |
| "original_prompt": segment_data | |
| }) | |
    except HTTPException:
        # Preserve intentional HTTP errors (e.g. the 400 for invalid segmentPrompt JSON)
        raise
    except Exception as e:
        print(f"❌ Prompt refinement error: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Prompt refinement failed: {str(e)}"
        )
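
# Illustrative client sketch: requesting a continuity refinement for the next segment.
# segmentPrompt is sent as a JSON string, the previous video's last frame as a file, and
# the Whisper/expected dialogue as plain form fields. The route path, file names, and
# dialogue strings below are placeholders; adjust to the actual deployment.
def _example_refine_continuity_call(next_segment: dict, last_frame_path: str):
    import requests  # assumed available in the calling environment

    with open(last_frame_path, "rb") as f:
        resp = requests.post(
            "http://localhost:8000/refine-continuity",  # assumed mount point
            data={
                "segmentPrompt": json.dumps(next_segment),
                "transcribedDialogue": "So here's what I do every single morning...",
                "expectedDialogue": "Here's what I do every morning...",
            },
            files={"lastFrame": ("last_frame.jpg", f, "image/jpeg")},
            timeout=300,
        )
    resp.raise_for_status()
    return resp.json()["refined_prompt"]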