Upload 5 files
- src/audio_gen.py +44 -0
- src/caption_gen.py +42 -0
- src/image_gen.py +35 -0
- src/prompt_generator.py +235 -0
- src/video_gen.py +37 -0
src/audio_gen.py (ADDED)
@@ -0,0 +1,44 @@
+import os
+from typing import List
+import replicate
+from dotenv import load_dotenv
+
+
+load_dotenv()
+replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+def audio_generation(
+    scripts: str,
+    voice_id: str,
+    speed: float,
+    volume: float,
+    pitch: int,
+    emotion: str
+):
+    output = replicate_client.run(
+        "minimax/speech-02-turbo",
+        input={
+            "text": f"""{scripts}""",
+            "pitch": pitch,
+            "speed": speed,
+            "volume": volume,
+            "bitrate": 128000,
+            "channel": "mono",
+            "emotion": emotion,
+            "voice_id": voice_id,
+            "sample_rate": 32000,
+            "language_boost": "English",
+            "english_normalization": True
+        }
+    )
+    urls: List[str] = []
+    if isinstance(output, list) and output:
+        first = output[0]
+        url = getattr(first, "url", str(first))
+        urls = [url]
+    elif isinstance(output, str):
+        urls = [output]
+    elif hasattr(output, "url"):
+        urls = [getattr(output, "url")]
+    if urls:
+        return urls[0]
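
A minimal usage sketch for audio_generation, assuming the src package is importable and REPLICATE_API_KEY is set in the environment; the script text and the voice_id / speed / volume / pitch / emotion values below are illustrative assumptions, not values taken from this repository:

from src.audio_gen import audio_generation

# All argument values are assumptions; check the minimax/speech-02-turbo model page
# for the voice ids and emotion names it actually accepts.
audio_url = audio_generation(
    scripts="Meet the charger that refills your phone in ten minutes.",
    voice_id="Wise_Woman",
    speed=1.0,
    volume=1.0,
    pitch=0,
    emotion="happy",
)
print(audio_url)  # URL string, or None if the Replicate output had an unexpected shape
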
src/caption_gen.py (ADDED)
@@ -0,0 +1,42 @@
+import os
+import base64
+import replicate
+
+from dotenv import load_dotenv
+
+
+load_dotenv()
+replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+def caption_generation(
+    video: bytes,
+    caption_size: int,
+    highlight_color: str):
+    # The model expects a base64 data URI, so raw bytes must be encoded first.
+    encoded_video = base64.b64encode(video).decode("utf-8") if isinstance(video, (bytes, bytearray)) else video
+    output = replicate_client.run(
+        "shreejalmaharjan-27/tiktok-short-captions:46bf1c12c77ad1782d6f87828d4d8ba4d48646b8e1271b490cb9e95ccdbc4504",
+        input={
+            "model": "large-v3",
+            "video": f"data:video/mp4;base64,{encoded_video}",
+            "language": "auto",
+            "temperature": 0,
+            "caption_size": caption_size,
+            "highlight_color": highlight_color,
+            "suppress_tokens": "-1",
+            "logprob_threshold": -1,
+            "no_speech_threshold": 0.6,
+            "condition_on_previous_text": True,
+            "compression_ratio_threshold": 2.4,
+            "temperature_increment_on_fallback": 0.2
+        }
+    )
+
+    urls = []
+    if isinstance(output, list) and output:
+        first = output[0]
+        url = getattr(first, "url", str(first))
+        urls = [url]
+    elif isinstance(output, str):
+        urls = [output]
+    elif hasattr(output, "url"):
+        urls = [getattr(output, "url")]
+    if urls:
+        return urls[0]
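
A usage sketch for caption_generation, assuming the src package is importable and clip.mp4 is a hypothetical local file; the function base64-encodes the raw bytes before building the data URI, so the caller can pass the video bytes directly:

from src.caption_gen import caption_generation

with open("clip.mp4", "rb") as f:          # hypothetical input video
    video_bytes = f.read()

captioned_url = caption_generation(
    video=video_bytes,
    caption_size=75,            # assumed value; consult the model's input schema
    highlight_color="yellow",   # assumed value
)
print(captioned_url)
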
src/image_gen.py (ADDED)
@@ -0,0 +1,35 @@
+import os
+from typing import List
+import replicate
+import base64
+from dotenv import load_dotenv
+
+
+load_dotenv()
+replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+def image_generation_change_background(
+    img_bytes: bytes,
+    prompt: str,
+    aspect_ratio: str):
+    encoded_image = base64.b64encode(img_bytes).decode("utf-8")
+
+    output = replicate_client.run(
+        "google/nano-banana",
+        input={
+            "image_input": [f"data:image/jpeg;base64,{encoded_image}"],
+            "prompt": prompt,
+            "aspect_ratio": aspect_ratio,
+        }
+    )
+    urls: List[str] = []
+    if isinstance(output, list) and output:
+        first = output[0]
+        url = getattr(first, "url", str(first))
+        urls = [url]
+    elif isinstance(output, str):
+        urls = [output]
+    elif hasattr(output, "url"):
+        urls = [getattr(output, "url")]
+    if urls:
+        return urls[0]
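
A usage sketch for image_generation_change_background, with a hypothetical input file and prompt; the "9:16" aspect ratio is an assumption about what google/nano-banana accepts on Replicate:

from src.image_gen import image_generation_change_background

with open("product.jpg", "rb") as f:       # hypothetical source image
    img = f.read()

new_image_url = image_generation_change_background(
    img_bytes=img,
    prompt="Place the product on a marble countertop in soft morning light.",
    aspect_ratio="9:16",
)
print(new_image_url)
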
src/prompt_generator.py (ADDED)
@@ -0,0 +1,235 @@
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, Field
+from openai import OpenAI
+import os
+import re
+from dotenv import load_dotenv
+import base64
+
+load_dotenv()
+
+gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+class VeoInputs(BaseModel):
+    script: str
+    style: str
+    jsonFormat: str = 'standard'
+    continuationMode: bool = True
+    voiceType: Optional[str] = None
+    energyLevel: Optional[str] = None
+    settingMode: str = 'single'
+    cameraStyle: Optional[str] = None
+    energyArc: Optional[str] = None
+    narrativeStyle: Optional[str] = None
+    accentRegion: Optional[str] = None
+
+class ContinuityMarkers(BaseModel):
+    start_position: str
+    end_position: str
+    start_expression: str
+    end_expression: str
+    start_gesture: str
+    end_gesture: str
+    location_status: str
+
+class SegmentInfo(BaseModel):
+    segment_number: int
+    total_segments: int
+    duration: str
+    location: str
+    continuity_markers: ContinuityMarkers
+
+class CharacterDescription(BaseModel):
+    current_state: str   # 100+ words, segment-specific
+    voice_matching: str  # 100+ words, segment-specific
+
+class SynchronizedActions(BaseModel):
+    # Use legal Python identifiers; map to exact JSON keys with aliases
+    f0000_0002: str = Field(alias="0:00-0:02")
+    f0002_0004: str = Field(alias="0:02-0:04")
+    f0004_0006: str = Field(alias="0:04-0:06")
+    f0006_0008: str = Field(alias="0:06-0:08")
+
+    class Config:
+        populate_by_name = True
+
+class ActionTimeline(BaseModel):
+    dialogue: str
+    synchronized_actions: SynchronizedActions
+    micro_expressions: str  # 50+ words
+    breathing_rhythm: str
+    location_transition: str
+    continuity_checkpoint: str
+
+class SceneContinuity(BaseModel):
+    environment: str          # 250+ words
+    camera_position: str      # 75+ words
+    camera_movement: str      # detailed movement path
+    lighting_state: str       # 50+ words
+    background_elements: str  # 50+ words
+    spatial_relationships: str
+
+class Segment(BaseModel):
+    segment_info: SegmentInfo
+    character_description: CharacterDescription
+    scene_continuity: SceneContinuity
+    action_timeline: ActionTimeline
+
+class SegmentsPayload(BaseModel):
+    segments: List[Segment]
+
+def split_script_into_segments(script: str, seconds_per_segment: int = 8, words_per_second: float = 2.2) -> List[str]:
+    """
+    Packs sentences into ~seconds * words_per_second buckets (≈ 17-20 words/8s).
+    Adjust words_per_second if your VO tempo differs.
+    """
+    sentences = re.split(r'(?<=[.!?])\s+', script.strip())
+    sentences = [s.strip() for s in sentences if s.strip()]
+    target = max(14, int(seconds_per_segment * words_per_second))  # minimal guard
+
+    segments, cur, cur_len = [], [], 0
+    for s in sentences:
+        w = len(s.split())
+        if cur and cur_len + w > target:
+            segments.append(" ".join(cur))
+            cur, cur_len = [], 0
+        cur.append(s)
+        cur_len += w
+    if cur:
+        segments.append(" ".join(cur))
+    return segments or [script.strip()]
+
+def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
+    N = len(segment_texts)
+    knobs = inputs.model_dump()
+    header = f"""
+You are a senior performance-marketing video director who writes segment-accurate, production-grade JSON prompts for Veo 3.
+Return ONLY JSON that parses into the provided schema. Do not add fields. No markdown.
+
+Task: Build prompts for exactly {N} segments of 8 seconds each.
+Hard rules for EVERY segment:
+- "duration" MUST be "00:00-00:08"
+- "current_state" = 100+ words, segment-specific
+- "voice_matching" = 100+ words, segment-specific
+- "environment" = 250+ words; "camera_position" = 75+ words; "lighting_state" = 50+ words min
+- "camera_movement" = concrete, timestamped path (pan/tilt/dolly/handheld/steadicam)
+- "synchronized_actions" must have exactly these keys: "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
+- Dialogue must fit in 8s naturally with breath points.
+- If continuationMode is true, include a continuity checkpoint aligning next segment’s start.
+- Set "segment_info.total_segments" = {N} on each segment.
+- Base every character-dependent choice on the provided character image.
+FULL SCRIPT:
+\"\"\"{inputs.script.strip()}\"\"\"
+
+AUTHORITATIVE SETTINGS (must be reflected):
+{knobs}
+
+SEGMENT LINES (cover in exactly 8 seconds each):
+"""
+    seg_lines = "\n".join([f"- Segment {i+1}: {t}" for i, t in enumerate(segment_texts)])
+
+    footer = """
+OUTPUT:
+Return JSON only as:
+{
+  "segments": [ { ... per-segment object exactly matching the schema ... } ]
+}
+"""
+    return header + seg_lines + footer
+
+
+# ---------- Validator (segment count, durations, keys, word counts, uniformity) ----------
+
+MIN_WORDS = {
+    ("character_description", "physical"): 200,
+    ("character_description", "clothing"): 150,
+    ("character_description", "current_state"): 100,
+    ("character_description", "voice_matching"): 100,
+    ("scene_continuity", "environment"): 250,
+    ("scene_continuity", "camera_position"): 75,
+    ("scene_continuity", "lighting_state"): 50,
+    ("scene_continuity", "props_in_frame"): 75,
+    ("scene_continuity", "background_elements"): 50,
+    ("action_timeline", "micro_expressions"): 50,
+}
+
+def _word_count(text: str) -> int:
+    return len(re.findall(r"\b\w+\b", text or ""))
+
+def validate_segments_payload(payload: Dict[str, Any], expected_segments: int) -> List[str]:
+    errors: List[str] = []
+    segs = payload.get("segments", [])
+    if len(segs) != expected_segments:
+        errors.append(f"Expected {expected_segments} segments, got {len(segs)}.")
+
+    # Each segment covers an 8-second window, matching the SynchronizedActions schema keys.
+    required_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}
+    physical_blocks, clothing_blocks = [], []
+
+    for i, seg in enumerate(segs, start=1):
+        si = seg.get("segment_info", {})
+        if si.get("duration") != "00:00-00:08":
+            errors.append(f"Segment {i}: duration must be 00:00-00:08.")
+        if si.get("total_segments") != expected_segments:
+            errors.append(f"Segment {i}: total_segments should be {expected_segments}, got {si.get('total_segments')}.")
+
+        sync = seg.get("action_timeline", {}).get("synchronized_actions", {})
+        if set(sync.keys()) != required_sync_keys:
+            errors.append(f"Segment {i}: synchronized_actions must have keys {sorted(required_sync_keys)}.")
+
+        # Word-count checks; skip fields the current schema does not emit
+        # (e.g. physical, clothing, props_in_frame).
+        for (section, field), minw in MIN_WORDS.items():
+            section_data = seg.get(section, {})
+            if field not in section_data:
+                continue
+            wc = _word_count(section_data.get(field, ""))
+            if wc < minw:
+                errors.append(f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc}).")
+
+        ch = seg.get("character_description", {})
+        physical_blocks.append(ch.get("physical", ""))
+        clothing_blocks.append(ch.get("clothing", ""))
+
+    # Uniformity across segments (only meaningful when these fields are present).
+    if expected_segments > 1:
+        if any(physical_blocks) and len(set(physical_blocks)) > 1:
+            errors.append("`character_description.physical` must be EXACTLY identical across all segments.")
+        if any(clothing_blocks) and len(set(clothing_blocks)) > 1:
+            errors.append("`character_description.clothing` must be EXACTLY identical across all segments.")
+
+    return errors
+
+def generate_segments_payload(
+    inputs: VeoInputs,
+    image_path=None,  # raw bytes of the character image, or a filesystem path to it
+    model: str = "gpt-4o",
+) -> Dict[str, Any]:
+    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
+    N = len(segment_texts)
+    print(N)
+
+    # Accept either raw bytes or a path so callers can pass an uploaded file directly.
+    if isinstance(image_path, (bytes, bytearray)):
+        image_bytes = image_path
+    else:
+        with open(image_path, "rb") as f:
+            image_bytes = f.read()
+    encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+
+    def _call_llm(user_prompt: str):
+        return gpt_client.beta.chat.completions.parse(
+            model=model,
+            response_format=SegmentsPayload,
+            messages=[
+                {"role": "system", "content": "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."},
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": user_prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{encoded_image}"
+                            },
+                        },
+                    ],
+                },
+            ],
+        ).choices[0].message.parsed
+
+    user_prompt = build_prompt(inputs, segment_texts)
+    parsed_obj = _call_llm(user_prompt)
+    payload = parsed_obj.model_dump(by_alias=True)
+
+    return payload
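
Two short usage sketches for this module, assuming the src package is importable and OPENAI_API_KEY is set. First, how split_script_into_segments buckets sentences (the script text is made up); with the default 8 s × 2.2 words/s ≈ 17-word target, each bucket holds roughly one to two short sentences:

from src.prompt_generator import split_script_into_segments

script = (
    "This charger fills your phone in ten minutes. "
    "No more waiting around before you leave the house. "
    "It fits in your pocket and works with any USB-C phone. "
    "Grab yours before the launch discount ends."
)
for i, segment in enumerate(split_script_into_segments(script), start=1):
    print(i, len(segment.split()), "words:", segment)

Second, a hypothetical end-to-end call that builds VeoInputs, generates the per-segment payload from a character image, and runs the validator on the result; character.jpg and the style string are assumptions:

from src.prompt_generator import (
    VeoInputs,
    generate_segments_payload,
    split_script_into_segments,
    validate_segments_payload,
)

inputs = VeoInputs(
    script="This charger fills your phone in ten minutes. It fits in your pocket.",
    style="UGC testimonial",
)
with open("character.jpg", "rb") as f:     # hypothetical character reference image
    character_image = f.read()

payload = generate_segments_payload(inputs, image_path=character_image)
expected = len(split_script_into_segments(inputs.script, seconds_per_segment=8))
problems = validate_segments_payload(payload, expected_segments=expected)
print(problems or "payload passed validation")
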
src/video_gen.py (ADDED)
@@ -0,0 +1,37 @@
+import os
+import base64
+from typing import List
+import replicate
+from dotenv import load_dotenv
+
+
+load_dotenv()
+replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+def video_generation(
+    image_bytes,
+    prompt: str,
+    aspect_ratio: str,
+    resolution: str
+):
+    encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+    output = replicate_client.run(
+        "google/veo-3",
+        input={
+            "image": f"data:image/png;base64,{encoded_image}",
+            "prompt": prompt,
+            "resolution": resolution,
+            "aspect_ratio": aspect_ratio
+        }
+    )
+    urls: List[str] = []
+    if isinstance(output, list) and output:
+        first = output[0]
+        url = getattr(first, "url", str(first))
+        urls = [url]
+    elif isinstance(output, str):
+        urls = [output]
+    elif hasattr(output, "url"):
+        urls = [getattr(output, "url")]
+    if urls:
+        return urls[0]
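
A usage sketch for video_generation, assuming the src package is importable; the first-frame image, prompt text, and the aspect_ratio / resolution values are assumptions about what google/veo-3 accepts on Replicate:

from src.video_gen import video_generation

with open("first_frame.png", "rb") as f:   # hypothetical first-frame image
    frame = f.read()

video_url = video_generation(
    image_bytes=frame,
    prompt="A young woman holds the charger toward the camera and smiles.",
    aspect_ratio="16:9",
    resolution="1080p",
)
print(video_url)
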