userIdc2024 committed on
Commit d856b59 · verified · 1 Parent(s): 2f44af4

Upload 5 files

src/audio_gen.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ from typing import List, Optional
+ import replicate
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def audio_generation(
+     scripts: str,
+     voice_id: str,
+     speed: float,
+     volume: float,
+     pitch: int,
+     emotion: str
+ ) -> Optional[str]:
+     output = replicate_client.run(
+         "minimax/speech-02-turbo",
+         input={
+             "text": scripts,
+             "pitch": pitch,
+             "speed": speed,
+             "volume": volume,
+             "bitrate": 128000,
+             "channel": "mono",
+             "emotion": emotion,
+             "voice_id": voice_id,
+             "sample_rate": 32000,
+             "language_boost": "English",
+             "english_normalization": True
+         }
+     )
+     # Replicate may return a list of file outputs, a bare URL string, or a
+     # single object with a .url attribute; normalize all three to one URL.
+     urls: List[str] = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
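For reference, a minimal usage sketch (not part of the commit): the voice_id, emotion, and numeric settings below are illustrative placeholders for the minimax model, not confirmed presets.

audio_url = audio_generation(
    scripts="Welcome back! Today we cover three quick hooks for short-form ads.",
    voice_id="Wise_Woman",  # hypothetical voice preset
    speed=1.0,              # assumed neutral tempo
    volume=1.0,
    pitch=0,                # assumed no pitch shift
    emotion="happy",        # assumed supported emotion label
)
print(audio_url)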
src/caption_gen.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ import base64
+ import replicate
+
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def caption_generation(
+     video: bytes,
+     caption_size: int,
+     highlight_color: str):
+     # The model expects a data URI, so base64-encode the raw video bytes first.
+     encoded_video = base64.b64encode(video).decode("utf-8")
+     output = replicate_client.run(
+         "shreejalmaharjan-27/tiktok-short-captions:46bf1c12c77ad1782d6f87828d4d8ba4d48646b8e1271b490cb9e95ccdbc4504",
+         input={
+             "model": "large-v3",
+             "video": f"data:video/mp4;base64,{encoded_video}",
+             "language": "auto",
+             "temperature": 0,
+             "caption_size": caption_size,
+             "highlight_color": highlight_color,
+             "suppress_tokens": "-1",
+             "logprob_threshold": -1,
+             "no_speech_threshold": 0.6,
+             "condition_on_previous_text": True,
+             "compression_ratio_threshold": 2.4,
+             "temperature_increment_on_fallback": 0.2
+         }
+     )
+
+     # Normalize the Replicate output (list, URL string, or object with .url).
+     urls = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
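A minimal usage sketch, assuming a local clip at clip.mp4 (hypothetical path); the caption_size and highlight_color values are illustrative, not model defaults:

with open("clip.mp4", "rb") as f:
    video_bytes = f.read()

captioned_url = caption_generation(
    video=video_bytes,
    caption_size=85,            # illustrative size
    highlight_color="#39E508",  # illustrative hex color
)
print(captioned_url)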
src/image_gen.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ from typing import List
+ import replicate
+ import base64
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def image_generation_change_background(
+     img_bytes: bytes,
+     prompt: str,
+     aspect_ratio: str):
+     encoded_image = base64.b64encode(img_bytes).decode("utf-8")
+
+     output = replicate_client.run(
+         "google/nano-banana",
+         input={
+             "image_input": [f"data:image/jpeg;base64,{encoded_image}"],
+             "prompt": prompt,
+             "aspect_ratio": aspect_ratio,
+         }
+     )
+     # Normalize the Replicate output (list, URL string, or object with .url).
+     urls: List[str] = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
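A minimal usage sketch; product.jpg is a hypothetical input file, and the aspect_ratio value is an assumption about what the model accepts:

with open("product.jpg", "rb") as f:
    img = f.read()

new_bg_url = image_generation_change_background(
    img_bytes=img,
    prompt="Place the product on a marble countertop with soft morning light",
    aspect_ratio="9:16",  # assumed supported value
)
print(new_bg_url)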
src/prompt_generator.py ADDED
@@ -0,0 +1,240 @@
+ from typing import List, Optional, Dict, Any
+ from pydantic import BaseModel, Field
+ from openai import OpenAI
+ import os
+ import re
+ from dotenv import load_dotenv
+ import base64
+
+ load_dotenv()
+
+ gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+ class VeoInputs(BaseModel):
+     script: str
+     style: str
+     jsonFormat: str = 'standard'
+     continuationMode: bool = True
+     voiceType: Optional[str] = None
+     energyLevel: Optional[str] = None
+     settingMode: str = 'single'
+     cameraStyle: Optional[str] = None
+     energyArc: Optional[str] = None
+     narrativeStyle: Optional[str] = None
+     accentRegion: Optional[str] = None
+
+ class ContinuityMarkers(BaseModel):
+     start_position: str
+     end_position: str
+     start_expression: str
+     end_expression: str
+     start_gesture: str
+     end_gesture: str
+     location_status: str
+
+ class SegmentInfo(BaseModel):
+     segment_number: int
+     total_segments: int
+     duration: str
+     location: str
+     continuity_markers: ContinuityMarkers
+
+ class CharacterDescription(BaseModel):
+     physical: str  # 200+ words, identical across segments (required by the validator below)
+     clothing: str  # 150+ words, identical across segments (required by the validator below)
+     current_state: str  # 100+ words, segment-specific
+     voice_matching: str  # 100+ words, segment-specific
+
+ class SynchronizedActions(BaseModel):
+     # Use legal Python identifiers; map to exact JSON keys with aliases
+     f0000_0002: str = Field(alias="0:00-0:02")
+     f0002_0004: str = Field(alias="0:02-0:04")
+     f0004_0006: str = Field(alias="0:04-0:06")
+     f0006_0008: str = Field(alias="0:06-0:08")
+
+     class Config:
+         populate_by_name = True
+
+ class ActionTimeline(BaseModel):
+     dialogue: str
+     synchronized_actions: SynchronizedActions
+     micro_expressions: str  # 50+ words
+     breathing_rhythm: str
+     location_transition: str
+     continuity_checkpoint: str
+
+ class SceneContinuity(BaseModel):
+     environment: str  # 250+ words
+     camera_position: str  # 75+ words
+     camera_movement: str  # detailed movement path
+     lighting_state: str  # 50+ words
+     props_in_frame: str  # 75+ words (required by the validator below)
+     background_elements: str  # 50+ words
+     spatial_relationships: str
+
+ class Segment(BaseModel):
+     segment_info: SegmentInfo
+     character_description: CharacterDescription
+     scene_continuity: SceneContinuity
+     action_timeline: ActionTimeline
+
+ class SegmentsPayload(BaseModel):
+     segments: List[Segment]
+
+ def split_script_into_segments(script: str, seconds_per_segment: int = 8, words_per_second: float = 2.2) -> List[str]:
+     """
+     Packs sentences into ~seconds * words_per_second buckets (≈ 17-20 words/8s).
+     Adjust words_per_second if your VO tempo differs.
+     """
+     sentences = re.split(r'(?<=[.!?])\s+', script.strip())
+     sentences = [s.strip() for s in sentences if s.strip()]
+     target = max(14, int(seconds_per_segment * words_per_second))  # minimal guard
+
+     segments, cur, cur_len = [], [], 0
+     for s in sentences:
+         w = len(s.split())
+         if cur and cur_len + w > target:
+             segments.append(" ".join(cur))
+             cur, cur_len = [], 0
+         cur.append(s)
+         cur_len += w
+     if cur:
+         segments.append(" ".join(cur))
+     return segments or [script.strip()]
+
+ def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
+     N = len(segment_texts)
+     knobs = inputs.model_dump()
+     header = f"""
+ You are a senior performance-marketing video director who writes segment-accurate, production-grade JSON prompts for Veo 3.
+ Return ONLY JSON that parses into the provided schema. Do not add fields. No markdown.
+
+ Task: Build prompts for exactly {N} segments of 8 seconds each.
+ Hard rules for EVERY segment:
+ - "duration" MUST be "00:00-00:08"
+ - "physical" = 200+ words and "clothing" = 150+ words, EXACTLY identical across all segments
+ - "current_state" = 100+ words, segment-specific
+ - "voice_matching" = 100+ words, segment-specific
+ - "environment" = 250+ words; "camera_position" = 75+ words; "lighting_state" = 50+ words min
+ - "camera_movement" = concrete, timestamped path (pan/tilt/dolly/handheld/steadicam)
+ - "synchronized_actions" must have exactly these keys: "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
+ - Dialogue must fit in 8s naturally with breath points.
+ - If continuationMode is true, include a continuity checkpoint aligning next segment's start.
+ - Set "segment_info.total_segments" = {N} on each segment.
+ - Base every selection on the provided character image.
+ FULL SCRIPT:
+ \"\"\"{inputs.script.strip()}\"\"\"
+
+ AUTHORITATIVE SETTINGS (must be reflected):
+ {knobs}
+
+ SEGMENT LINES (cover in exactly 8 seconds each):
+ """
+     seg_lines = "\n".join([f"- Segment {i+1}: {t}" for i, t in enumerate(segment_texts)])
+
+     footer = """
+ OUTPUT:
+ Return JSON only as:
+ {
+   "segments": [ { ... per-segment object exactly matching the schema ... } ]
+ }
+ """
+     return header + seg_lines + footer
+
+
+ # ---------- Validator (segment count, durations, keys, word counts, uniformity) ----------
+
+ MIN_WORDS = {
+     ("character_description", "physical"): 200,
+     ("character_description", "clothing"): 150,
+     ("character_description", "current_state"): 100,
+     ("character_description", "voice_matching"): 100,
+     ("scene_continuity", "environment"): 250,
+     ("scene_continuity", "camera_position"): 75,
+     ("scene_continuity", "lighting_state"): 50,
+     ("scene_continuity", "props_in_frame"): 75,
+     ("scene_continuity", "background_elements"): 50,
+     ("action_timeline", "micro_expressions"): 50,
+ }
+
+ def _word_count(text: str) -> int:
+     return len(re.findall(r"\b\w+\b", text or ""))
+
+ def validate_segments_payload(payload: Dict[str, Any], expected_segments: int) -> List[str]:
+     errors: List[str] = []
+     segs = payload.get("segments", [])
+     if len(segs) != expected_segments:
+         errors.append(f"Expected {expected_segments} segments, got {len(segs)}.")
+
+     required_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}
+     physical_blocks, clothing_blocks = [], []
+
+     for i, seg in enumerate(segs, start=1):
+         si = seg.get("segment_info", {})
+         if si.get("duration") != "00:00-00:08":
+             errors.append(f"Segment {i}: duration must be 00:00-00:08.")
+         if si.get("total_segments") != expected_segments:
+             errors.append(f"Segment {i}: total_segments should be {expected_segments}, got {si.get('total_segments')}.")
+
+         sync = seg.get("action_timeline", {}).get("synchronized_actions", {})
+         if set(sync.keys()) != required_sync_keys:
+             errors.append(f"Segment {i}: synchronized_actions must have keys {sorted(required_sync_keys)}.")
+
+         # Word-count checks
+         for (section, field), minw in MIN_WORDS.items():
+             text = seg.get(section, {}).get(field, "")
+             wc = _word_count(text)
+             if wc < minw:
+                 errors.append(f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc}).")
+
+         ch = seg.get("character_description", {})
+         physical_blocks.append(ch.get("physical", ""))
+         clothing_blocks.append(ch.get("clothing", ""))
+
+     # Uniformity across segments
+     if expected_segments > 1:
+         if len(set(physical_blocks)) > 1:
+             errors.append("`character_description.physical` must be EXACTLY identical across all segments.")
+         if len(set(clothing_blocks)) > 1:
+             errors.append("`character_description.clothing` must be EXACTLY identical across all segments.")
+
+     return errors
+
+ def generate_segments_payload(
+     inputs: VeoInputs,
+     image_path: str,
+     model: str = "gpt-4o",
+ ) -> Dict[str, Any]:
+     segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
+     N = len(segment_texts)
+
+     # Read the character reference image from disk and base64-encode it.
+     with open(image_path, "rb") as f:
+         encoded_image = base64.b64encode(f.read()).decode("utf-8")
+
+     def _call_llm(user_prompt: str):
+         return gpt_client.beta.chat.completions.parse(
+             model=model,
+             response_format=SegmentsPayload,
+             messages=[
+                 {"role": "system", "content": "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."},
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": user_prompt},
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": f"data:image/jpeg;base64,{encoded_image}"
+                             },
+                         },
+                     ],
+                 },
+             ],
+         ).choices[0].message.parsed
+
+     user_prompt = build_prompt(inputs, segment_texts)
+     parsed_obj = _call_llm(user_prompt)
+     payload = parsed_obj.model_dump(by_alias=True)
+
+     return payload
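A minimal end-to-end sketch of this module, assuming a character reference image saved at character.jpg (hypothetical path); it builds the payload and then re-checks it with the module's own validator:

inputs = VeoInputs(
    script="Stop scrolling. This little stand fixes your posture in minutes. Try it today!",
    style="UGC testimonial",
    cameraStyle="handheld",
)
segments_payload = generate_segments_payload(inputs, image_path="character.jpg")

expected = len(split_script_into_segments(inputs.script, seconds_per_segment=8))
problems = validate_segments_payload(segments_payload, expected_segments=expected)
print(problems or f"{expected} segment prompts ready for Veo 3.")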
src/video_gen.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ import base64
+ from typing import List
+ import replicate
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def video_generation(
+     image_bytes: bytes,
+     prompt: str,
+     aspect_ratio: str,
+     resolution: str
+ ):
+     encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+     output = replicate_client.run(
+         "google/veo-3",
+         input={
+             "image": f"data:image/png;base64,{encoded_image}",
+             "prompt": prompt,
+             "resolution": resolution,
+             "aspect_ratio": aspect_ratio
+         }
+     )
+     # Normalize the Replicate output (list, URL string, or object with .url).
+     urls: List[str] = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
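A minimal sketch tying the pieces together; frame.png is a hypothetical first-frame file, the resolution and aspect_ratio values are assumptions about what google/veo-3 accepts, and segments_payload stands in for the dict returned by generate_segments_payload:

import json

with open("frame.png", "rb") as f:
    frame = f.read()

# segments_payload would come from prompt_generator.generate_segments_payload(...)
segment_prompt = json.dumps(segments_payload["segments"][0])

video_url = video_generation(
    image_bytes=frame,
    prompt=segment_prompt,
    aspect_ratio="9:16",  # assumed supported value
    resolution="720p",    # assumed supported value
)
print(video_url)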