userIdc2024 committed on
Commit d856b59 · verified · 1 Parent(s): 2f44af4

Upload 5 files

src/audio_gen.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ from typing import List, Optional
+ import replicate
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def audio_generation(
+     scripts: str,
+     voice_id: str,
+     speed: float,
+     volume: float,
+     pitch: int,
+     emotion: str
+ ) -> Optional[str]:
+     output = replicate_client.run(
+         "minimax/speech-02-turbo",
+         input={
+             "text": scripts,
+             "pitch": pitch,
+             "speed": speed,
+             "volume": volume,
+             "bitrate": 128000,
+             "channel": "mono",
+             "emotion": emotion,
+             "voice_id": voice_id,
+             "sample_rate": 32000,
+             "language_boost": "English",
+             "english_normalization": True
+         }
+     )
+     # Replicate may return a list of file outputs, a bare URL string, or a
+     # single object with a .url attribute; normalize all three to one URL.
+     urls: List[str] = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
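For reference, a minimal usage sketch (not part of the commit): the voice_id, emotion, and numeric settings below are illustrative placeholders for the minimax model, not confirmed presets.

audio_url = audio_generation(
    scripts="Welcome back! Today we cover three quick hooks for short-form ads.",
    voice_id="Wise_Woman",  # hypothetical voice preset
    speed=1.0,              # assumed neutral tempo
    volume=1.0,
    pitch=0,                # assumed no pitch shift
    emotion="happy",        # assumed supported emotion label
)
print(audio_url)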
src/caption_gen.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ import base64
+ import replicate
+
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def caption_generation(
+     video: bytes,
+     caption_size: int,
+     highlight_color: str):
+     # The model expects a data URI, so base64-encode the raw video bytes first.
+     encoded_video = base64.b64encode(video).decode("utf-8")
+     output = replicate_client.run(
+         "shreejalmaharjan-27/tiktok-short-captions:46bf1c12c77ad1782d6f87828d4d8ba4d48646b8e1271b490cb9e95ccdbc4504",
+         input={
+             "model": "large-v3",
+             "video": f"data:video/mp4;base64,{encoded_video}",
+             "language": "auto",
+             "temperature": 0,
+             "caption_size": caption_size,
+             "highlight_color": highlight_color,
+             "suppress_tokens": "-1",
+             "logprob_threshold": -1,
+             "no_speech_threshold": 0.6,
+             "condition_on_previous_text": True,
+             "compression_ratio_threshold": 2.4,
+             "temperature_increment_on_fallback": 0.2
+         }
+     )
+
+     # Normalize the Replicate output (list, URL string, or object with .url).
+     urls = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
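A minimal usage sketch, assuming a local clip at clip.mp4 (hypothetical path); the caption_size and highlight_color values are illustrative, not model defaults:

with open("clip.mp4", "rb") as f:
    video_bytes = f.read()

captioned_url = caption_generation(
    video=video_bytes,
    caption_size=85,            # illustrative size
    highlight_color="#39E508",  # illustrative hex color
)
print(captioned_url)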
src/image_gen.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ from typing import List
+ import replicate
+ import base64
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def image_generation_change_background(
+     img_bytes: bytes,
+     prompt: str,
+     aspect_ratio: str):
+     encoded_image = base64.b64encode(img_bytes).decode("utf-8")
+
+     output = replicate_client.run(
+         "google/nano-banana",
+         input={
+             "image_input": [f"data:image/jpeg;base64,{encoded_image}"],
+             "prompt": prompt,
+             "aspect_ratio": aspect_ratio,
+         }
+     )
+     # Normalize the Replicate output (list, URL string, or object with .url).
+     urls: List[str] = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
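A minimal usage sketch; product.jpg is a hypothetical input file, and the aspect_ratio value is an assumption about what the model accepts:

with open("product.jpg", "rb") as f:
    img = f.read()

new_bg_url = image_generation_change_background(
    img_bytes=img,
    prompt="Place the product on a marble countertop with soft morning light",
    aspect_ratio="9:16",  # assumed supported value
)
print(new_bg_url)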
src/prompt_generator.py ADDED
@@ -0,0 +1,240 @@
+ from typing import List, Optional, Dict, Any
+ from pydantic import BaseModel, Field
+ from openai import OpenAI
+ import os
+ import re
+ from dotenv import load_dotenv
+ import base64
+
+ load_dotenv()
+
+ gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+ class VeoInputs(BaseModel):
+     script: str
+     style: str
+     jsonFormat: str = 'standard'
+     continuationMode: bool = True
+     voiceType: Optional[str] = None
+     energyLevel: Optional[str] = None
+     settingMode: str = 'single'
+     cameraStyle: Optional[str] = None
+     energyArc: Optional[str] = None
+     narrativeStyle: Optional[str] = None
+     accentRegion: Optional[str] = None
+
+ class ContinuityMarkers(BaseModel):
+     start_position: str
+     end_position: str
+     start_expression: str
+     end_expression: str
+     start_gesture: str
+     end_gesture: str
+     location_status: str
+
+ class SegmentInfo(BaseModel):
+     segment_number: int
+     total_segments: int
+     duration: str
+     location: str
+     continuity_markers: ContinuityMarkers
+
+ class CharacterDescription(BaseModel):
+     physical: str  # 200+ words, identical across segments (required by the validator below)
+     clothing: str  # 150+ words, identical across segments (required by the validator below)
+     current_state: str  # 100+ words, segment-specific
+     voice_matching: str  # 100+ words, segment-specific
+
+ class SynchronizedActions(BaseModel):
+     # Use legal Python identifiers; map to exact JSON keys with aliases
+     f0000_0002: str = Field(alias="0:00-0:02")
+     f0002_0004: str = Field(alias="0:02-0:04")
+     f0004_0006: str = Field(alias="0:04-0:06")
+     f0006_0008: str = Field(alias="0:06-0:08")
+
+     class Config:
+         populate_by_name = True
+
+ class ActionTimeline(BaseModel):
+     dialogue: str
+     synchronized_actions: SynchronizedActions
+     micro_expressions: str  # 50+ words
+     breathing_rhythm: str
+     location_transition: str
+     continuity_checkpoint: str
+
+ class SceneContinuity(BaseModel):
+     environment: str  # 250+ words
+     camera_position: str  # 75+ words
+     camera_movement: str  # detailed movement path
+     lighting_state: str  # 50+ words
+     props_in_frame: str  # 75+ words (required by the validator below)
+     background_elements: str  # 50+ words
+     spatial_relationships: str
+
+ class Segment(BaseModel):
+     segment_info: SegmentInfo
+     character_description: CharacterDescription
+     scene_continuity: SceneContinuity
+     action_timeline: ActionTimeline
+
+ class SegmentsPayload(BaseModel):
+     segments: List[Segment]
+
+ def split_script_into_segments(script: str, seconds_per_segment: int = 8, words_per_second: float = 2.2) -> List[str]:
+     """
+     Packs sentences into ~seconds * words_per_second buckets (≈ 17-20 words/8s).
+     Adjust words_per_second if your VO tempo differs.
+     """
+     sentences = re.split(r'(?<=[.!?])\s+', script.strip())
+     sentences = [s.strip() for s in sentences if s.strip()]
+     target = max(14, int(seconds_per_segment * words_per_second))  # minimal guard
+
+     segments, cur, cur_len = [], [], 0
+     for s in sentences:
+         w = len(s.split())
+         if cur and cur_len + w > target:
+             segments.append(" ".join(cur))
+             cur, cur_len = [], 0
+         cur.append(s)
+         cur_len += w
+     if cur:
+         segments.append(" ".join(cur))
+     return segments or [script.strip()]
+
+ def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
+     N = len(segment_texts)
+     knobs = inputs.model_dump()
+     header = f"""
+ You are a senior performance-marketing video director who writes segment-accurate, production-grade JSON prompts for Veo 3.
+ Return ONLY JSON that parses into the provided schema. Do not add fields. No markdown.
+
+ Task: Build prompts for exactly {N} segments of 8 seconds each.
+ Hard rules for EVERY segment:
+ - "duration" MUST be "00:00-00:08"
+ - "physical" = 200+ words and "clothing" = 150+ words, EXACTLY identical across all segments
+ - "current_state" = 100+ words, segment-specific
+ - "voice_matching" = 100+ words, segment-specific
+ - "environment" = 250+ words; "camera_position" = 75+ words; "lighting_state" = 50+ words min
+ - "camera_movement" = concrete, timestamped path (pan/tilt/dolly/handheld/steadicam)
+ - "synchronized_actions" must have exactly these keys: "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
+ - Dialogue must fit in 8s naturally with breath points.
+ - If continuationMode is true, include a continuity checkpoint aligning next segment's start.
+ - Set "segment_info.total_segments" = {N} on each segment.
+ - Base every selection on the provided character image.
+ FULL SCRIPT:
+ \"\"\"{inputs.script.strip()}\"\"\"
+
+ AUTHORITATIVE SETTINGS (must be reflected):
+ {knobs}
+
+ SEGMENT LINES (cover in exactly 8 seconds each):
+ """
+     seg_lines = "\n".join([f"- Segment {i+1}: {t}" for i, t in enumerate(segment_texts)])
+
+     footer = """
+ OUTPUT:
+ Return JSON only as:
+ {
+   "segments": [ { ... per-segment object exactly matching the schema ... } ]
+ }
+ """
+     return header + seg_lines + footer
+
+
+ # ---------- Validator (segment count, durations, keys, word counts, uniformity) ----------
+
+ MIN_WORDS = {
+     ("character_description", "physical"): 200,
+     ("character_description", "clothing"): 150,
+     ("character_description", "current_state"): 100,
+     ("character_description", "voice_matching"): 100,
+     ("scene_continuity", "environment"): 250,
+     ("scene_continuity", "camera_position"): 75,
+     ("scene_continuity", "lighting_state"): 50,
+     ("scene_continuity", "props_in_frame"): 75,
+     ("scene_continuity", "background_elements"): 50,
+     ("action_timeline", "micro_expressions"): 50,
+ }
+
+ def _word_count(text: str) -> int:
+     return len(re.findall(r"\b\w+\b", text or ""))
+
+ def validate_segments_payload(payload: Dict[str, Any], expected_segments: int) -> List[str]:
+     errors: List[str] = []
+     segs = payload.get("segments", [])
+     if len(segs) != expected_segments:
+         errors.append(f"Expected {expected_segments} segments, got {len(segs)}.")
+
+     required_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}
+     physical_blocks, clothing_blocks = [], []
+
+     for i, seg in enumerate(segs, start=1):
+         si = seg.get("segment_info", {})
+         if si.get("duration") != "00:00-00:08":
+             errors.append(f"Segment {i}: duration must be 00:00-00:08.")
+         if si.get("total_segments") != expected_segments:
+             errors.append(f"Segment {i}: total_segments should be {expected_segments}, got {si.get('total_segments')}.")
+
+         sync = seg.get("action_timeline", {}).get("synchronized_actions", {})
+         if set(sync.keys()) != required_sync_keys:
+             errors.append(f"Segment {i}: synchronized_actions must have keys {sorted(required_sync_keys)}.")
+
+         # Word-count checks
+         for (section, field), minw in MIN_WORDS.items():
+             text = seg.get(section, {}).get(field, "")
+             wc = _word_count(text)
+             if wc < minw:
+                 errors.append(f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc}).")
+
+         ch = seg.get("character_description", {})
+         physical_blocks.append(ch.get("physical", ""))
+         clothing_blocks.append(ch.get("clothing", ""))
+
+     # Uniformity across segments
+     if expected_segments > 1:
+         if len(set(physical_blocks)) > 1:
+             errors.append("`character_description.physical` must be EXACTLY identical across all segments.")
+         if len(set(clothing_blocks)) > 1:
+             errors.append("`character_description.clothing` must be EXACTLY identical across all segments.")
+
+     return errors
+
+ def generate_segments_payload(
+     inputs: VeoInputs,
+     image_path: str,
+     model: str = "gpt-4o",
+ ) -> Dict[str, Any]:
+     segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
+     N = len(segment_texts)
+
+     # Read the character reference image from disk and base64-encode it.
+     with open(image_path, "rb") as f:
+         encoded_image = base64.b64encode(f.read()).decode("utf-8")
+
+     def _call_llm(user_prompt: str):
+         return gpt_client.beta.chat.completions.parse(
+             model=model,
+             response_format=SegmentsPayload,
+             messages=[
+                 {"role": "system", "content": "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."},
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": user_prompt},
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": f"data:image/jpeg;base64,{encoded_image}"
+                             },
+                         },
+                     ],
+                 },
+             ],
+         ).choices[0].message.parsed
+
+     user_prompt = build_prompt(inputs, segment_texts)
+     parsed_obj = _call_llm(user_prompt)
+     payload = parsed_obj.model_dump(by_alias=True)
+
+     return payload
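A minimal end-to-end sketch of this module, assuming a character reference image saved at character.jpg (hypothetical path); it builds the payload and then re-checks it with the module's own validator:

inputs = VeoInputs(
    script="Stop scrolling. This little stand fixes your posture in minutes. Try it today!",
    style="UGC testimonial",
    cameraStyle="handheld",
)
segments_payload = generate_segments_payload(inputs, image_path="character.jpg")

expected = len(split_script_into_segments(inputs.script, seconds_per_segment=8))
problems = validate_segments_payload(segments_payload, expected_segments=expected)
print(problems or f"{expected} segment prompts ready for Veo 3.")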
src/video_gen.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ import base64
+ from typing import List
+ import replicate
+ from dotenv import load_dotenv
+
+
+ load_dotenv()
+ replicate_client = replicate.Client(api_token=os.getenv("REPLICATE_API_KEY"))
+
+ def video_generation(
+     image_bytes: bytes,
+     prompt: str,
+     aspect_ratio: str,
+     resolution: str
+ ):
+     encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+     output = replicate_client.run(
+         "google/veo-3",
+         input={
+             "image": f"data:image/png;base64,{encoded_image}",
+             "prompt": prompt,
+             "resolution": resolution,
+             "aspect_ratio": aspect_ratio
+         }
+     )
+     # Normalize the Replicate output (list, URL string, or object with .url).
+     urls: List[str] = []
+     if isinstance(output, list) and output:
+         first = output[0]
+         url = getattr(first, "url", str(first))
+         urls = [url]
+     elif isinstance(output, str):
+         urls = [output]
+     elif hasattr(output, "url"):
+         urls = [getattr(output, "url")]
+     if urls:
+         return urls[0]
+     return None
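A minimal sketch tying the pieces together; frame.png is a hypothetical first-frame file, the resolution and aspect_ratio values are assumptions about what google/veo-3 accepts, and segments_payload stands in for the dict returned by generate_segments_payload:

import json

with open("frame.png", "rb") as f:
    frame = f.read()

# segments_payload would come from prompt_generator.generate_segments_payload(...)
segment_prompt = json.dumps(segments_payload["segments"][0])

video_url = video_generation(
    image_bytes=frame,
    prompt=segment_prompt,
    aspect_ratio="9:16",  # assumed supported value
    resolution="720p",    # assumed supported value
)
print(video_url)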