feat: Merge system and negative prompts into a single enriched prompt for Grok video generation, and refine the prompt templates to prevent rendered text in generated videos.
Browse files
src/grok_src/grok_video_generator.py
CHANGED
|
@@ -20,19 +20,56 @@ class GrokVideoGenerator:
|
|
| 20 |
|
| 21 |
self.client = xai_sdk.Client(api_key=self.api_key)
|
| 22 |
|
| 23 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
"""
|
| 25 |
Generates a video using Grok's API.
|
| 26 |
|
| 27 |
Args:
|
| 28 |
-
prompt: Text prompt for the video.
|
| 29 |
duration: Duration in seconds (1-15).
|
| 30 |
output_path: Local path to save the video.
|
| 31 |
image_url: Optional image URL for image-to-video.
|
|
|
|
|
|
|
| 32 |
|
| 33 |
Returns:
|
| 34 |
Dictionary containing video details.
|
| 35 |
"""
|
|
|
|
|
|
|
|
|
|
| 36 |
if get_config_value("test_automation", False):
|
| 37 |
logger.info("Generating MOCK Grok video response...")
|
| 38 |
return {
|
|
@@ -48,17 +85,17 @@ class GrokVideoGenerator:
|
|
| 48 |
]),
|
| 49 |
"task_id": "mock_grok_task_123",
|
| 50 |
"duration": duration,
|
| 51 |
-
"prompt":
|
| 52 |
"status": "success",
|
| 53 |
"created_at": time.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
| 54 |
"model": "grok-imagine-video-mock"
|
| 55 |
}
|
| 56 |
|
| 57 |
try:
|
| 58 |
-
logger.info(f"Starting Grok video generation for prompt: {
|
| 59 |
|
| 60 |
kwargs = {
|
| 61 |
-
"prompt":
|
| 62 |
"model": "grok-imagine-video",
|
| 63 |
"duration": max(4, math.ceil(duration)) if duration else 5,
|
| 64 |
"resolution": "720p",
|
|
|
|
| 20 |
|
| 21 |
self.client = xai_sdk.Client(api_key=self.api_key)
|
| 22 |
|
| 23 |
+
def _build_enriched_prompt(self, scene_prompt: str, negative_prompt: str = "", system_prompt: str = "") -> str:
    """
    Merge system_prompt + scene_prompt + negative_prompt into a single prompt
    since Grok's API only accepts one prompt field.

    Args:
        scene_prompt: Core description of the scene to render.
        negative_prompt: Things to avoid; emitted as an "Avoid: ..." sentence.
        system_prompt: Style/quality context, placed before the scene.

    Returns:
        The non-empty parts joined by single spaces. The result always ends
        with the built-in "no readable text" constraint.
    """
    # Values come from spec dicts where a key may be present with a null
    # value, so treat None like an empty string instead of crashing on .strip().
    system = (system_prompt or "").strip()
    scene = (scene_prompt or "").strip()
    # Drop trailing periods/spaces so the template below cannot produce "..".
    negative = (negative_prompt or "").strip().rstrip(". ")

    logger.info(
        "🏗️ Building Grok Prompt | Scene: %d chars | System: %d chars | Negative: %d chars",
        len(scene),
        len(system),
        len(negative),
    )
    logger.debug("Input Negative Prompt: %s", negative_prompt)

    base_negative = "Do not render any readable text, screen content, UI elements, or written words in the video."

    # 1. System prompt (context/style), 2. scene prompt (core content).
    # Empty parts are skipped so the join never yields leading/double spaces.
    parts = [part for part in (system, scene) if part]

    # 3. Negative constraints. The base constraint is already a complete
    # "Do not ..." sentence, so it is never prefixed with "Avoid:" on its own
    # (that would read as the contradictory "Avoid: Do not render ...").
    if negative:
        parts.append(f"Avoid: {negative}. {base_negative}")
    else:
        parts.append(base_negative)

    enriched = " ".join(parts)
    # Log the actual final prompt so what is sent to the API can be audited.
    logger.info("📝 FINAL Enriched Grok prompt: %s", enriched)
    return enriched
|
| 54 |
+
|
| 55 |
+
def generate_video(self, prompt: str, duration: int = 5, output_path: Optional[str] = None, image_url: Optional[str] = None, negative_prompt: str = "", system_prompt: str = "") -> Dict:
|
| 56 |
"""
|
| 57 |
Generates a video using Grok's API.
|
| 58 |
|
| 59 |
Args:
|
| 60 |
+
prompt: Text prompt for the video (scene_prompt).
|
| 61 |
duration: Duration in seconds (1-15).
|
| 62 |
output_path: Local path to save the video.
|
| 63 |
image_url: Optional image URL for image-to-video.
|
| 64 |
+
negative_prompt: Things to avoid (merged into prompt since Grok has no separate field).
|
| 65 |
+
system_prompt: Style/quality instructions (merged into prompt).
|
| 66 |
|
| 67 |
Returns:
|
| 68 |
Dictionary containing video details.
|
| 69 |
"""
|
| 70 |
+
# Grok API only has a single "prompt" field, so merge system + scene + negative
|
| 71 |
+
enriched_prompt = self._build_enriched_prompt(prompt, negative_prompt, system_prompt)
|
| 72 |
+
|
| 73 |
if get_config_value("test_automation", False):
|
| 74 |
logger.info("Generating MOCK Grok video response...")
|
| 75 |
return {
|
|
|
|
| 85 |
]),
|
| 86 |
"task_id": "mock_grok_task_123",
|
| 87 |
"duration": duration,
|
| 88 |
+
"prompt": enriched_prompt,
|
| 89 |
"status": "success",
|
| 90 |
"created_at": time.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
| 91 |
"model": "grok-imagine-video-mock"
|
| 92 |
}
|
| 93 |
|
| 94 |
try:
|
| 95 |
+
logger.info(f"Starting Grok video generation for prompt: {enriched_prompt[:100]}...")
|
| 96 |
|
| 97 |
kwargs = {
|
| 98 |
+
"prompt": enriched_prompt,
|
| 99 |
"model": "grok-imagine-video",
|
| 100 |
"duration": max(4, math.ceil(duration)) if duration else 5,
|
| 101 |
"resolution": "720p",
|
src/pipelines/voiceover_ai_pipeline.py
CHANGED
|
@@ -150,9 +150,10 @@ class VoiceOverAIPipeline(AIContentAutomationBase):
|
|
| 150 |
result = await generate_video_process(
|
| 151 |
prompt=spec.get("scene_prompt", ""),
|
| 152 |
duration=video_duration,
|
| 153 |
-
# Pass extra args as kwargs for providers that support them
|
| 154 |
aspect_ratio=spec.get("video_parameters", {}).get("aspect_ratio", "9:16"),
|
| 155 |
negative_prompt=spec.get("negative_prompt", ""),
|
|
|
|
| 156 |
)
|
| 157 |
|
| 158 |
# Download and upload to GCS for permanent storage
|
|
|
|
| 150 |
result = await generate_video_process(
|
| 151 |
prompt=spec.get("scene_prompt", ""),
|
| 152 |
duration=video_duration,
|
| 153 |
+
# Pass extra args as kwargs for providers that support them
|
| 154 |
aspect_ratio=spec.get("video_parameters", {}).get("aspect_ratio", "9:16"),
|
| 155 |
negative_prompt=spec.get("negative_prompt", ""),
|
| 156 |
+
system_prompt=spec.get("system_prompt", ""),
|
| 157 |
)
|
| 158 |
|
| 159 |
# Download and upload to GCS for permanent storage
|
src/prompt/vo_video_generator.md
CHANGED
|
@@ -5,10 +5,11 @@ Given ONE enriched voice-over segment metadata object, generate a complete and r
|
|
| 5 |
|
| 6 |
Rules:
|
| 7 |
- Do NOT generate the video itself.
|
| 8 |
-
- Do NOT
|
| 9 |
-
-
|
| 10 |
-
- The
|
| 11 |
-
- The
|
|
|
|
| 12 |
- Video must be suitable for short-form vertical social media.
|
| 13 |
- Prefer realistic, clean, premium visuals.
|
| 14 |
- Use the metadata fields directly; do not add new concepts.
|
|
@@ -68,9 +69,9 @@ Enriched segment metadata:
|
|
| 68 |
**Output**
|
| 69 |
{
|
| 70 |
"generation_provider": "xai",
|
| 71 |
-
"system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium.
|
| 72 |
-
"scene_prompt": "A modern content creator sitting at a desk in a home studio,
|
| 73 |
-
"negative_prompt": "text
|
| 74 |
"video_parameters": {
|
| 75 |
"aspect_ratio": "9:16",
|
| 76 |
"duration_sec": 2.5,
|
|
@@ -112,9 +113,9 @@ Enriched segment metadata:
|
|
| 112 |
**Output**
|
| 113 |
{
|
| 114 |
"generation_provider": "xai",
|
| 115 |
-
"system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium.
|
| 116 |
-
"scene_prompt": "A content creator in an indoor workspace looking frustrated while setting up a camera or
|
| 117 |
-
"negative_prompt": "text
|
| 118 |
"video_parameters": {
|
| 119 |
"aspect_ratio": "9:16",
|
| 120 |
"duration_sec": 2.5,
|
|
@@ -156,9 +157,9 @@ Enriched segment metadata:
|
|
| 156 |
**Output**
|
| 157 |
{
|
| 158 |
"generation_provider": "xai",
|
| 159 |
-
"system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium.
|
| 160 |
-
"scene_prompt": "A fast-paced montage of premium luxury-style visuals such as elegant interiors, modern cityscapes, and refined lifestyle shots, conveying high value and exclusivity
|
| 161 |
-
"negative_prompt": "text
|
| 162 |
"video_parameters": {
|
| 163 |
"aspect_ratio": "9:16",
|
| 164 |
"duration_sec": 2.5,
|
|
|
|
| 5 |
|
| 6 |
Rules:
|
| 7 |
- Do NOT generate the video itself.
|
| 8 |
+
- Do NOT include any text, words, letters, numbers, or subtitles in the visual scene.
|
| 9 |
+
- Do NOT invent brand names, logos, UI, screens with text, or text overlays.
|
| 10 |
+
- The system_prompt should explicitly forbid text and UI.
|
| 11 |
+
- The scene_prompt should describe ONLY the visual scene implied by the metadata, focusing on cinematic lighting and composition.
|
| 12 |
+
- The negative_prompt should be exhaustive against text, logos, watermarks, and UI elements.
|
| 13 |
- Video must be suitable for short-form vertical social media.
|
| 14 |
- Prefer realistic, clean, premium visuals.
|
| 15 |
- Use the metadata fields directly; do not add new concepts.
|
|
|
|
| 69 |
**Output**
|
| 70 |
{
|
| 71 |
"generation_provider": "xai",
|
| 72 |
+
"system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. absolutely NO text, logos, UI overlays, screens with text, or watermarks. Natural camera movement and realistic lighting.",
|
| 73 |
+
"scene_prompt": "A modern content creator sitting at a clean desk in a home studio, typing on a sleek laptop with a blank or blurred screen, smartphone placed nearby. Clean workspace, neutral background, cinematic lighting, shallow depth of field.",
|
| 74 |
+
"negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, screen text, computer interface, icons",
|
| 75 |
"video_parameters": {
|
| 76 |
"aspect_ratio": "9:16",
|
| 77 |
"duration_sec": 2.5,
|
|
|
|
| 113 |
**Output**
|
| 114 |
{
|
| 115 |
"generation_provider": "xai",
|
| 116 |
+
"system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. absolutely NO text, logos, UI overlays, screens with text, or watermarks. Natural camera movement.",
|
| 117 |
+
"scene_prompt": "A content creator in an indoor workspace looking frustrated while setting up a camera or looking at a laptop with blurred screen. Lighting adds a dramatic mood. No visible text on screens.",
|
| 118 |
+
"negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, screen text, computer interface",
|
| 119 |
"video_parameters": {
|
| 120 |
"aspect_ratio": "9:16",
|
| 121 |
"duration_sec": 2.5,
|
|
|
|
| 157 |
**Output**
|
| 158 |
{
|
| 159 |
"generation_provider": "xai",
|
| 160 |
+
"system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. absolutely NO text, logos, UI overlays, or watermarks. Natural camera movement.",
|
| 161 |
+
"scene_prompt": "A fast-paced montage of premium luxury-style visuals such as elegant interiors, modern cityscapes, and refined lifestyle shots, conveying high value and exclusivity. No text or charts.",
|
| 162 |
+
"negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, price tags",
|
| 163 |
"video_parameters": {
|
| 164 |
"aspect_ratio": "9:16",
|
| 165 |
"duration_sec": 2.5,
|
src/video_generation_process.py
CHANGED
|
@@ -33,8 +33,14 @@ async def generate_video_process(prompt: str, duration: int, image_input: str =
|
|
| 33 |
from src.grok_src.grok_video_generator import GrokVideoGenerator
|
| 34 |
logger.info("Using Grok SDK for video generation...")
|
| 35 |
generator = GrokVideoGenerator()
|
| 36 |
-
#
|
| 37 |
-
return generator.generate_video(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
except Exception as e:
|
| 39 |
logger.error(f"Grok video generation failed: {e}")
|
| 40 |
# Fallback to Runway or raise?
|
|
|
|
| 33 |
from src.grok_src.grok_video_generator import GrokVideoGenerator
|
| 34 |
logger.info("Using Grok SDK for video generation...")
|
| 35 |
generator = GrokVideoGenerator()
|
| 36 |
+
# Forward negative_prompt and system_prompt so they get merged into the Grok prompt
|
| 37 |
+
return generator.generate_video(
|
| 38 |
+
prompt,
|
| 39 |
+
duration=duration,
|
| 40 |
+
image_url=image_input,
|
| 41 |
+
negative_prompt=kwargs.get("negative_prompt", ""),
|
| 42 |
+
system_prompt=kwargs.get("system_prompt", ""),
|
| 43 |
+
)
|
| 44 |
except Exception as e:
|
| 45 |
logger.error(f"Grok video generation failed: {e}")
|
| 46 |
# Fallback to Runway or raise?
|