jebin2 committed on
Commit
fcf62f5
·
1 Parent(s): ee25523

feat: Merge system and negative prompts into a single enriched prompt for Grok video generation, and refine prompt templates to prevent rendered text from appearing in generated videos.

Browse files
src/grok_src/grok_video_generator.py CHANGED
@@ -20,19 +20,56 @@ class GrokVideoGenerator:
20
 
21
  self.client = xai_sdk.Client(api_key=self.api_key)
22
 
23
- def generate_video(self, prompt: str, duration: int = 5, output_path: Optional[str] = None, image_url: Optional[str] = None) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  """
25
  Generates a video using Grok's API.
26
 
27
  Args:
28
- prompt: Text prompt for the video.
29
  duration: Duration in seconds (1-15).
30
  output_path: Local path to save the video.
31
  image_url: Optional image URL for image-to-video.
 
 
32
 
33
  Returns:
34
  Dictionary containing video details.
35
  """
 
 
 
36
  if get_config_value("test_automation", False):
37
  logger.info("Generating MOCK Grok video response...")
38
  return {
@@ -48,17 +85,17 @@ class GrokVideoGenerator:
48
  ]),
49
  "task_id": "mock_grok_task_123",
50
  "duration": duration,
51
- "prompt": prompt,
52
  "status": "success",
53
  "created_at": time.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
54
  "model": "grok-imagine-video-mock"
55
  }
56
 
57
  try:
58
- logger.info(f"Starting Grok video generation for prompt: {prompt[:50]}...")
59
 
60
  kwargs = {
61
- "prompt": prompt,
62
  "model": "grok-imagine-video",
63
  "duration": max(4, math.ceil(duration)) if duration else 5,
64
  "resolution": "720p",
 
20
 
21
  self.client = xai_sdk.Client(api_key=self.api_key)
22
 
23
+ def _build_enriched_prompt(self, scene_prompt: str, negative_prompt: str = "", system_prompt: str = "") -> str:
24
+ """
25
+ Merge system_prompt + scene_prompt + negative_prompt into a single prompt
26
+ since Grok's API only accepts one prompt field.
27
+ """
28
+ logger.info(f"🏗️ Building Grok Prompt | Scene: {len(scene_prompt)} chars | System: {len(system_prompt)} chars | Negative: {len(negative_prompt)} chars")
29
+ logger.debug(f"Input Negative Prompt: {negative_prompt}")
30
+
31
+ parts = []
32
+
33
+ # 1. System Prompt (Context/Style)
34
+ if system_prompt:
35
+ parts.append(system_prompt.strip())
36
+
37
+ # 2. Scene Prompt (Core content)
38
+ parts.append(scene_prompt.strip())
39
+
40
+ # 3. Negative Prompt (Constraints)
41
+ # Explicitly phrasing it as negative constraints for the model
42
+ final_negative = negative_prompt.strip()
43
+ base_negative = "Do not render any readable text, screen content, UI elements, or written words in the video."
44
+
45
+ if final_negative:
46
+ parts.append(f"Avoid: {final_negative}. {base_negative}")
47
+ else:
48
+ parts.append(f"Avoid: {base_negative}")
49
+
50
+ enriched = " ".join(parts)
51
+ # Log the actual final prompt to be sure
52
+ logger.info(f"📝 FINAL Enriched Grok prompt: {enriched}")
53
+ return enriched
54
+
55
+ def generate_video(self, prompt: str, duration: int = 5, output_path: Optional[str] = None, image_url: Optional[str] = None, negative_prompt: str = "", system_prompt: str = "") -> Dict:
56
  """
57
  Generates a video using Grok's API.
58
 
59
  Args:
60
+ prompt: Text prompt for the video (scene_prompt).
61
  duration: Duration in seconds (1-15).
62
  output_path: Local path to save the video.
63
  image_url: Optional image URL for image-to-video.
64
+ negative_prompt: Things to avoid (merged into prompt since Grok has no separate field).
65
+ system_prompt: Style/quality instructions (merged into prompt).
66
 
67
  Returns:
68
  Dictionary containing video details.
69
  """
70
+ # Grok API only has a single "prompt" field, so merge system + scene + negative
71
+ enriched_prompt = self._build_enriched_prompt(prompt, negative_prompt, system_prompt)
72
+
73
  if get_config_value("test_automation", False):
74
  logger.info("Generating MOCK Grok video response...")
75
  return {
 
85
  ]),
86
  "task_id": "mock_grok_task_123",
87
  "duration": duration,
88
+ "prompt": enriched_prompt,
89
  "status": "success",
90
  "created_at": time.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
91
  "model": "grok-imagine-video-mock"
92
  }
93
 
94
  try:
95
+ logger.info(f"Starting Grok video generation for prompt: {enriched_prompt[:100]}...")
96
 
97
  kwargs = {
98
+ "prompt": enriched_prompt,
99
  "model": "grok-imagine-video",
100
  "duration": max(4, math.ceil(duration)) if duration else 5,
101
  "resolution": "720p",
src/pipelines/voiceover_ai_pipeline.py CHANGED
@@ -150,9 +150,10 @@ class VoiceOverAIPipeline(AIContentAutomationBase):
150
  result = await generate_video_process(
151
  prompt=spec.get("scene_prompt", ""),
152
  duration=video_duration,
153
- # Pass extra args as kwargs for providers that support them (e.g. Fal)
154
  aspect_ratio=spec.get("video_parameters", {}).get("aspect_ratio", "9:16"),
155
  negative_prompt=spec.get("negative_prompt", ""),
 
156
  )
157
 
158
  # Download and upload to GCS for permanent storage
 
150
  result = await generate_video_process(
151
  prompt=spec.get("scene_prompt", ""),
152
  duration=video_duration,
153
+ # Pass extra args as kwargs for providers that support them
154
  aspect_ratio=spec.get("video_parameters", {}).get("aspect_ratio", "9:16"),
155
  negative_prompt=spec.get("negative_prompt", ""),
156
+ system_prompt=spec.get("system_prompt", ""),
157
  )
158
 
159
  # Download and upload to GCS for permanent storage
src/prompt/vo_video_generator.md CHANGED
@@ -5,10 +5,11 @@ Given ONE enriched voice-over segment metadata object, generate a complete and r
5
 
6
  Rules:
7
  - Do NOT generate the video itself.
8
- - Do NOT invent brand names, logos, UI, or text overlays.
9
- - The system_prompt should be stable and reusable across clips.
10
- - The scene_prompt should describe ONLY the visual scene implied by the metadata.
11
- - The negative_prompt should prevent text, logos, watermarks, and UI elements.
 
12
  - Video must be suitable for short-form vertical social media.
13
  - Prefer realistic, clean, premium visuals.
14
  - Use the metadata fields directly; do not add new concepts.
@@ -68,9 +69,9 @@ Enriched segment metadata:
68
  **Output**
69
  {
70
  "generation_provider": "xai",
71
- "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. No text, logos, UI overlays, or watermarks. Natural camera movement and realistic lighting.",
72
- "scene_prompt": "A modern content creator sitting at a desk in a home studio, planning or editing social media content on a laptop, smartphone placed nearby. Clean workspace, neutral background, calm and focused atmosphere.",
73
- "negative_prompt": "text overlays, captions, logos, watermarks, UI elements, brand names, subtitles",
74
  "video_parameters": {
75
  "aspect_ratio": "9:16",
76
  "duration_sec": 2.5,
@@ -112,9 +113,9 @@ Enriched segment metadata:
112
  **Output**
113
  {
114
  "generation_provider": "xai",
115
- "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. No text, logos, UI overlays, or watermarks. Natural camera movement and realistic lighting.",
116
- "scene_prompt": "A content creator in an indoor workspace looking frustrated while setting up a camera or reviewing footage on a laptop, conveying time-consuming effort and high cost. Natural, relatable environment.",
117
- "negative_prompt": "text overlays, captions, logos, watermarks, UI elements, brand names, subtitles",
118
  "video_parameters": {
119
  "aspect_ratio": "9:16",
120
  "duration_sec": 2.5,
@@ -156,9 +157,9 @@ Enriched segment metadata:
156
  **Output**
157
  {
158
  "generation_provider": "xai",
159
- "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. No text, logos, UI overlays, or watermarks. Natural camera movement and realistic lighting.",
160
- "scene_prompt": "A fast-paced montage of premium luxury-style visuals such as elegant interiors, modern cityscapes, and refined lifestyle shots, conveying high value and exclusivity without any text or branding.",
161
- "negative_prompt": "text overlays, captions, logos, watermarks, UI elements, brand names, subtitles, price tags",
162
  "video_parameters": {
163
  "aspect_ratio": "9:16",
164
  "duration_sec": 2.5,
 
5
 
6
  Rules:
7
  - Do NOT generate the video itself.
8
+ - Do NOT include any text, words, letters, numbers, or subtitles in the visual scene.
9
+ - Do NOT invent brand names, logos, UI, screens with text, or text overlays.
10
+ - The system_prompt should explicitly forbid text and UI.
11
+ - The scene_prompt should describe ONLY the visual scene implied by the metadata, focusing on cinematic lighting and composition.
12
+ - The negative_prompt should be exhaustive against text, logos, watermarks, and UI elements.
13
  - Video must be suitable for short-form vertical social media.
14
  - Prefer realistic, clean, premium visuals.
15
  - Use the metadata fields directly; do not add new concepts.
 
69
  **Output**
70
  {
71
  "generation_provider": "xai",
72
+ "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. Absolutely NO text, logos, UI overlays, screens with text, or watermarks. Natural camera movement and realistic lighting.",
73
+ "scene_prompt": "A modern content creator sitting at a clean desk in a home studio, typing on a sleek laptop with a blank or blurred screen, smartphone placed nearby. Clean workspace, neutral background, cinematic lighting, shallow depth of field.",
74
+ "negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, screen text, computer interface, icons",
75
  "video_parameters": {
76
  "aspect_ratio": "9:16",
77
  "duration_sec": 2.5,
 
113
  **Output**
114
  {
115
  "generation_provider": "xai",
116
+ "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. Absolutely NO text, logos, UI overlays, screens with text, or watermarks. Natural camera movement.",
117
+ "scene_prompt": "A content creator in an indoor workspace looking frustrated while setting up a camera or looking at a laptop with blurred screen. Lighting adds a dramatic mood. No visible text on screens.",
118
+ "negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, screen text, computer interface",
119
  "video_parameters": {
120
  "aspect_ratio": "9:16",
121
  "duration_sec": 2.5,
 
157
  **Output**
158
  {
159
  "generation_provider": "xai",
160
+ "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. Absolutely NO text, logos, UI overlays, or watermarks. Natural camera movement.",
161
+ "scene_prompt": "A fast-paced montage of premium luxury-style visuals such as elegant interiors, modern cityscapes, and refined lifestyle shots, conveying high value and exclusivity. No text or charts.",
162
+ "negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, price tags",
163
  "video_parameters": {
164
  "aspect_ratio": "9:16",
165
  "duration_sec": 2.5,
src/video_generation_process.py CHANGED
@@ -33,8 +33,14 @@ async def generate_video_process(prompt: str, duration: int, image_input: str =
33
  from src.grok_src.grok_video_generator import GrokVideoGenerator
34
  logger.info("Using Grok SDK for video generation...")
35
  generator = GrokVideoGenerator()
36
- # If image_input is provided, it's an image-to-video request
37
- return generator.generate_video(prompt, duration=duration, image_url=image_input)
 
 
 
 
 
 
38
  except Exception as e:
39
  logger.error(f"Grok video generation failed: {e}")
40
  # Fallback to Runway or raise?
 
33
  from src.grok_src.grok_video_generator import GrokVideoGenerator
34
  logger.info("Using Grok SDK for video generation...")
35
  generator = GrokVideoGenerator()
36
+ # Forward negative_prompt and system_prompt so they get merged into the Grok prompt
37
+ return generator.generate_video(
38
+ prompt,
39
+ duration=duration,
40
+ image_url=image_input,
41
+ negative_prompt=kwargs.get("negative_prompt", ""),
42
+ system_prompt=kwargs.get("system_prompt", ""),
43
+ )
44
  except Exception as e:
45
  logger.error(f"Grok video generation failed: {e}")
46
  # Fallback to Runway or raise?