Spaces:

Elvoro
/

Tools

Running

App Files Files Community

jebin2 committed on 6 days ago

Commit

fcf62f5

1 Parent(s): ee25523

feat: Merge system and negative prompts into a single enriched prompt for Grok video generation and refine prompt templates to prevent text.

Browse files

Files changed (4) hide show

src/grok_src/grok_video_generator.py +42 -5
src/pipelines/voiceover_ai_pipeline.py +2 -1
src/prompt/vo_video_generator.md +14 -13
src/video_generation_process.py +8 -2

src/grok_src/grok_video_generator.py CHANGED Viewed

@@ -20,19 +20,56 @@ class GrokVideoGenerator:
             self.client = xai_sdk.Client(api_key=self.api_key)
-    def generate_video(self, prompt: str, duration: int = 5, output_path: Optional[str] = None, image_url: Optional[str] = None) -> Dict:
         """
         Generates a video using Grok's API.
         Args:
-            prompt: Text prompt for the video.
             duration: Duration in seconds (1-15).
             output_path: Local path to save the video.
             image_url: Optional image URL for image-to-video.
         Returns:
             Dictionary containing video details.
         """
         if get_config_value("test_automation", False):
             logger.info("Generating MOCK Grok video response...")
             return {
@@ -48,17 +85,17 @@ class GrokVideoGenerator:
                 ]),
                 "task_id": "mock_grok_task_123",
                 "duration": duration,
-                "prompt": prompt,
                 "status": "success",
                 "created_at": time.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
                 "model": "grok-imagine-video-mock"
             }
         try:
-            logger.info(f"Starting Grok video generation for prompt: {prompt[:50]}...")
             kwargs = {
-                "prompt": prompt,
                 "model": "grok-imagine-video",
                 "duration": max(4, math.ceil(duration)) if duration else 5,
                 "resolution": "720p",

             self.client = xai_sdk.Client(api_key=self.api_key)
+    def _build_enriched_prompt(self, scene_prompt: str, negative_prompt: str = "", system_prompt: str = "") -> str:
+        """
+        Merge system_prompt + scene_prompt + negative_prompt into a single prompt
+        since Grok's API only accepts one prompt field.
+        """
+        logger.info(f"🏗️ Building Grok Prompt | Scene: {len(scene_prompt)} chars | System: {len(system_prompt)} chars | Negative: {len(negative_prompt)} chars")
+        logger.debug(f"Input Negative Prompt: {negative_prompt}")
+        parts = []
+        # 1. System Prompt (Context/Style)
+        if system_prompt:
+            parts.append(system_prompt.strip())
+        # 2. Scene Prompt (Core content)
+        parts.append(scene_prompt.strip())
+        # 3. Negative Prompt (Constraints)
+        # Explicitly phrasing it as negative constraints for the model
+        final_negative = negative_prompt.strip()
+        base_negative = "Do not render any readable text, screen content, UI elements, or written words in the video."
+        if final_negative:
+             parts.append(f"Avoid: {final_negative}. {base_negative}")
+        else:
+             parts.append(f"Avoid: {base_negative}")
+        enriched = " ".join(parts)
+        # Log the actual final prompt to be sure
+        logger.info(f"📝 FINAL Enriched Grok prompt: {enriched}")
+        return enriched
+    def generate_video(self, prompt: str, duration: int = 5, output_path: Optional[str] = None, image_url: Optional[str] = None, negative_prompt: str = "", system_prompt: str = "") -> Dict:
         """
         Generates a video using Grok's API.
         Args:
+            prompt: Text prompt for the video (scene_prompt).
             duration: Duration in seconds (1-15).
             output_path: Local path to save the video.
             image_url: Optional image URL for image-to-video.
+            negative_prompt: Things to avoid (merged into prompt since Grok has no separate field).
+            system_prompt: Style/quality instructions (merged into prompt).
         Returns:
             Dictionary containing video details.
         """
+        # Grok API only has a single "prompt" field, so merge system + scene + negative
+        enriched_prompt = self._build_enriched_prompt(prompt, negative_prompt, system_prompt)
         if get_config_value("test_automation", False):
             logger.info("Generating MOCK Grok video response...")
             return {
                 ]),
                 "task_id": "mock_grok_task_123",
                 "duration": duration,
+                "prompt": enriched_prompt,
                 "status": "success",
                 "created_at": time.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
                 "model": "grok-imagine-video-mock"
             }
         try:
+            logger.info(f"Starting Grok video generation for prompt: {enriched_prompt[:100]}...")
             kwargs = {
+                "prompt": enriched_prompt,
                 "model": "grok-imagine-video",
                 "duration": max(4, math.ceil(duration)) if duration else 5,
                 "resolution": "720p",

src/pipelines/voiceover_ai_pipeline.py CHANGED Viewed

@@ -150,9 +150,10 @@ class VoiceOverAIPipeline(AIContentAutomationBase):
                 result = await generate_video_process(
                     prompt=spec.get("scene_prompt", ""),
                     duration=video_duration,
-                    # Pass extra args as kwargs for providers that support them (e.g. Fal)
                     aspect_ratio=spec.get("video_parameters", {}).get("aspect_ratio", "9:16"),
                     negative_prompt=spec.get("negative_prompt", ""),
                 )
                 # Download and upload to GCS for permanent storage

                 result = await generate_video_process(
                     prompt=spec.get("scene_prompt", ""),
                     duration=video_duration,
+                    # Pass extra args as kwargs for providers that support them
                     aspect_ratio=spec.get("video_parameters", {}).get("aspect_ratio", "9:16"),
                     negative_prompt=spec.get("negative_prompt", ""),
+                    system_prompt=spec.get("system_prompt", ""),
                 )
                 # Download and upload to GCS for permanent storage

src/prompt/vo_video_generator.md CHANGED Viewed

@@ -5,10 +5,11 @@ Given ONE enriched voice-over segment metadata object, generate a complete and r
 Rules:
 - Do NOT generate the video itself.
-- Do NOT invent brand names, logos, UI, or text overlays.
-- The system_prompt should be stable and reusable across clips.
-- The scene_prompt should describe ONLY the visual scene implied by the metadata.
-- The negative_prompt should prevent text, logos, watermarks, and UI elements.
 - Video must be suitable for short-form vertical social media.
 - Prefer realistic, clean, premium visuals.
 - Use the metadata fields directly; do not add new concepts.
@@ -68,9 +69,9 @@ Enriched segment metadata:
 **Output**
 {
   "generation_provider": "xai",
-  "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. No text, logos, UI overlays, or watermarks. Natural camera movement and realistic lighting.",
-  "scene_prompt": "A modern content creator sitting at a desk in a home studio, planning or editing social media content on a laptop, smartphone placed nearby. Clean workspace, neutral background, calm and focused atmosphere.",
-  "negative_prompt": "text overlays, captions, logos, watermarks, UI elements, brand names, subtitles",
   "video_parameters": {
     "aspect_ratio": "9:16",
     "duration_sec": 2.5,
@@ -112,9 +113,9 @@ Enriched segment metadata:
 **Output**
 {
   "generation_provider": "xai",
-  "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. No text, logos, UI overlays, or watermarks. Natural camera movement and realistic lighting.",
-  "scene_prompt": "A content creator in an indoor workspace looking frustrated while setting up a camera or reviewing footage on a laptop, conveying time-consuming effort and high cost. Natural, relatable environment.",
-  "negative_prompt": "text overlays, captions, logos, watermarks, UI elements, brand names, subtitles",
   "video_parameters": {
     "aspect_ratio": "9:16",
     "duration_sec": 2.5,
@@ -156,9 +157,9 @@ Enriched segment metadata:
 **Output**
 {
   "generation_provider": "xai",
-  "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. No text, logos, UI overlays, or watermarks. Natural camera movement and realistic lighting.",
-  "scene_prompt": "A fast-paced montage of premium luxury-style visuals such as elegant interiors, modern cityscapes, and refined lifestyle shots, conveying high value and exclusivity without any text or branding.",
-  "negative_prompt": "text overlays, captions, logos, watermarks, UI elements, brand names, subtitles, price tags",
   "video_parameters": {
     "aspect_ratio": "9:16",
     "duration_sec": 2.5,

 Rules:
 - Do NOT generate the video itself.
+- Do NOT include any text, words, letters, numbers, or subtitles in the visual scene.
+- Do NOT invent brand names, logos, UI, screens with text, or text overlays.
+- The system_prompt should explicitly forbid text and UI.
+- The scene_prompt should describe ONLY the visual scene implied by the metadata, focusing on cinematic lighting and composition.
+- The negative_prompt should be exhaustive against text, logos, watermarks, and UI elements.
 - Video must be suitable for short-form vertical social media.
 - Prefer realistic, clean, premium visuals.
 - Use the metadata fields directly; do not add new concepts.
 **Output**
 {
   "generation_provider": "xai",
+  "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. Absolutely NO text, logos, UI overlays, screens with text, or watermarks. Natural camera movement and realistic lighting.",
+  "scene_prompt": "A modern content creator sitting at a clean desk in a home studio, typing on a sleek laptop with a blank or blurred screen, smartphone placed nearby. Clean workspace, neutral background, cinematic lighting, shallow depth of field.",
+  "negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, screen text, computer interface, icons",
   "video_parameters": {
     "aspect_ratio": "9:16",
     "duration_sec": 2.5,
 **Output**
 {
   "generation_provider": "xai",
+  "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. Absolutely NO text, logos, UI overlays, screens with text, or watermarks. Natural camera movement.",
+  "scene_prompt": "A content creator in an indoor workspace looking frustrated while setting up a camera or looking at a laptop with blurred screen. Lighting adds a dramatic mood. No visible text on screens.",
+  "negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, screen text, computer interface",
   "video_parameters": {
     "aspect_ratio": "9:16",
     "duration_sec": 2.5,
 **Output**
 {
   "generation_provider": "xai",
+  "system_prompt": "Generate a vertical cinematic video with realistic visuals. Style should be clean, modern, and premium. Absolutely NO text, logos, UI overlays, or watermarks. Natural camera movement.",
+  "scene_prompt": "A fast-paced montage of premium luxury-style visuals such as elegant interiors, modern cityscapes, and refined lifestyle shots, conveying high value and exclusivity. No text or charts.",
+  "negative_prompt": "text, words, letters, numbers, alphabets, subtitles, captions, logos, watermarks, UI elements, brand names, price tags",
   "video_parameters": {
     "aspect_ratio": "9:16",
     "duration_sec": 2.5,

src/video_generation_process.py CHANGED Viewed

@@ -33,8 +33,14 @@ async def generate_video_process(prompt: str, duration: int, image_input: str =
             from src.grok_src.grok_video_generator import GrokVideoGenerator
             logger.info("Using Grok SDK for video generation...")
             generator = GrokVideoGenerator()
-            # If image_input is provided, it's an image-to-video request
-            return generator.generate_video(prompt, duration=duration, image_url=image_input)
         except Exception as e:
             logger.error(f"Grok video generation failed: {e}")
             # Fallback to Runway or raise?

             from src.grok_src.grok_video_generator import GrokVideoGenerator
             logger.info("Using Grok SDK for video generation...")
             generator = GrokVideoGenerator()
+            # Forward negative_prompt and system_prompt so they get merged into the Grok prompt
+            return generator.generate_video(
+                prompt,
+                duration=duration,
+                image_url=image_input,
+                negative_prompt=kwargs.get("negative_prompt", ""),
+                system_prompt=kwargs.get("system_prompt", ""),
+            )
         except Exception as e:
             logger.error(f"Grok video generation failed: {e}")
             # Fallback to Runway or raise?