Spaces:
Paused
Paused
Update utils/planner.py
Browse files- utils/planner.py +41 -24
utils/planner.py
CHANGED
|
@@ -24,7 +24,7 @@ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base
|
|
| 24 |
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
|
| 25 |
|
| 26 |
# ----------------------------
|
| 27 |
-
# 🧠 Load CLIP Tokenizer (for
|
| 28 |
# ----------------------------
|
| 29 |
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
| 30 |
|
|
@@ -49,8 +49,8 @@ SCENE_SYSTEM_INSTRUCTIONS = """
|
|
| 49 |
You are a scene planning assistant for an AI image generation system.
|
| 50 |
Your job is to take a caption from a product image and a user prompt, then return a structured JSON with:
|
| 51 |
- scene (environment, setting)
|
| 52 |
-
- subject (
|
| 53 |
-
- objects (
|
| 54 |
- layout (foreground/background elements and their placement)
|
| 55 |
- rules (validation rules to ensure visual correctness)
|
| 56 |
Respond ONLY in raw JSON format. Do NOT include explanations.
|
|
@@ -83,36 +83,53 @@ def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
|
|
| 83 |
except Exception as e:
|
| 84 |
print("❌ extract_scene_plan() Error:", e)
|
| 85 |
return {
|
| 86 |
-
"scene": "studio",
|
| 87 |
-
"subject": "product",
|
| 88 |
-
"objects":
|
| 89 |
"layout": {},
|
| 90 |
"rules": {}
|
| 91 |
}
|
| 92 |
|
| 93 |
# ----------------------------
|
| 94 |
-
# ✨
|
| 95 |
# ----------------------------
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
for i in range(n):
|
| 100 |
-
enriched_prompt = (
|
| 101 |
-
f"{scene_plan.get('subject', 'a product')} "
|
| 102 |
-
f"in a {scene_plan.get('scene', 'studio setting')} "
|
| 103 |
-
f"with {', '.join(scene_plan.get('objects', []))} "
|
| 104 |
-
f"and layout details like {scene_plan.get('layout', {})}. "
|
| 105 |
-
f"{scene_plan.get('rules', '')}"
|
| 106 |
-
)
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
|
| 113 |
-
|
|
|
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
# ----------------------------
|
| 118 |
# ❌ Generate Negative Prompt
|
|
|
|
| 24 |
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
|
| 25 |
|
| 26 |
# ----------------------------
|
| 27 |
+
# 🧠 Load CLIP Tokenizer (for optional diagnostics)
|
| 28 |
# ----------------------------
|
| 29 |
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
| 30 |
|
|
|
|
| 49 |
You are a scene planning assistant for an AI image generation system.
|
| 50 |
Your job is to take a caption from a product image and a user prompt, then return a structured JSON with:
|
| 51 |
- scene (environment, setting)
|
| 52 |
+
- subject (main_actor)
|
| 53 |
+
- objects (main_product or items)
|
| 54 |
- layout (foreground/background elements and their placement)
|
| 55 |
- rules (validation rules to ensure visual correctness)
|
| 56 |
Respond ONLY in raw JSON format. Do NOT include explanations.
|
|
|
|
| 83 |
except Exception as e:
|
| 84 |
print("❌ extract_scene_plan() Error:", e)
|
| 85 |
return {
|
| 86 |
+
"scene": {"environment": "studio", "setting": "plain white background"},
|
| 87 |
+
"subject": {"main_actor": "a product"},
|
| 88 |
+
"objects": {"main_product": "product"},
|
| 89 |
"layout": {},
|
| 90 |
"rules": {}
|
| 91 |
}
|
| 92 |
|
| 93 |
# ----------------------------
# ✨ GPT-Powered Prompt Variations (77-tokens safe)
# ----------------------------
# System prompt sent verbatim to the chat model by
# generate_prompt_variations_from_scene(); the string body is the
# model-facing contract (incl. the 77-token CLIP/SDXL limit) — edit with care.
ENRICHED_PROMPT_INSTRUCTIONS = """
You are a prompt engineer for an AI image generation model.
Given a structured scene plan and product prompt, generate a visually descriptive enriched prompt that:

1. Describes the subject, product, setting, and layout clearly
2. Stays strictly under 77 tokens (CLIP limit for SDXL)
3. Is natural, realistic, and suitable for Stable Diffusion XL
4. Does NOT include quotes, explanations, or bullet points — just the enriched prompt

Return only the prompt as a string.
"""
|
| 107 |
|
| 108 |
+
def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
    """Return ``n`` enriched prompt strings derived from a scene plan.

    Each variation is requested from the chat model using
    ``ENRICHED_PROMPT_INSTRUCTIONS`` as the system message. If any attempt
    fails (API error, missing client, tokenizer issue, ...), the original
    ``base_prompt`` is appended instead, so the caller always receives
    exactly ``n`` prompts.

    Args:
        scene_plan: Structured scene JSON produced by extract_scene_plan().
        base_prompt: The user's original product prompt (also the fallback).
        n: Number of variations to produce (default 3).

    Returns:
        A list of ``n`` prompt strings.
    """
    # The request text is invariant across iterations, so build it once.
    request_text = (
        f"Scene Plan:\n{json.dumps(scene_plan)}\n\n"
        f"Original User Prompt:\n{base_prompt}"
    )

    variations = []
    while len(variations) < n:
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini-2024-07-18",
                messages=[
                    {"role": "system", "content": ENRICHED_PROMPT_INSTRUCTIONS},
                    {"role": "user", "content": request_text},
                ],
                temperature=0.5,
                max_tokens=100,
            )
            enriched = completion.choices[0].message.content.strip()

            # Diagnostic only: report how many CLIP tokens the prompt uses
            # (the instructions ask the model to stay under the 77-token limit).
            token_count = len(tokenizer(enriched)["input_ids"])
            print(f"📝 Enriched Prompt ({token_count} tokens): {enriched}")

            variations.append(enriched)
        except Exception as e:
            # Best-effort fallback: never fail the batch, reuse the base prompt.
            print("⚠️ Prompt variation fallback:", e)
            variations.append(base_prompt)
    return variations
|
| 133 |
|
| 134 |
# ----------------------------
|
| 135 |
# ❌ Generate Negative Prompt
|