Spaces:
Paused
Paused
Update utils/planner.py
Browse files- utils/planner.py +41 -24
utils/planner.py
CHANGED
|
@@ -24,7 +24,7 @@ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base
|
|
| 24 |
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
|
| 25 |
|
| 26 |
# ----------------------------
|
| 27 |
-
# 🧠 Load CLIP Tokenizer (for
|
| 28 |
# ----------------------------
|
| 29 |
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
| 30 |
|
|
@@ -49,8 +49,8 @@ SCENE_SYSTEM_INSTRUCTIONS = """
|
|
| 49 |
You are a scene planning assistant for an AI image generation system.
|
| 50 |
Your job is to take a caption from a product image and a user prompt, then return a structured JSON with:
|
| 51 |
- scene (environment, setting)
|
| 52 |
-
- subject (
|
| 53 |
-
- objects (
|
| 54 |
- layout (foreground/background elements and their placement)
|
| 55 |
- rules (validation rules to ensure visual correctness)
|
| 56 |
Respond ONLY in raw JSON format. Do NOT include explanations.
|
|
@@ -83,36 +83,53 @@ def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
|
|
| 83 |
except Exception as e:
|
| 84 |
print("❌ extract_scene_plan() Error:", e)
|
| 85 |
return {
|
| 86 |
-
"scene": "studio",
|
| 87 |
-
"subject": "product",
|
| 88 |
-
"objects":
|
| 89 |
"layout": {},
|
| 90 |
"rules": {}
|
| 91 |
}
|
| 92 |
|
| 93 |
# ----------------------------
|
| 94 |
-
# ✨
|
| 95 |
# ----------------------------
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
for i in range(n):
|
| 100 |
-
enriched_prompt = (
|
| 101 |
-
f"{scene_plan.get('subject', 'a product')} "
|
| 102 |
-
f"in a {scene_plan.get('scene', 'studio setting')} "
|
| 103 |
-
f"with {', '.join(scene_plan.get('objects', []))} "
|
| 104 |
-
f"and layout details like {scene_plan.get('layout', {})}. "
|
| 105 |
-
f"{scene_plan.get('rules', '')}"
|
| 106 |
-
)
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
|
| 113 |
-
|
|
|
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
# ----------------------------
|
| 118 |
# ❌ Generate Negative Prompt
|
|
|
|
| 24 |
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
|
| 25 |
|
| 26 |
# ----------------------------
|
| 27 |
+
# 🧠 Load CLIP Tokenizer (for optional diagnostics)
|
| 28 |
# ----------------------------
|
| 29 |
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
| 30 |
|
|
|
|
| 49 |
You are a scene planning assistant for an AI image generation system.
|
| 50 |
Your job is to take a caption from a product image and a user prompt, then return a structured JSON with:
|
| 51 |
- scene (environment, setting)
|
| 52 |
+
- subject (main_actor)
|
| 53 |
+
- objects (main_product or items)
|
| 54 |
- layout (foreground/background elements and their placement)
|
| 55 |
- rules (validation rules to ensure visual correctness)
|
| 56 |
Respond ONLY in raw JSON format. Do NOT include explanations.
|
|
|
|
| 83 |
except Exception as e:
|
| 84 |
print("❌ extract_scene_plan() Error:", e)
|
| 85 |
return {
|
| 86 |
+
"scene": {"environment": "studio", "setting": "plain white background"},
|
| 87 |
+
"subject": {"main_actor": "a product"},
|
| 88 |
+
"objects": {"main_product": "product"},
|
| 89 |
"layout": {},
|
| 90 |
"rules": {}
|
| 91 |
}
|
| 92 |
|
| 93 |
# ----------------------------
# ✨ GPT-Powered Prompt Variations (77-tokens safe)
# ----------------------------
# System prompt sent verbatim to the chat model by
# generate_prompt_variations_from_scene(); the string body is the
# model-facing contract (incl. the 77-token CLIP/SDXL limit) — edit with care.
ENRICHED_PROMPT_INSTRUCTIONS = """
You are a prompt engineer for an AI image generation model.
Given a structured scene plan and product prompt, generate a visually descriptive enriched prompt that:

1. Describes the subject, product, setting, and layout clearly
2. Stays strictly under 77 tokens (CLIP limit for SDXL)
3. Is natural, realistic, and suitable for Stable Diffusion XL
4. Does NOT include quotes, explanations, or bullet points — just the enriched prompt

Return only the prompt as a string.
"""
|
| 107 |
|
| 108 |
+
def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
    """Return ``n`` enriched prompt strings derived from a scene plan.

    Each variation is requested from the chat model using
    ``ENRICHED_PROMPT_INSTRUCTIONS`` as the system message. If any attempt
    fails (API error, missing client, tokenizer issue, ...), the original
    ``base_prompt`` is appended instead, so the caller always receives
    exactly ``n`` prompts.

    Args:
        scene_plan: Structured scene JSON produced by extract_scene_plan().
        base_prompt: The user's original product prompt (also the fallback).
        n: Number of variations to produce (default 3).

    Returns:
        A list of ``n`` prompt strings.
    """
    # The request text is invariant across iterations, so build it once.
    request_text = (
        f"Scene Plan:\n{json.dumps(scene_plan)}\n\n"
        f"Original User Prompt:\n{base_prompt}"
    )

    variations = []
    while len(variations) < n:
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini-2024-07-18",
                messages=[
                    {"role": "system", "content": ENRICHED_PROMPT_INSTRUCTIONS},
                    {"role": "user", "content": request_text},
                ],
                temperature=0.5,
                max_tokens=100,
            )
            enriched = completion.choices[0].message.content.strip()

            # Diagnostic only: report how many CLIP tokens the prompt uses
            # (the instructions ask the model to stay under the 77-token limit).
            token_count = len(tokenizer(enriched)["input_ids"])
            print(f"📝 Enriched Prompt ({token_count} tokens): {enriched}")

            variations.append(enriched)
        except Exception as e:
            # Best-effort fallback: never fail the batch, reuse the base prompt.
            print("⚠️ Prompt variation fallback:", e)
            variations.append(base_prompt)
    return variations
|
| 133 |
|
| 134 |
# ----------------------------
|
| 135 |
# ❌ Generate Negative Prompt
|