Manireddy1508 committed on
Commit
054edad
·
verified ·
1 Parent(s): 64855f6

Update utils/planner.py

Browse files
Files changed (1) hide show
  1. utils/planner.py +41 -24
utils/planner.py CHANGED
@@ -24,7 +24,7 @@ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base
24
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
25
 
26
  # ----------------------------
27
- # 🧠 Load CLIP Tokenizer (for token limit check)
28
  # ----------------------------
29
  tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
30
 
@@ -49,8 +49,8 @@ SCENE_SYSTEM_INSTRUCTIONS = """
49
  You are a scene planning assistant for an AI image generation system.
50
  Your job is to take a caption from a product image and a user prompt, then return a structured JSON with:
51
  - scene (environment, setting)
52
- - subject (main actor)
53
- - objects (main product or items)
54
  - layout (foreground/background elements and their placement)
55
  - rules (validation rules to ensure visual correctness)
56
  Respond ONLY in raw JSON format. Do NOT include explanations.
@@ -83,36 +83,53 @@ def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
83
  except Exception as e:
84
  print("❌ extract_scene_plan() Error:", e)
85
  return {
86
- "scene": "studio",
87
- "subject": "product",
88
- "objects": [],
89
  "layout": {},
90
  "rules": {}
91
  }
92
 
93
  # ----------------------------
94
- # ✨ Generate Prompt Variations
95
  # ----------------------------
96
- def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
97
- variations = []
98
-
99
- for i in range(n):
100
- enriched_prompt = (
101
- f"{scene_plan.get('subject', 'a product')} "
102
- f"in a {scene_plan.get('scene', 'studio setting')} "
103
- f"with {', '.join(scene_plan.get('objects', []))} "
104
- f"and layout details like {scene_plan.get('layout', {})}. "
105
- f"{scene_plan.get('rules', '')}"
106
- )
107
 
108
- # Enforce 77-token limit for SDXL
109
- tokens = tokenizer(enriched_prompt)["input_ids"]
110
- if len(tokens) > 77:
111
- enriched_prompt = tokenizer.decode(tokens[:77], skip_special_tokens=True)
112
 
113
- variations.append(enriched_prompt.strip())
 
114
 
115
- return variations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  # ----------------------------
118
  # ❌ Generate Negative Prompt
 
24
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
25
 
26
  # ----------------------------
27
+ # 🧠 Load CLIP Tokenizer (for optional diagnostics)
28
  # ----------------------------
29
  tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
30
 
 
49
  You are a scene planning assistant for an AI image generation system.
50
  Your job is to take a caption from a product image and a user prompt, then return a structured JSON with:
51
  - scene (environment, setting)
52
+ - subject (main_actor)
53
+ - objects (main_product or items)
54
  - layout (foreground/background elements and their placement)
55
  - rules (validation rules to ensure visual correctness)
56
  Respond ONLY in raw JSON format. Do NOT include explanations.
 
83
  except Exception as e:
84
  print("❌ extract_scene_plan() Error:", e)
85
  return {
86
+ "scene": {"environment": "studio", "setting": "plain white background"},
87
+ "subject": {"main_actor": "a product"},
88
+ "objects": {"main_product": "product"},
89
  "layout": {},
90
  "rules": {}
91
  }
92
 
93
  # ----------------------------
94
+ # ✨ GPT-Powered Prompt Variations (77-tokens safe)
95
  # ----------------------------
96
# System instructions sent to the chat model when enriching a prompt.
# NOTE(review): rule 2 is only a *request* to the model — the 77-token CLIP
# limit is not enforced by this string itself; verify the caller truncates.
ENRICHED_PROMPT_INSTRUCTIONS = """
You are a prompt engineer for an AI image generation model.
Given a structured scene plan and product prompt, generate a visually descriptive enriched prompt that:

1. Describes the subject, product, setting, and layout clearly
2. Stays strictly under 77 tokens (CLIP limit for SDXL)
3. Is natural, realistic, and suitable for Stable Diffusion XL
4. Does NOT include quotes, explanations, or bullet points — just the enriched prompt

Return only the prompt as a string.
"""
107
 
108
def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
    """Generate ``n`` GPT-enriched prompt variations from a scene plan.

    Args:
        scene_plan: Structured plan (scene/subject/objects/layout/rules)
            produced by ``extract_scene_plan``.
        base_prompt: The original user prompt.
        n: Number of variations to request (default 3).

    Returns:
        A list of ``n`` prompt strings, each hard-capped at 77 CLIP tokens.
        On any API failure, ``base_prompt`` is appended as a fallback for
        that slot, so the list length is always ``n``.
    """
    prompts = []
    for _ in range(n):
        try:
            user_input = f"Scene Plan:\n{json.dumps(scene_plan)}\n\nOriginal User Prompt:\n{base_prompt}"
            response = client.chat.completions.create(
                model="gpt-4o-mini-2024-07-18",
                messages=[
                    {"role": "system", "content": ENRICHED_PROMPT_INSTRUCTIONS},
                    {"role": "user", "content": user_input}
                ],
                temperature=0.5,
                max_tokens=100
            )
            enriched = response.choices[0].message.content.strip()

            # The system instructions only *ask* the model to stay under the
            # 77-token CLIP limit for SDXL — enforce it here by truncating,
            # as the previous revision of this function did.
            tokens = tokenizer(enriched)["input_ids"]
            token_count = len(tokens)
            if token_count > 77:
                enriched = tokenizer.decode(tokens[:77], skip_special_tokens=True).strip()
            print(f"📝 Enriched Prompt ({token_count} tokens): {enriched}")

            prompts.append(enriched)
        except Exception as e:
            # Best-effort: never fail the whole batch because one call errored.
            print("⚠️ Prompt variation fallback:", e)
            prompts.append(base_prompt)
    return prompts
133
 
134
  # ----------------------------
135
  # ❌ Generate Negative Prompt