Spaces:

Manireddy1508
/

imagetoimage

Paused

App Files Community

Manireddy1508 committed on Apr 7, 2025

Commit

acc4043

verified ·

1 Parent(s): b574e01

Update utils/planner.py

Browse files

Files changed (1) hide show

utils/planner.py +54 -86

utils/planner.py CHANGED Viewed

@@ -1,35 +1,35 @@
-# utils/planner.py
 import os
 import json
 from dotenv import load_dotenv
 from openai import OpenAI
 from PIL import Image
 import torch
-from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPTokenizer
 # ----------------------------
-# 🔐 Load Environment & GPT Client
 # ----------------------------
 load_dotenv()
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 # ----------------------------
-# 🧠 Load BLIP & CLIP Tokenizer
 # ----------------------------
-device = "cuda" if torch.cuda.is_available() else "cpu"
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
-clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 # ----------------------------
-# 📁 Log Path
 # ----------------------------
-LOG_PATH = "logs/prompt_log.jsonl"
-os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
 # ----------------------------
-# 📸 Generate Caption from Image
 # ----------------------------
 def generate_blip_caption(image: Image.Image) -> str:
     try:
@@ -43,7 +43,7 @@ def generate_blip_caption(image: Image.Image) -> str:
         return "a product image"
 # ----------------------------
-# 🧠 Extract Scene Plan from GPT
 # ----------------------------
 SCENE_SYSTEM_INSTRUCTIONS = """
 You are a scene planning assistant for an AI image generation system.
@@ -70,106 +70,74 @@ def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
             temperature=0.3,
             max_tokens=500
         )
-        json_output = response.choices[0].message.content
-        print("🧠 Scene Plan (Raw):", json_output)
-        return json.loads(json_output)
     except Exception as e:
         print("❌ extract_scene_plan() Error:", e)
         return {
-            "scene": None,
-            "subject": None,
             "objects": [],
             "layout": {},
             "rules": {}
         }
 # ----------------------------
-# 🧠 Generate Positive Prompt Variations (CLIP-safe)
 # ----------------------------
 def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
-    try:
-        system_msg = f"""
-You are a creative prompt variation generator for an AI image generation system.
-Given a base user prompt and its structured scene plan, generate {n} diverse image generation prompts.
-Each prompt must:
-- Be visually rich and descriptive
-- Include stylistic or contextual variation
-- Reference the same product and environment
-- Stay faithful to the base prompt and extracted plan
-- Be under 77 tokens when tokenized using a CLIP tokenizer
-Respond ONLY with a JSON array of strings. No explanations.
-"""
-        response = client.chat.completions.create(
-            model="gpt-4o-mini-2024-07-18",
-            messages=[
-                {"role": "system", "content": system_msg},
-                {"role": "user", "content": json.dumps({
-                    "base_prompt": base_prompt,
-                    "scene_plan": scene_plan
-                })}
-            ],
-            temperature=0.7,
-            max_tokens=600
         )
-        content = response.choices[0].message.content
-        all_prompts = json.loads(content)
-        filtered = []
-        for p in all_prompts:
-            token_count = len(clip_tokenizer(p)["input_ids"])
-            if token_count <= 77:
-                filtered.append(p)
-        print("🧠 Filtered Prompts (<=77 tokens):", filtered)
-        return filtered or [base_prompt]
-    except Exception as e:
-        print("❌ generate_prompt_variations_from_scene() Error:", e)
-        return [base_prompt]
 # ----------------------------
-# 🧠 Generate Negative Prompt Automatically
 # ----------------------------
-def generate_negative_prompt_from_scene(scene_plan: dict) -> str:
-    try:
-        system_msg = """
-You are an assistant that generates negative prompts for an image generation model.
-Based on the structured scene plan, return a list of things that should NOT appear in the image,
-such as incorrect objects, extra limbs, distorted hands, text, watermark, etc.
-Return a single negative prompt string (comma-separated values). No explanations.
 """
         response = client.chat.completions.create(
             model="gpt-4o-mini-2024-07-18",
             messages=[
-                {"role": "system", "content": system_msg},
                 {"role": "user", "content": json.dumps(scene_plan)}
             ],
-            temperature=0.3,
-            max_tokens=150
         )
-        negative_prompt = response.choices[0].message.content.strip()
-        print("🚫 Negative Prompt (GPT):", negative_prompt)
-        return negative_prompt
     except Exception as e:
-        print("❌ generate_negative_prompt_from_scene() Error:", e)
-        return "deformed hands, extra limbs, text, watermark, signature"
-# ----------------------------
-# 📝 Save Logs
-# ----------------------------
-def save_generation_log(caption, scene_plan, prompts, negative_prompt):
-    log = {
-        "blip_caption": caption,
-        "scene_plan": scene_plan,
-        "enriched_prompts": prompts,
-        "negative_prompt": negative_prompt
-    }
-    with open(LOG_PATH, "a") as f:
-        f.write(json.dumps(log, indent=2) + "\n")

 import os
 import json
 from dotenv import load_dotenv
 from openai import OpenAI
 from PIL import Image
 import torch
+from transformers import (
+    BlipProcessor,
+    BlipForConditionalGeneration,
+    CLIPTokenizer
+)
 # ----------------------------
+# 🔐 Load API Keys & Setup
 # ----------------------------
 load_dotenv()
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+device = "cuda" if torch.cuda.is_available() else "cpu"
 # ----------------------------
+# 📸 Load BLIP Captioning Model
 # ----------------------------
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
 # ----------------------------
+# 🧠 Load CLIP Tokenizer (for token limit check)
 # ----------------------------
+tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 # ----------------------------
+# 📸 Generate Caption from Product Image
 # ----------------------------
 def generate_blip_caption(image: Image.Image) -> str:
     try:
         return "a product image"
 # ----------------------------
+# 🧠 GPT Scene Planning
 # ----------------------------
 SCENE_SYSTEM_INSTRUCTIONS = """
 You are a scene planning assistant for an AI image generation system.
             temperature=0.3,
             max_tokens=500
         )
+        content = response.choices[0].message.content
+        print("🧠 Scene Plan (Raw):", content)
+        # Optional logging
+        os.makedirs("logs", exist_ok=True)
+        with open("logs/scene_plans.jsonl", "a") as f:
+            f.write(json.dumps({"caption": caption, "prompt": prompt, "scene_plan": content}) + "\n")
+        return json.loads(content)
     except Exception as e:
         print("❌ extract_scene_plan() Error:", e)
         return {
+            "scene": "studio",
+            "subject": "product",
             "objects": [],
             "layout": {},
             "rules": {}
         }
 # ----------------------------
+# ✨ Generate Prompt Variations
 # ----------------------------
 def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
+    variations = []
+    for i in range(n):
+        enriched_prompt = (
+            f"{scene_plan.get('subject', 'a product')} "
+            f"in a {scene_plan.get('scene', 'studio setting')} "
+            f"with {', '.join(scene_plan.get('objects', []))} "
+            f"and layout details like {scene_plan.get('layout', {})}. "
+            f"{scene_plan.get('rules', '')}"
         )
+        # Enforce 77-token limit for SDXL
+        tokens = tokenizer(enriched_prompt)["input_ids"]
+        if len(tokens) > 77:
+            enriched_prompt = tokenizer.decode(tokens[:77], skip_special_tokens=True)
+        variations.append(enriched_prompt.strip())
+    return variations
 # ----------------------------
+# ❌ Generate Negative Prompt
 # ----------------------------
+NEGATIVE_SYSTEM_PROMPT = """
+You are a prompt engineer. Given a structured scene plan, generate a short negative prompt
+to suppress unwanted visual elements such as: distortion, blurriness, poor anatomy,
+logo errors, background noise, or low realism.
+Return a single comma-separated list. No intro text.
 """
+def generate_negative_prompt_from_scene(scene_plan: dict) -> str:
+    try:
         response = client.chat.completions.create(
             model="gpt-4o-mini-2024-07-18",
             messages=[
+                {"role": "system", "content": NEGATIVE_SYSTEM_PROMPT},
                 {"role": "user", "content": json.dumps(scene_plan)}
             ],
+            temperature=0.2,
+            max_tokens=100
         )
+        negative = response.choices[0].message.content.strip()
+        return negative
     except Exception as e:
+        print("❌ Negative Prompt Error:", e)
+        return "blurry, distorted, low quality, deformed, watermark"