# -*- coding: utf-8 -*- """ Prompt generation utilities for different inference types """ from typing import Dict, List, Tuple, Optional def create_prompt_templates(): """Create prompt templates for various tasks""" templates = { "text_understanding": "You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer.", "image_generation": "Generate an image according to the text prompt.", "image_editing": "Generate an image applying the following editing instruction based on the original image.", "dense_prediction": "Perform dense prediction on the given images.", "control_generation": "Generate an image according to the text prompt and the given control image.", "subject_generation": "Generate an image according to the text prompt and the given object image.", "multi_view": "Generate a view-image based on the given image.", "style_transfer": "Transform the current image into the style of the provided image." } return templates def generate_text_to_image_prompt(prompt_text: str, templates: Optional[Dict] = None) -> Tuple[str, str]: """ Generate prompt for text-to-image generation Args: prompt_text: User input text prompt templates: Optional prompt templates dict Returns: Tuple of (input_prompt, unconditional_prompt) """ if templates is None: templates = create_prompt_templates() system_prompt = templates["image_generation"] input_prompt = "" + system_prompt + "" + "" + prompt_text + "" uncon_prompt = "" + system_prompt + "" + "" + "" + "" return input_prompt, uncon_prompt def generate_image_to_image_prompt( prompt_text: str, edit_type: str, templates: Optional[Dict] = None, **kwargs ) -> Tuple[str, str, str]: """ Generate prompt for image-to-image generation Args: prompt_text: User input text prompt edit_type: Type of editing operation templates: Optional prompt templates dict **kwargs: Additional parameters for specific edit types Returns: Tuple of (input_prompt, unconditional_prompt, system_prompt) """ if templates is None: templates = create_prompt_templates() # Determine system prompt and processed prompt text based on edit type if 'dense' in edit_type: des = { "canny": "canny edge map", "hed": "hed edge map", "normal": "normal map", "sam2mask": "sam2 mask", "depth": "depth map", "openpose": "pose estimation map" } system_prompt = templates["dense_prediction"] prompt_text_used = f"Generate a {des.get(edit_type.split('_')[0], 'dense map')} according to the image." elif 'control' in edit_type: system_prompt = templates["control_generation"] prompt_text_used = prompt_text elif 'subject' in edit_type: system_prompt = templates["subject_generation"] prompt_text_used = prompt_text elif 'edit' in edit_type: system_prompt = templates["image_editing"] prompt_text_used = prompt_text elif "ref_transfer" in edit_type: system_prompt = templates["style_transfer"] prompt_text_used = "Transform the current image into the style of the provided image." elif 'multi_view' in edit_type: system_prompt = templates["multi_view"] prompt_text_used = f"Generate the {edit_type.split('_')[-1]} view based on the provided front view." else: system_prompt = "Generate an image according to the prompt and image." prompt_text_used = prompt_text # Build final prompts input_prompt = "" + system_prompt + "" + "" + prompt_text_used + "" uncon_prompt = "" + system_prompt + "" + "" + "" + "" return input_prompt, uncon_prompt, system_prompt def generate_multimodal_understanding_prompt(question: str, templates: Optional[Dict] = None) -> str: """ Generate prompt for multimodal understanding (MMU) Args: question: User question about the image templates: Optional prompt templates dict Returns: Formatted input prompt """ if templates is None: templates = create_prompt_templates() system_prompt = "You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer." input_prompt = "" + system_prompt + "" + "" + question + "" return input_prompt def get_edit_type_specific_prompt(edit_type: str, prompt_text: str, templates: Optional[Dict] = None) -> str: """ Get edit type specific prompt text Args: edit_type: Type of editing operation prompt_text: Original prompt text templates: Optional prompt templates dict Returns: Processed prompt text for the specific edit type """ if templates is None: templates = create_prompt_templates() if 'dense' in edit_type: des = { "canny": "canny edge map", "hed": "hed edge map", "normal": "normal map", "sam2mask": "sam2 mask", "depth": "depth map", "openpose": "pose estimation map" } return f"Generate a {des.get(edit_type.split('_')[0], 'dense map')} according to the image." elif 'control' in edit_type: return prompt_text elif 'subject' in edit_type: return prompt_text elif 'edit' in edit_type: if "multiturn" in edit_type: ids = int(edit_type.split("_")[-1]) if ids == 0: return prompt_text[0] if isinstance(prompt_text, list) else prompt_text else: return prompt_text[ids][0] if isinstance(prompt_text[ids], list) else prompt_text[ids] else: return prompt_text elif "ref_transfer" in edit_type: return "Transform the current image into the style of the provided image." elif 'multi_view' in edit_type: return f"Generate the {edit_type.split('_')[-1]} view based on the provided front view." else: return prompt_text def get_system_prompt_for_edit_type(edit_type: str, templates: Optional[Dict] = None) -> str: """ Get system prompt for specific edit type Args: edit_type: Type of editing operation templates: Optional prompt templates dict Returns: System prompt for the edit type """ if templates is None: templates = create_prompt_templates() if 'dense' in edit_type: return templates["dense_prediction"] elif 'control' in edit_type: return templates["control_generation"] elif 'subject' in edit_type: return templates["subject_generation"] elif 'edit' in edit_type: return templates["image_editing"] elif "ref_transfer" in edit_type: return templates["style_transfer"] elif 'multi_view' in edit_type: return templates["multi_view"] else: return "Generate an image according to the prompt and image." def generate_text_image_to_text_image_prompt(prompt_text, system_prompt): """ Generate prompts for TI2TI tasks Args: prompt_text: User's editing instruction system_prompt: System prompt for the task Returns: input_prompt: Conditional prompt uncon_text: Unconditional prompt """ # Conditional prompt input_prompt = ( f"{system_prompt}" f"{prompt_text}" ) # Unconditional prompt (for CFG) uncon_text = ( f"{system_prompt}" f"" ) return input_prompt, uncon_text