File size: 8,557 Bytes
9b58924
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# -*- coding: utf-8 -*-
"""

Prompt generation utilities for different inference types

"""
from typing import Dict, List, Tuple, Optional

def create_prompt_templates():
    """Build the default mapping from task name to system-prompt text.

    Returns:
        A freshly created dict (a new object on every call) whose keys are
        task identifiers such as ``"image_generation"`` and whose values are
        the system prompts used for those tasks.
    """
    # Kept as ordered pairs so the resulting dict preserves this key order.
    task_prompts = (
        ("text_understanding", "You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer."),
        ("image_generation", "Generate an image according to the text prompt."),
        ("image_editing", "Generate an image applying the following editing instruction based on the original image."),
        ("dense_prediction", "Perform dense prediction on the given images."),
        ("control_generation", "Generate an image according to the text prompt and the given control image."),
        ("subject_generation", "Generate an image according to the text prompt and the given object image."),
        ("multi_view", "Generate a view-image based on the given image."),
        ("style_transfer", "Transform the current image into the style of the provided image."),
    )
    return dict(task_prompts)


def generate_text_to_image_prompt(prompt_text: str, templates: Optional[Dict] = None) -> Tuple[str, str]:
    """Build the conditional and unconditional prompts for text-to-image.

    Args:
        prompt_text: User input text prompt.
        templates: Optional prompt templates dict; must contain the
            ``"image_generation"`` key. When ``None``, the defaults from
            ``create_prompt_templates()`` are used.

    Returns:
        Tuple of ``(input_prompt, unconditional_prompt)``. Both share the same
        ``<system>`` segment; the unconditional prompt carries the literal
        ``<uncondition>`` placeholder in its ``<user>`` segment.
    """
    templates = create_prompt_templates() if templates is None else templates

    # Both prompts start with the same system segment, so build it once.
    system_segment = "<system>" + templates["image_generation"] + "</system>"

    conditional = system_segment + "<user>" + prompt_text + "</user>"
    unconditional = system_segment + "<user>" + "<uncondition>" + "</user>"
    return conditional, unconditional


def generate_image_to_image_prompt(
    prompt_text: str,
    edit_type: str,
    templates: Optional[Dict] = None,
    **kwargs
) -> Tuple[str, str, str]:
    """Build the prompts for an image-to-image task selected by ``edit_type``.

    The edit type is matched by substring, in this priority order: ``dense``,
    ``control``, ``subject``, ``edit``, ``ref_transfer``, ``multi_view``. The
    first match decides both the system prompt and the user text. Edit types
    matching none of these use a generic system prompt and the caller's text.

    Args:
        prompt_text: User input text prompt. Ignored for dense, ref_transfer
            and multi_view edit types, which use generated user text.
        edit_type: Type of editing operation.
        templates: Optional prompt templates dict. When ``None``, the defaults
            from ``create_prompt_templates()`` are used. Only the key for the
            matched edit type is read.
        **kwargs: Accepted for specific edit types; not used by this function.

    Returns:
        Tuple of ``(input_prompt, unconditional_prompt, system_prompt)``.
    """
    templates = create_prompt_templates() if templates is None else templates

    # Human-readable names for the dense targets, keyed by the first
    # underscore-separated token of the edit type.
    dense_targets = {
        "canny": "canny edge map",
        "hed": "hed edge map",
        "normal": "normal map",
        "sam2mask": "sam2 mask",
        "depth": "depth map",
        "openpose": "pose estimation map",
    }

    # (substring to look for in edit_type, templates key); order is priority.
    routes = (
        ("dense", "dense_prediction"),
        ("control", "control_generation"),
        ("subject", "subject_generation"),
        ("edit", "image_editing"),
        ("ref_transfer", "style_transfer"),
        ("multi_view", "multi_view"),
    )
    matched = next((route for route in routes if route[0] in edit_type), None)

    if matched is None:
        # Unknown edit type: generic system prompt, caller's text as-is.
        system_prompt = "Generate an image according to the prompt and image."
        prompt_text_used = prompt_text
    else:
        keyword, template_key = matched
        system_prompt = templates[template_key]
        if keyword == "dense":
            target = dense_targets.get(edit_type.split("_")[0], "dense map")
            prompt_text_used = f"Generate a {target} according to the image."
        elif keyword == "ref_transfer":
            prompt_text_used = "Transform the current image into the style of the provided image."
        elif keyword == "multi_view":
            # The requested view is the last underscore-separated token.
            view_name = edit_type.split("_")[-1]
            prompt_text_used = f"Generate the {view_name} view based on the provided front view."
        else:
            prompt_text_used = prompt_text

    # Both prompts share the same system segment.
    system_segment = "<system>" + system_prompt + "</system>"
    input_prompt = system_segment + "<user>" + prompt_text_used + "</user>"
    uncon_prompt = system_segment + "<user>" + "<uncondition>" + "</user>"

    return input_prompt, uncon_prompt, system_prompt


def generate_multimodal_understanding_prompt(question: str, templates: Optional[Dict] = None) -> str:
    """Build the prompt for multimodal understanding (MMU).

    The system prompt is read from ``templates["text_understanding"]`` so that
    a caller-supplied templates dict actually takes effect. Previously the
    argument was accepted but ignored in favour of a hard-coded copy of the
    default text.

    Args:
        question: User question about the image.
        templates: Optional prompt templates dict. When ``None``, the defaults
            from ``create_prompt_templates()`` are used. If the dict has no
            ``"text_understanding"`` entry, the built-in default text is used.

    Returns:
        Formatted input prompt: ``<system>...</system><user>question</user>``.
    """
    # Fallback keeps partial template dicts working exactly as before.
    default_system_prompt = "You are a multimodal model that can process both text and images. Answer the following question based on the provided images. Analyze each image and combine relevant details to answer."

    if templates is None:
        templates = create_prompt_templates()

    system_prompt = templates.get("text_understanding", default_system_prompt)
    input_prompt = "<system>" + system_prompt + "</system>" + "<user>" + question + "</user>"

    return input_prompt


def get_edit_type_specific_prompt(edit_type: str, prompt_text: str, templates: Optional[Dict] = None) -> str:
    """Return the user-prompt text to use for a given edit type.

    The edit type is matched by substring, in this priority order: ``dense``,
    ``control``, ``subject``, ``edit``, ``ref_transfer``, ``multi_view``.

    For ``edit`` types that also contain ``multiturn``, the last
    underscore-separated token of ``edit_type`` is parsed as a turn index.
    If ``prompt_text`` is a list, the entry at that index is selected; for a
    non-zero index, an entry that is itself a list contributes its first
    element. A ``prompt_text`` that is not a list is returned unchanged for
    every turn index. Previously a plain string was indexed by character for
    non-zero turns.

    Args:
        edit_type: Type of editing operation.
        prompt_text: Original prompt text. For multiturn edits this may be a
            list of per-turn prompts.
        templates: Optional prompt templates dict. Not used by this function;
            kept so the signature stays parallel to the other helpers.

    Returns:
        Processed prompt text for the specific edit type.

    Raises:
        ValueError: If a multiturn edit type does not end in an integer.
        IndexError: If the turn index is out of range for a list
            ``prompt_text``.
    """
    if 'dense' in edit_type:
        # Keyed by the first underscore-separated token of the edit type.
        des = {
            "canny": "canny edge map",
            "hed": "hed edge map",
            "normal": "normal map",
            "sam2mask": "sam2 mask",
            "depth": "depth map",
            "openpose": "pose estimation map"
        }
        return f"Generate a {des.get(edit_type.split('_')[0], 'dense map')} according to the image."

    if 'control' in edit_type or 'subject' in edit_type:
        return prompt_text

    if 'edit' in edit_type:
        if "multiturn" not in edit_type:
            return prompt_text

        # Parse the index first so a malformed edit type still fails loudly.
        turn_index = int(edit_type.split("_")[-1])

        # A single prompt applies to every turn; never index into a string.
        if not isinstance(prompt_text, list):
            return prompt_text

        if turn_index == 0:
            return prompt_text[0]

        turn_prompt = prompt_text[turn_index]
        return turn_prompt[0] if isinstance(turn_prompt, list) else turn_prompt

    if "ref_transfer" in edit_type:
        return "Transform the current image into the style of the provided image."

    if 'multi_view' in edit_type:
        # The requested view is the last underscore-separated token.
        return f"Generate the {edit_type.split('_')[-1]} view based on the provided front view."

    return prompt_text


def get_system_prompt_for_edit_type(edit_type: str, templates: Optional[Dict] = None) -> str:
    """Look up the system prompt that corresponds to an edit type.

    Args:
        edit_type: Type of editing operation. Matched by substring; the first
            matching keyword in the priority order below wins.
        templates: Optional prompt templates dict. When ``None``, the defaults
            from ``create_prompt_templates()`` are used. Only the key for the
            matched edit type is read.

    Returns:
        System prompt for the edit type, or a generic image-generation prompt
        when no keyword matches.
    """
    templates = create_prompt_templates() if templates is None else templates

    # (substring to look for in edit_type, templates key); order is priority.
    keyword_to_template = (
        ("dense", "dense_prediction"),
        ("control", "control_generation"),
        ("subject", "subject_generation"),
        ("edit", "image_editing"),
        ("ref_transfer", "style_transfer"),
        ("multi_view", "multi_view"),
    )
    for keyword, template_key in keyword_to_template:
        if keyword in edit_type:
            return templates[template_key]

    return "Generate an image according to the prompt and image."

def generate_text_image_to_text_image_prompt(prompt_text, system_prompt):
    """Build the conditional and unconditional prompts for TI2TI tasks.

    Args:
        prompt_text: User's editing instruction.
        system_prompt: System prompt for the task.

    Returns:
        Tuple ``(input_prompt, uncon_text)``: the conditional prompt, and the
        unconditional prompt (for CFG) whose user segment is the literal
        ``<uncondition>`` placeholder.
    """
    conditional_layout = "<system>{}</system><user>{}</user>"
    unconditional_layout = "<system>{}</system><user><uncondition></user>"

    input_prompt = conditional_layout.format(system_prompt, prompt_text)
    uncon_text = unconditional_layout.format(system_prompt)
    return input_prompt, uncon_text