Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -24,32 +24,32 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 24 |
MAX_SEED = np.iinfo(np.int32).max
|
| 25 |
MAX_IMAGE_SIZE = 1024
|
| 26 |
|
| 27 |
-
hf_client = InferenceClient(
|
| 28 |
-
api_key=os.environ.get("HF_TOKEN"),
|
| 29 |
-
)
|
| 30 |
-
VLM_MODEL = "baidu/ERNIE-4.5-VL-424B-A47B-Base-PT"
|
| 31 |
|
| 32 |
-
SYSTEM_PROMPT_TEXT_ONLY = """You are an expert prompt engineer for FLUX.2 by Black Forest Labs. Rewrite user prompts to be more descriptive while strictly preserving their core subject and intent.
|
| 33 |
|
| 34 |
-
Guidelines:
|
| 35 |
-
1. Structure: Keep structured inputs structured (enhance within fields). Convert natural language to detailed paragraphs.
|
| 36 |
-
2. Details: Add concrete visual specifics - form, scale, textures, materials, lighting (quality, direction, color), shadows, spatial relationships, and environmental context.
|
| 37 |
-
3. Text in Images: Put ALL text in quotation marks, matching the prompt's language. Always provide explicit quoted text for objects that would contain text in reality (signs, labels, screens, etc.) - without it, the model generates gibberish.
|
| 38 |
|
| 39 |
-
Output only the revised prompt and nothing else."""
|
| 40 |
|
| 41 |
-
SYSTEM_PROMPT_WITH_IMAGES = """You are FLUX.2 by Black Forest Labs, an image-editing expert. You convert editing requests into one concise instruction (50-80 words, ~30 for brief requests).
|
| 42 |
|
| 43 |
-
Rules:
|
| 44 |
-
- Single instruction only, no commentary
|
| 45 |
-
- Use clear, analytical language (avoid "whimsical," "cascading," etc.)
|
| 46 |
-
- Specify what changes AND what stays the same (face, lighting, composition)
|
| 47 |
-
- Reference actual image elements
|
| 48 |
-
- Turn negatives into positives ("don't change X" → "keep X")
|
| 49 |
-
- Make abstractions concrete ("futuristic" → "glowing cyan neon, metallic panels")
|
| 50 |
-
- Keep content PG-13
|
| 51 |
|
| 52 |
-
Output only the final instruction in plain text and nothing else."""
|
| 53 |
|
| 54 |
def remote_text_encoder(prompts):
|
| 55 |
from gradio_client import Client
|
|
@@ -100,44 +100,44 @@ def image_to_data_uri(img):
|
|
| 100 |
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 101 |
return f"data:image/png;base64,{img_str}"
|
| 102 |
|
| 103 |
-
def upsample_prompt_logic(prompt, image_list):
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
|
| 142 |
# Updated duration function to match generate_image arguments (including progress)
|
| 143 |
def get_duration(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, force_dimensions, progress=gr.Progress(track_tqdm=True)):
|
|
@@ -185,11 +185,11 @@ def infer(prompt, input_images=None, seed=42, randomize_seed=False, width=1024,
|
|
| 185 |
|
| 186 |
# 1. Upsampling (Network bound - No GPU needed)
|
| 187 |
final_prompt = prompt
|
| 188 |
-
if prompt_upsampling:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
|
| 194 |
# 2. Text Encoding (Network bound - No GPU needed)
|
| 195 |
progress(0.1, desc="Encoding prompt...")
|
|
|
|
| 24 |
MAX_SEED = np.iinfo(np.int32).max
|
| 25 |
MAX_IMAGE_SIZE = 1024
|
| 26 |
|
| 27 |
+
# hf_client = InferenceClient(
|
| 28 |
+
# api_key=os.environ.get("HF_TOKEN"),
|
| 29 |
+
# )
|
| 30 |
+
# VLM_MODEL = "baidu/ERNIE-4.5-VL-424B-A47B-Base-PT"
|
| 31 |
|
| 32 |
+
# SYSTEM_PROMPT_TEXT_ONLY = """You are an expert prompt engineer for FLUX.2 by Black Forest Labs. Rewrite user prompts to be more descriptive while strictly preserving their core subject and intent.
|
| 33 |
|
| 34 |
+
# Guidelines:
|
| 35 |
+
# 1. Structure: Keep structured inputs structured (enhance within fields). Convert natural language to detailed paragraphs.
|
| 36 |
+
# 2. Details: Add concrete visual specifics - form, scale, textures, materials, lighting (quality, direction, color), shadows, spatial relationships, and environmental context.
|
| 37 |
+
# 3. Text in Images: Put ALL text in quotation marks, matching the prompt's language. Always provide explicit quoted text for objects that would contain text in reality (signs, labels, screens, etc.) - without it, the model generates gibberish.
|
| 38 |
|
| 39 |
+
# Output only the revised prompt and nothing else."""
|
| 40 |
|
| 41 |
+
# SYSTEM_PROMPT_WITH_IMAGES = """You are FLUX.2 by Black Forest Labs, an image-editing expert. You convert editing requests into one concise instruction (50-80 words, ~30 for brief requests).
|
| 42 |
|
| 43 |
+
# Rules:
|
| 44 |
+
# - Single instruction only, no commentary
|
| 45 |
+
# - Use clear, analytical language (avoid "whimsical," "cascading," etc.)
|
| 46 |
+
# - Specify what changes AND what stays the same (face, lighting, composition)
|
| 47 |
+
# - Reference actual image elements
|
| 48 |
+
# - Turn negatives into positives ("don't change X" → "keep X")
|
| 49 |
+
# - Make abstractions concrete ("futuristic" → "glowing cyan neon, metallic panels")
|
| 50 |
+
# - Keep content PG-13
|
| 51 |
|
| 52 |
+
# Output only the final instruction in plain text and nothing else."""
|
| 53 |
|
| 54 |
def remote_text_encoder(prompts):
|
| 55 |
from gradio_client import Client
|
|
|
|
| 100 |
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 101 |
return f"data:image/png;base64,{img_str}"
|
| 102 |
|
| 103 |
+
# def upsample_prompt_logic(prompt, image_list):
|
| 104 |
+
# try:
|
| 105 |
+
# if image_list and len(image_list) > 0:
|
| 106 |
+
# # Image + Text Editing Mode
|
| 107 |
+
# system_content = SYSTEM_PROMPT_WITH_IMAGES
|
| 108 |
|
| 109 |
+
# # Construct user message with text and images
|
| 110 |
+
# user_content = [{"type": "text", "text": prompt}]
|
| 111 |
|
| 112 |
+
# for img in image_list:
|
| 113 |
+
# data_uri = image_to_data_uri(img)
|
| 114 |
+
# user_content.append({
|
| 115 |
+
# "type": "image_url",
|
| 116 |
+
# "image_url": {"url": data_uri}
|
| 117 |
+
# })
|
| 118 |
|
| 119 |
+
# messages = [
|
| 120 |
+
# {"role": "system", "content": system_content},
|
| 121 |
+
# {"role": "user", "content": user_content}
|
| 122 |
+
# ]
|
| 123 |
+
# else:
|
| 124 |
+
# # Text Only Mode
|
| 125 |
+
# system_content = SYSTEM_PROMPT_TEXT_ONLY
|
| 126 |
+
# messages = [
|
| 127 |
+
# {"role": "system", "content": system_content},
|
| 128 |
+
# {"role": "user", "content": prompt}
|
| 129 |
+
# ]
|
| 130 |
+
|
| 131 |
+
# completion = hf_client.chat.completions.create(
|
| 132 |
+
# model=VLM_MODEL,
|
| 133 |
+
# messages=messages,
|
| 134 |
+
# max_tokens=1024
|
| 135 |
+
# )
|
| 136 |
|
| 137 |
+
# return completion.choices[0].message.content
|
| 138 |
+
# except Exception as e:
|
| 139 |
+
# print(f"Upsampling failed: {e}")
|
| 140 |
+
# return prompt
|
| 141 |
|
| 142 |
# Updated duration function to match generate_image arguments (including progress)
|
| 143 |
def get_duration(prompt_embeds, image_list, width, height, num_inference_steps, guidance_scale, seed, force_dimensions, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
| 185 |
|
| 186 |
# 1. Upsampling (currently disabled - the VLM call below is commented out, so the prompt is passed through unchanged)
|
| 187 |
final_prompt = prompt
|
| 188 |
+
# if prompt_upsampling:
|
| 189 |
+
# progress(0.05, desc="Upsampling prompt...")
|
| 190 |
+
# final_prompt = upsample_prompt_logic(prompt, image_list)
|
| 191 |
+
# print(f"Original Prompt: {prompt}")
|
| 192 |
+
# print(f"Upsampled Prompt: {final_prompt}")
|
| 193 |
|
| 194 |
# 2. Text Encoding (Network bound - No GPU needed)
|
| 195 |
progress(0.1, desc="Encoding prompt...")
|