Spaces:
Build error
Build error
| """ | |
| Helper functions mainly for multilingual text image customization. | |
| Acknowledgement: The code here is heavily borrowed from TextFLUX: https://github.com/yyyyyxie/textflux. | |
| """ | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image, ImageDraw, ImageFont | |
def generate_prompt(words):
    """
    Build the paired-image prompt that names the words to be rendered.

    Every entry of `words` is wrapped in single quotes, the quoted entries are
    joined with ", ", and the resulting list is substituted into both the
    [IMAGE1] and the [IMAGE2] sentence of the prompt.
    """
    quoted = []
    for word in words:
        quoted.append(f"'{word}'")

    template = (
        "The pair of images highlights some white words on a black background, as well as their style on a real-world scene image. "
        "[IMAGE1] is a template image rendering the text, with the words {words}; "
        "[IMAGE2] shows the text content {words} naturally and correspondingly integrated into the image."
    )
    return template.format(words=', '.join(quoted))
# Word-free variant of the prompt built by generate_prompt: the same three
# sentences, but without the quoted word list. run_multilingual_inference
# passes this string as `prompt` and the word-specific prompt as `prompt_2`.
prompt_template2 = (
    "The pair of images highlights some white words on a black background, as well as their style on a real-world scene image. "
    "[IMAGE1] is a template image rendering the text, with the words; "
    "[IMAGE2] shows the text content naturally and correspondingly integrated into the image."
)
def run_multilingual_inference(model, image_input, mask_input, reference_input, texts,
                               num_steps=30, guidance_scale=30, seed=42, num_images=1):
    """
    Run text-customization inference and return the generated images.

    The scene image and mask are converted to RGB and resized so that both
    sides are multiples of 32. `texts` is a newline-separated string; each
    line is stripped and rendered into the mask regions by render_glyph_multi.
    The rendered glyph image is placed to the left of the scene image, and an
    all-black image is placed to the left of the mask, so the model receives a
    side-by-side pair.

    Args:
        model: object exposing generate(...); the first element of each call's
            result is collected.
        image_input: scene image (used through .size/.convert/.resize,
            presumably a PIL image - confirm against caller).
        mask_input: mask image, handled the same way as image_input.
        reference_input: forwarded unchanged as `ref_image`.
        texts: newline-separated text, one line per region.
        num_steps: forwarded as `num_inference_steps`.
        guidance_scale: forwarded as `guidance_scale`.
        seed: base seed; the i-th image uses seed + i.
        num_images: number of generate() calls to make.

    Returns:
        List with one generated result per requested image.
    """
    # Snap both sides down to the nearest multiple of 32.
    src_w, src_h = image_input.size
    target_size = (src_w - src_w % 32, src_h - src_h % 32)

    image_input = image_input.convert("RGB").resize(target_size)
    mask_input = mask_input.convert("RGB").resize(target_size)

    texts = [line.strip() for line in texts.split('\n')]

    # Left half: rendered glyph template; right half: the scene image.
    rendered_text = render_glyph_multi(image_input, mask_input, texts)
    image_pair = np.hstack((np.array(rendered_text), np.array(image_input)))
    combined_image = Image.fromarray(image_pair)

    # Left half of the mask is all black; right half is the supplied mask.
    empty_half = Image.new("RGB", image_input.size, (0, 0, 0))
    mask_pair = np.hstack((np.array(empty_half), np.array(mask_input)))
    combined_mask = Image.fromarray(mask_pair)

    prompt = generate_prompt(texts)
    print("Final prompt:", prompt)

    def _generate_one(offset):
        outputs = model.generate(
            image=combined_image,
            mask_image=combined_mask,
            ref_image=reference_input,
            prompt=prompt_template2,
            prompt_2=prompt,
            scale=1.0,
            guidance_scale=guidance_scale,
            num_inference_steps=num_steps,
            width=combined_image.width,
            height=combined_image.height,
            seed=seed + offset,
        )
        return outputs[0]

    return [_generate_one(index) for index in range(num_images)]
def insert_spaces(text, num_spaces):
    """
    Return `text` with `num_spaces` spaces placed between adjacent characters.

    Text with zero or one character is returned unchanged. Used to widen the
    spacing of a string when it is rendered.
    """
    if len(text) > 1:
        gap = ' ' * num_spaces
        return gap.join(text)
    return text
def draw_glyph2(
    font,
    text,
    polygon,
    vertAng=10,
    scale=1,
    width=512,
    height=512,
    add_space=True,
    scale_factor=2,
    rotate_resample=Image.BICUBIC,
    downsample_resample=Image.Resampling.LANCZOS
):
    """
    Render `text` inside the region described by `polygon` and return it as an RGBA array.

    The polygon is multiplied by `scale_factor * scale` and fitted with
    cv2.minAreaRect. The fitted rectangle's side lengths and angle decide:
    - the font size (and, for short horizontal text, how many spaces are
      inserted between characters to fill the rectangle);
    - whether the characters are stacked vertically;
    - the rotation applied to the rendered text layer.
    Drawing happens on a (width * scale_factor, height * scale_factor) canvas
    that is resized to (width, height) at the end.

    Args:
        font: base font; must provide font_variant() and be usable with
            ImageDraw.textbbox (presumably a PIL FreeTypeFont - confirm
            against caller).
        text: string to render.
        polygon: numpy array of region points (presumably shape (N, 2) - the
            caller reshapes a contour with reshape(-1, 2)).
        vertAng: tolerance in degrees around a multiple of 90 within which the
            region is considered axis-aligned and eligible for vertical layout.
        scale: extra multiplier applied to the polygon coordinates.
        width: output width in pixels.
        height: output height in pixels.
        add_space: allow inserting spaces between characters to stretch short
            horizontal text across the region.
        scale_factor: supersampling factor for the working canvas.
        rotate_resample: resampling filter passed to Image.rotate.
        downsample_resample: resampling filter passed to the final resize.

    Returns:
        numpy array of the final RGBA image resized to (width, height).
    """
    big_w = width * scale_factor
    big_h = height * scale_factor

    # Upscale polygon coordinates
    big_polygon = polygon * scale_factor * scale
    rect = cv2.minAreaRect(big_polygon.astype(np.float32))
    box = cv2.boxPoints(rect)
    box = np.intp(box)

    # Normalise the rectangle angle so the text runs along the longer side.
    w, h = rect[1]
    angle = rect[2]
    if angle < -45:
        angle += 90
    angle = -angle
    if w < h:
        angle += 90

    # Nearly axis-aligned boxes that are taller than wide are drawn as
    # vertically stacked characters with no rotation.
    vert = False
    if (abs(angle) % 90 < vertAng or abs(90 - abs(angle) % 90) % 90 < vertAng):
        _w = max(box[:, 0]) - min(box[:, 0])
        _h = max(box[:, 1]) - min(box[:, 1])
        if _h >= _w:
            vert = True
            angle = 0

    # Create large image and temporary white background image
    big_img = Image.new("RGBA", (big_w, big_h), (0, 0, 0, 0))
    tmp = Image.new("RGB", big_img.size, "white")
    tmp_draw = ImageDraw.Draw(tmp)

    # Estimated text width if the text height were scaled to the short side.
    _, _, _tw, _th = tmp_draw.textbbox((0, 0), text, font=font)
    if _th == 0:
        text_w = 0
    else:
        w_f, h_f = float(w), float(h)
        text_w = min(w_f, h_f) * (_tw / _th)

    if text_w <= max(w, h):
        # Text is shorter than the region: widen it with inter-character
        # spaces, using the largest count that still fits.
        if len(text) > 1 and not vert and add_space:
            for i in range(1, 100):
                text_sp = insert_spaces(text, i)
                _, _, tw2, th2 = tmp_draw.textbbox((0, 0), text_sp, font=font)
                if th2 != 0:
                    if min(w, h) * (tw2 / th2) > max(w, h):
                        break
            text = insert_spaces(text, i - 1)
        font_size = min(w, h) * 0.80
    else:
        # Text is longer than the region: shrink the font proportionally.
        shrink = 0.75 if vert else 0.85
        if text_w != 0:
            font_size = min(w, h) / (text_w / max(w, h)) * shrink
        else:
            font_size = min(w, h) * 0.80

    # A very thin region yields a font size that truncates to 0, which the
    # font backend rejects; clamp to the smallest valid size instead.
    new_font = font.font_variant(size=max(1, int(font_size)))
    left, top, right, bottom = new_font.getbbox(text)
    text_width = right - left
    text_height = bottom - top

    # Create transparent text rendering layer
    layer = Image.new("RGBA", big_img.size, (0, 0, 0, 0))
    draw_layer = ImageDraw.Draw(layer)
    cx, cy = rect[0]
    if not vert:
        # Centre the text on the rectangle centre.
        draw_layer.text(
            (cx - text_width // 2, cy - text_height // 2 - top),
            text,
            font=new_font,
            fill=(255, 255, 255, 255)
        )
    else:
        # Stack characters downward from the top of the box, horizontally
        # centred within it.
        _w_ = max(box[:, 0]) - min(box[:, 0])
        x_s = min(box[:, 0]) + _w_ // 2 - text_height // 2
        y_s = min(box[:, 1])
        for c in text:
            draw_layer.text((x_s, y_s), c, font=new_font, fill=(255, 255, 255, 255))
            _, _t, _, _b = new_font.getbbox(c)
            y_s += _b

    rotated_layer = layer.rotate(
        angle,
        expand=True,
        center=(cx, cy),
        resample=rotate_resample
    )

    # expand=True may enlarge the layer; paste it centred on the canvas.
    xo = int((big_img.width - rotated_layer.width) // 2)
    yo = int((big_img.height - rotated_layer.height) // 2)
    big_img.paste(rotated_layer, (xo, yo), rotated_layer)

    final_img = big_img.resize((width, height), downsample_resample)
    final_np = np.array(final_img)
    return final_np
def render_glyph_multi(original, computed_mask, texts):
    """
    Render one text string per mask region and return the result as an RGB image.

    For each independent region in computed_mask:
    - Extract external contours from the grayscale mask, discard those whose
      bounding box covers fewer than 50 pixels, and sort the rest by
      bounding-box position from top to bottom, then left to right;
    - Call draw_glyph2 to render the i-th text in the i-th region; blank
      texts leave their region empty, and regions beyond the end of `texts`
      are ignored;
    - Alpha-composite every rendering onto a transparent image the size of
      `original`, then convert it to RGB.

    Args:
        original: image whose .size defines the output size.
        computed_mask: mask image; converted to mode "L" before contour
            extraction.
        texts: sequence of strings, matched to the sorted regions by index.

    Returns:
        RGB image containing the rendered text.
    """
    mask_np = np.array(computed_mask.convert("L"))
    contours, _ = cv2.findContours(mask_np, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Skip tiny regions.
        if w * h < 50:
            continue
        regions.append((x, y, w, h, cnt))
    regions = sorted(regions, key=lambda r: (r[1], r[0]))

    render_img = Image.new("RGBA", original.size, (0, 0, 0, 0))

    # Fall back to Pillow's built-in font when the bundled TTF cannot be
    # loaded. Only font-loading failures are caught, so unrelated exceptions
    # (e.g. KeyboardInterrupt) still propagate.
    try:
        base_font = ImageFont.truetype("resources/Arial-Unicode-Regular.ttf", 40)
    except (OSError, ImportError):
        base_font = ImageFont.load_default()

    for i, region in enumerate(regions):
        if i >= len(texts):
            break
        text = texts[i].strip()
        if not text:
            continue
        cnt = region[4]
        polygon = cnt.reshape(-1, 2)
        rendered_np = draw_glyph2(
            font=base_font,
            text=text,
            polygon=polygon,
            vertAng=10,
            scale=1,
            width=original.size[0],
            height=original.size[1],
            add_space=True,
            scale_factor=1,
            rotate_resample=Image.BICUBIC,
            downsample_resample=Image.Resampling.LANCZOS
        )
        rendered_img = Image.fromarray(rendered_np, mode="RGBA")
        render_img = Image.alpha_composite(render_img, rendered_img)

    return render_img.convert("RGB")