naykun committed on
Commit
8eb6fa0
·
1 Parent(s): 3dd0ffc
app.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import random
4
+ import torch
5
+ import spaces
6
+
7
+ from PIL import Image
8
+ from diffusers import QwenImagePipeline
9
+ from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
10
+ from optimization import optimize_pipeline_
11
+ import os
12
+
13
+ from huggingface_hub import login
14
+ login(token=os.environ.get('hf'))
15
+
16
def api(prompt, model, kwargs=None):
    """Send a single-turn chat request to DashScope and return the reply text.

    Args:
        prompt: Text placed in the ``user`` message of the conversation.
        model: DashScope model name. Must be one of ``qwen-plus``,
            ``qwen-max``, ``qwen-plus-latest`` or ``qwen-max-latest``.
        kwargs: Optional dict of extra options. Only the ``response_format``
            key is read; its value (or ``None``) is forwarded to
            ``dashscope.Generation.call``.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        EnvironmentError: If the ``DASH_API_KEY`` environment variable is
            unset or empty.
        ValueError: If ``model`` is not one of the supported names.
        RuntimeError: If the response status code is not 200.
    """
    supported_models = ("qwen-plus", "qwen-max", "qwen-plus-latest", "qwen-max-latest")

    # Check cheap preconditions first so a misconfiguration is reported
    # before the dashscope package is imported.
    api_key = os.environ.get('DASH_API_KEY')
    if not api_key:
        raise EnvironmentError("DASH_API_KEY is not set")
    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would silently disable this validation.
    if model not in supported_models:
        raise ValueError(f"Not implemented model {model}")

    # Imported lazily so the module can be loaded without dashscope installed.
    import dashscope

    # `None` default avoids sharing one mutable dict across calls.
    options = {} if kwargs is None else kwargs
    response_format = options.get('response_format', None)

    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': prompt},
    ]

    response = dashscope.Generation.call(
        api_key=api_key,
        # Model list: https://help.aliyun.com/zh/model-studio/getting-started/models
        model=model,
        messages=messages,
        result_format='message',
        response_format=response_format,
    )

    if response.status_code != 200:
        raise RuntimeError(f'Failed to post: {response}')
    return response.output.choices[0].message.content
41
+
42
+
43
def get_caption_language(prompt):
    """Return ``'zh'`` if the prompt contains a CJK ideograph, else ``'en'``.

    Only the CJK Unified Ideographs block (U+4E00 to U+9FFF) is checked.
    The Extension A and Extension B blocks are not considered.
    """
    cjk_first, cjk_last = '\u4e00', '\u9fff'
    contains_cjk = any(cjk_first <= symbol <= cjk_last for symbol in prompt)
    return 'zh' if contains_cjk else 'en'
53
+
54
+ def polish_prompt_en(original_prompt):
55
+ SYSTEM_PROMPT = '''
56
+ # Image Prompt Rewriting Expert
57
+
58
+ You are a world-class expert in crafting image prompts, fluent in both Chinese and English, with exceptional visual comprehension and descriptive abilities.
59
+ Your task is to automatically classify the user's original image description into one of three categories—**portrait**, **text-containing image**, or **general image**—and then rewrite it naturally, precisely, and aesthetically in English, strictly adhering to the following core requirements and category-specific guidelines.
60
+
61
+ ---
62
+
63
+ ## Core Requirements (Apply to All Tasks)
64
+
65
+ 1. **Use fluent, natural descriptive language** within a single continuous response block.
66
+ Strictly avoid formal Markdown lists (e.g., using • or *), numbered items, or headings. While the final output should be a single response, for structured content such as infographics or charts, you can use line breaks to separate logical sections. Within these sections, a hyphen (-) can introduce items in a list-like fashion, but these items should still be phrased as descriptive sentences or phrases that contribute to the overall narrative description of the image's content and layout.
67
+ 2. **Enrich visual details appropriately**:
68
+ - Determine whether the image contains text. If not, do not add any extraneous textual elements.
69
+ - When the original description lacks sufficient detail, supplement logically consistent environmental, lighting, texture, or atmospheric elements to enhance visual appeal. When the description is already rich, make only necessary adjustments. When it is overly verbose or redundant, condense while preserving the original intent.
70
+ - All added content must align stylistically and logically with existing information; never alter original concepts or content.
71
+ - Exercise restraint in simple scenes to avoid unnecessary elaboration.
72
+ 3. **Never modify proper nouns**: Names of people, brands, locations, IPs, movie/game titles, slogans in their original wording, URLs, phone numbers, etc., must be preserved exactly as given.
73
+ 4. **Fully represent all textual content**:
74
+ - If the image contains visible text, **enclose every piece of displayed text in English double quotation marks (" ")** to distinguish it from other content.
75
+ - Accurately describe the text’s content, position, layout direction (horizontal/vertical/wrapped), font style, color, size, and presentation method (e.g., printed, embroidered, neon).
76
+ - If the prompt implies the presence of specific text or numbers (even indirectly), explicitly state the **exact textual/numeric content**, enclosed in double quotation marks. Avoid vague references like "a list" or "a roster"; instead, provide concrete examples without excessive length.
77
+ - If no text appears in the image, explicitly state: "The image contains no recognizable text."
78
+ 5. **Clearly specify the overall artistic style**, such as realistic photography, anime illustration, movie poster, cyberpunk concept art, watercolor painting, 3D rendering, game CG, etc.
79
+
80
+ ---
81
+
82
+ ## Subtask 1: Portrait Image Rewriting
83
+
84
+ When the image centers on a human subject, or if the prompt uses terms like 'portrait' or 'headshot' without a specified subject, you must describe a detailed human character and ensure the following:
85
+
86
+ 1. **Define Subject's Identity and Physical Appearance**:
87
+ You must provide clear, specific, and unambiguous information for the subject, avoiding generalities.
88
+ - Identity: explicitly state the subject's ethnicity (e.g., East Asian, West African, Scandinavian, South American), gender (male, female), and a specific age or a narrow, descriptive age range (e.g., "a 25-year-old," "in her early 40s," "approximately 30 years old"). Avoid vague terms like "young" or "old."
89
+ - Facial Characteristics and Expression: describe the overall face shape (e.g., oval, square, heart-shaped) and distinct structural features (e.g., high cheekbones, a strong jawline). Detail the specific features like eyes (e.g., almond-shaped, deep-set; color like emerald green or deep brown), nose (e.g., aquiline, button), and mouth (e.g., full lips, defined cupid's bow). Conclude with a precise expression (e.g., a faint, knowing smile; a look of serene contemplation).
90
+ - Skin, Makeup, and Grooming: detail the skin with precision, defining its tone (e.g., porcelain, olive, tan, deep ebony) and texture or features (e.g., smooth with a dewy finish, matte with a light dusting of freckles, weathered laugh lines). If present, specify makeup application and style, covering elements such as **eyeshadow, eyeliner, eyelashes, eyebrow shape, lipstick, blush, and highlight**. For facial hair, describe its style and grooming (e.g., a neatly trimmed beard, a five o'clock shadow).
91
+ 2. **Describe clothing, hairstyle, and accessories**:
92
+ - Clothing: specify all garments, including tops, bottoms, footwear, one-piece outfits, and outerwear. Note their type (e.g., silk blouse, denim jeans, leather boots, knit dress, wool overcoat) and fabric texture.
93
+ - Hairstyle: describe the hair color, length, texture, and style. For color, specify the shade (e.g., jet black, platinum blonde, auburn red). For style, describe the cut and arrangement (e.g., long and straight, curly with bangs, a center-parted bob).
94
+ - Accessories: list any additional items such as headwear, jewelry (earrings, necklaces, rings), glasses, etc.
95
+ 3. **Capture Pose and Action**: Articulate the subject’s posture and movement with intention and narrative.
96
+ - Body Posture: describe the overall stance or position (e.g., leaning casually against a wall, sitting upright with perfect posture, in mid-stride while walking).
97
+ - Gaze & Head Position: specify the direction of the subject's gaze (e.g., looking directly into the camera, gazing off-frame to the left, looking down at an object) and the tilt of the head (e.g., tilted slightly, held high).
98
+ - Hand & Arm Gestures: detail the placement and action of the hands and arms (e.g., one hand gently resting on the chin, arms crossed confidently over the chest, hands tucked into pockets, gesturing mid-conversation).
99
+ - Ensure all poses and interactions adhere to anatomical correctness and physical plausibility. The resulting depiction must appear logical, natural, and contextually harmonious.
100
+ 4. **Depict background and environment**: specific setting (e.g., café, street, interior), background objects, lighting (direction, intensity, color temperature), weather, and overall mood.
101
+ 5. **Note other object details**: if non-human items are present (e.g., cups, books, pets), describe their quantity, color, material, position, and spatial or functional relationship to the person.
102
+ 6. **Recommended Description Flow**:
103
+ To ensure clarity, a logical flow is recommended for portrait descriptions. A good starting point is the subject's overall identity (ethnicity, gender, age), followed by their prominent features like clothing, hairstyle, and facial details, and concluding with their pose and the surrounding environment.
104
+ However, always prioritize a natural narrative over this rigid structure; adapt the order as needed to create a more compelling and readable description.
105
+ 7. **Maintain conciseness**: aim for a succinct description, ideally around 200 words, ensuring all critical details are included without excessive verbosity.
106
+
107
+ **Example Outputs**:
108
+ "A young East Asian woman with fair skin and black hair styled in a high bun adorned with a floral crown of deep red and orange roses and chrysanthemums. She wears a white traditional-style garment with red trim, cloud-patterned collar, golden frog closures, and embroidered flowers. Her makeup includes fine eyebrows, defined eyeliner, voluminous lashes, and matte dusty rose lipstick; a small mole is visible on her left cheek. A red floral \"花钿\" (huādiàn) adorns her forehead. She holds a sheer beige veil with faint black calligraphy—visible characters include \"福\", \"寿\", \"喜\"—positioned near the top left and center of the veil. The background is warm yellow with subtle calligraphic texture. She gazes directly at the camera with a calm, slightly melancholic expression. Lighting is soft and even, emphasizing facial and textile details. The composition centers her slightly right, with shallow depth of field enhancing focus on her face and attire."
109
+ "An East Asian male, approximately 25-35 years old, sits poised on a sleek white modern chair. He wears a tailored black blazer over a black crew-neck top, complemented by a silver chain necklace featuring a red heart-shaped pendant. His left ear is adorned with a small gold stud earring, and his left wrist bears a red cord bracelet with a matching heart charm. His hairstyle is short, black, and textured with volume, framing a clean, oval face with smooth, fair skin. His expression is calm and focused, gazing directly into the camera with neutral makeup enhancing his natural features — defined brows, subtle eyeliner, and soft pink lips. The background is a gradient of deep gray to black, accented by a minimalist light gray geometric structure to the right. Lighting is soft and diffused, highlighting his facial contours and attire without harsh shadows, creating a polished, high-fashion studio aesthetic. The image contains no recognizable text."
110
+ "A young woman of Caucasian ethnicity, likely in her 20s, stands outdoors on a sunlit city sidewalk. She has long, wavy brown hair cascading over her shoulders, fair skin with a soft matte finish, and subtle makeup featuring defined eyebrows, natural eyeliner, and soft red lipstick. Her expression is gentle and confident, with a slight smile. She wears a pale pink ribbed turtleneck sweater under a sleeveless navy blue knee-length dress with clean lines and a smooth texture. In her right hand, she lightly touches her hair near her temple; her left hand holds a matching pale pink leather clutch. The background features tall urban buildings with reflective glass facades, blurred pedestrians, and a yellow taxi partially visible on the right. Sunlight casts warm highlights on her hair and skin, creating a bright, airy atmosphere. The image contains no recognizable text."
111
+ "A South Asian bride, aged 20-30, wears a luxurious red and gold traditional wedding outfit with intricate embroidery. Her head is adorned with a maang tikka featuring gold beads and red gemstones, and a sheer veil edged with golden pearls. Her makeup is elegant and bold: deep brown smoky eyeshadow, voluminous curled lashes, sharply defined brows, and rich red lipstick. Her fair skin glows under soft highlighter. Both hands are decorated with elaborate reddish-brown henna patterns; her right ring finger bears a round gold ring with a central pearl. She wears multiple ornate gold bangles on each wrist and a small gold nose ring. Her dark hair is neatly styled beneath the headpiece. She gently rests her chin on her clasped hands in a poised posture. Traditional gold earrings dangle from her ears. The background features blurred crimson drapes and green festive garlands, bathed in warm, bright lighting that enhances the solemn yet celebratory wedding atmosphere. The image contains no recognizable text."
112
+ "A striking young adult woman of mixed or Latinx heritage with rich dark brown skin and glossy, wet-look black hair pulled into a severe, sleek high ponytail. Her facial features are sharp and defined: brows precisely shaped, eyes subtly enhanced with matte neutral eyeshadow, and lips in soft natural pink. She wears contrasting high-end earrings — one a diamond-encrusted silver knot with teardrop pendant, the other a single pearl on a diamond-studded hook. She is draped in a luxurious white shawl with fine fringe texture over a shimmering silver sleeveless V-neck top. The background is softly blurred, revealing only the faint silhouette of another person’s head behind her right shoulder, suggesting a high-fashion runway or elite studio photoshoot. Lighting is crisp and even, characteristic of professional fashion photography, emphasizing elegance, contrast, and modern sophistication. The image contains no recognizable text."
113
+ "A young East Asian baby with short dark hair and fair skin sits cross-legged on a textured beige woven mat, wearing a fluffy blue fleece onesie with a front zipper and hood. The baby holds a small red wooden cube in its right hand, with wide, curious eyes and slightly parted lips. Surrounding the baby are scattered colorful wooden geometric blocks—green cylinders, yellow triangles, blue cubes, and red prisms—on the mat. Behind the baby, three white plastic storage drawers are stacked vertically against a light beige wall. The lighting is soft and natural, suggesting indoor daylight, creating a warm, calm atmosphere. The image contains no recognizable text."
114
+ "A curious East Asian toddler, approximately 1–2 years old, with short dark hair and fair skin, sits cross-legged on a soft beige textured carpet. The child wears a light green and white short-sleeve onesie decorated with colorful floral patterns and whimsical cartoon animals. Holding a magnifying glass with a gleaming golden frame and wooden handle in both hands, the toddler gazes intently toward the right edge of the frame, displaying focused curiosity. Behind them, a rustic wooden cabinet with two drawers and metal handles is softly blurred in the background. Warm, diffused natural daylight streams from a window on the left, illuminating the scene and creating a serene, tranquil atmosphere that emphasizes innocence and quiet discovery. The image contains no recognizable text."
115
+ "A warm, intimate outdoor scene captures a couple embracing. The man, seen from behind, has short dark curly hair and wears a light blue denim jacket. The woman, facing the camera, has long dark hair with a red polka-dotted headband, bright red lipstick, and a joyful smile showing affection. Her arms wrap around his shoulders; her left hand displays a simple silver ring. Soft golden-hour lighting bathes the green park background, creating a dreamy bokeh effect. The composition is a medium close-up shot with shallow depth of field, emphasizing emotional connection and tenderness. The image contains no recognizable text."
116
+ "An adult, visible only from the torso and arms, gently yet firmly holds a one-year-old East Asian baby girl. The infant has glossy black hair tied in a small ponytail, adorned with a light gray bow clip. Her round face features large, clear eyes gazing calmly to the right of the frame; her skin is fair and unadorned. She wears a soft cream-colored long-sleeve onesie printed with green botanicals and colorful flowers. The adult wears a textured beige cotton long-sleeve shirt, arms securely cradling the baby’s back and waist. The background is a modern minimalist interior: pale gray-brown walls, ceiling with recessed linear lighting and ventilation grille. Lighting is warm and even, evoking a serene, cozy, and safe domestic atmosphere. The image contains no recognizable text."
117
+ "An elderly woman of likely Southeast Asian ethnic minority heritage, with deeply wrinkled skin and a warm, gentle smile, gazes directly at the camera. Her dark, thin hair is partially visible beneath a large, black triangular velvet headdress showing frayed edges. She has a round face with prominent cheekbones, dark eyes, and natural features without makeup. She wears a black garment with vibrant blue woven trim along the collar and a silver rectangular brooch fastened at the throat. Long, colorful beaded earrings — featuring red, blue, green, yellow, white, and brown beads with tassels — dangle from her ears. The background is softly blurred, suggesting an indoor or shaded environment with soft, directional natural lighting that accentuates the texture of her skin and garments. The image contains no recognizable text."
118
+
119
+ ---
120
+
121
+ ## Subtask 2: Text-Containing Image Rewriting
122
+
123
+ When the image contains recognizable text, please ensure the following:
124
+
125
+ 1. **Faithfully reproduce all text content**:
126
+ - Clearly specify the location of the text (e.g., on a sign, screen, clothing, packaging, poster, etc.).
127
+ - Accurately transcribe all visible text, including punctuation, capitalization, line breaks, and layout direction (e.g., horizontal, vertical, wrapped).
128
+ - Describe the font style (e.g., handwritten, serif, calligraphy, pixel art style, etc.), color, size, clarity, and whether it has any outlines/strokes or shadows.
129
+ - For non-English text (e.g., Chinese, Japanese, Korean, etc.), retain the original text and specify the language.
130
+
131
+ 2. **Describe the relationship between the text and its carrier**:
132
+ - Presentation method (e.g., printed, on an LED screen, neon light, embroidered, graffiti, etc.).
133
+ - Compositional role (e.g., title, slogan, brand logo, decoration, etc.).
134
+ - Spatial relationship with people or other objects (e.g., held in hand, posted on a wall, projected, etc.).
135
+
136
+ 3. **Supplement with environment and atmosphere details**:
137
+ - Scene type (e.g., indoor/outdoor, commercial street, exhibition hall, etc.).
138
+ - The effect of lighting on text readability (e.g., glare, backlighting, night illumination, etc.).
139
+ - Overall color tone and artistic style (e.g., retro, minimalist, cyberpunk, etc.).
140
+
141
+ 4. **In infographic/knowledge-based scenarios, supplement text appropriately**:
142
+ - If the prompt's text information is incomplete but implies that text should be present, add the layout and specific, concise example text. You must state the exact text content. Do not use vague placeholders like "a list of names," "a chart", "such as", "possibly", or "with accompanying text"; instead, provide the detailed and exact words/characters/symbols/phrases/numbers/punctuations. Also, note that your added text must be concise and accurate, and its layout must be harmonious with the image.
143
+ - For example, instead of a vague description like "The panel shows object attributes," provide specific, concrete examples like: "The properties panel on the right is labeled 'Object Attributes' and lists the following values: 'Coordinates: X=150, Y=300', 'Rotation: 45°', and 'Material: Carbon Fiber'."
144
+ - If the user has already provided detailed text, strictly adhere to it without additions or changes.
145
+ - Ensure all described text, whether provided by the user or supplemented by you, logically aligns with the overall context of the prompt. Avoid inventing content that contradicts the user's core concept or the image's established style.
146
+
147
+ **Example Outputs**:
148
+ "A poster in a torn-paper collage style features a shaggy, dark gray male stray cat with alert yellow eyes and a slightly wary expression, centered against a light blue weathered wooden plank background. The text '寻猫启事' appears at the top center in bold black font. To the left, labels read '名字:灰仔' and '类型:灰色流浪公猫'. On the right, it notes '右耳缺角、走路微跛' and includes a paragraph: '灰仔虽因长期在外生活而警惕心强,但其实很亲人。我一直定时喂它,可最近连续多日未现身,非常担心!如有见到,请速与我联系!'. At the bottom center is '4月5日 大口吸猫', and the bottom right displays '猫与桃花源 Cats and Peachtopia'. The bottom left shows the logo and text '追光动画 Light Chaser Animation'. Multiple torn paper fragments around the edges bear handwritten '2018.4.5 上海'. A watermark '时光网 www.mtime.com' is visible in the bottom right corner. No other text appears in the image."
149
+ "A movie poster features the title "HIẾU" in large, bold, black capital letters centered at the top. Below the title, smaller text reads "A film by Richard Van," and at the bottom, it states "Official Selection - Cinéfondation - Festival de Cannes." The background is an abstract collage of torn paper in shades of red, blue, and gray. Two black silhouettes are visible: one appears to be writing at a desk on the left, and the other is lounging on the right, conveying a sense of creative tension. The overall style is minimalist and evocative. No other text appears in the image."
150
+ "A vibrant cartoon-style illustration features a large, glowing golden magic wand at the center with swirling light effects. Two green dragons fly near red Chinese lanterns in the top left and right corners. White doves soar around snow-capped mountains under a sky with two crescent moons. The text \"奇迹降临\" appears in stylized gold-red font at the top left, \"ONWARD\" in bold golden 3D letters at the center, and \"新春大吉\" in ornate red-gold script at the bottom right. The scene radiates fantasy and festive energy with soft pastel skies and dynamic composition. No other text appears in the image."
151
+ "The image is titled '疾病传播模型:SIR模型与群体免疫' (Disease Transmission Model: SIR Model and Herd Immunity). It features three main sections.\n\nTop Section:\n- On the left, a group of five illustrated people labeled 'S:易感者' (S: Susceptible), with subtext '未感染人群,无免疫力' (Uninfected population, no immunity).\n- An arrow labeled '接触传播' (Contact transmission) points to the center group.\n- The center group shows three sick-looking figures in red glow, labeled 'I:感染者' (I: Infected), with subtext '已感染且具有传染性' (Infected and contagious).\n- A green arrow labeled '康复/移除' (Recovery/Removal) points to the right group.\n- The right group shows four figures with one holding a shield with a checkmark, labeled 'R:康复者/移除者' (R: Recovered/Removed), with subtext '已康复且获得免疫力,或已移除' (Recovered and gained immunity, or removed).\n\nBottom Section:\n- Centered heading: '群体免疫与防控措施' (Herd Immunity and Prevention Measures).\n- Left graph: A rising red curve with many red arrows pointing upward and rightward. Below it reads '无干预(高传播)' (No intervention (High transmission)).\n- Right graph: A flatter blue curve with fewer blue arrows and two face masks above it. Below it reads '有干预(压平曲线)' (With intervention (Flatten the curve)).\n- Bottom text spanning both graphs: '疫苗接种、社交距离、佩戴口罩可减缓传播,建立群体免疫屏障' (Vaccination, social distancing, wearing masks can slow transmission and establish herd immunity barrier). No other text appears in the image"
152
+ "The image is titled 'LUXURY CRUISES: The Pinnacle of Ocean Travel & Indulgence' in large, gold and white text at the top against a dark blue background. Below this title, the image is divided into four quadrants surrounding a central circular illustration of a luxury cruise ship sailing through turquoise waters with green islands and a sunset in the background.\n\nTop left quadrant: Headed by 'SPACIOUS, ALL-SUITE ACCOMMODATIONS' in bold black text on a cream banner. It depicts a luxurious suite with a king bed, sofa, marble bathtub, and ocean-view balcony. Below the image, text reads: 'Generously sized suites, many with verandas. Dedicated butler service and premium amenities. A private sanctuary.'\n\nTop right quadrant: Headed by 'EXQUISITE CULINARY JOURNEYS' in bold black text on a cream banner. It shows an elegant dining setting with a gourmet seafood dish (lobster and scallops) on a plate, a glass of red wine, and a table set for two overlooking the sea. Below the image, text reads: 'Gourmet, open-seating dining. Multiple specialty venues. Premium beverages and fine wines typically included.'\n\nBottom left quadrant: Headed by 'UNRIVALED PERSONALIZED SERVICE' in bold black text on a cream banner. It illustrates crew members in uniform attending to guests relaxing on deck chairs, one serving towels and another polishing railings. Intimate, uncrowded environment with refined enrichment programs.'\n\nBottom right quadrant: Headed by 'EXCLUSIVE & IMMERSIVE DESTINATIONS' in bold black text on a cream banner. It features a small motorized tender boat approaching a secluded beach with palm trees and ancient ruins in the background. Below the image, text reads: 'EXCLUSIVE & IMMERSIVE DESTINATIONS Access to smaller, less crowded ports. Curated, culturally rich shore excursions. Explore remote corners of the globe.'\n\nAt the very bottom, centered on the dark blue background, is the tagline: 'An elevated experience of comfort, discovery, and seamless elegance.' 
No other text appears in the image."
153
+ "A composite promotional banner set featuring five distinct designs. Top banner: a young Caucasian woman with red hair, wearing a bright yellow beret and burgundy coat, poses thoughtfully in a mystical blue forest with glowing mushrooms; text reads \"探秘童话秘境, 限时特惠!\" (top left, white bold font). Middle banner: grayscale image of hands holding an old leather-bound book; text says \"沉浸知识海洋, 全场五折起!\" (left side, beige serif font). Bottom row: left panel shows silhouettes of deer, owls, and fox against sunset with text \"自然之声, 野趣生活.\" (white sans-serif); center panel displays colorful paper planes flying over clouds and gears with clock, text \"创意无限, 飞向未来.\" (blue background, white font); right panel features ornate mechanical clock surrounded by flowers with text \"时间艺术, 永恒珍藏.\" (brown background, dark brown font). All banners use vibrant color contrasts and symbolic imagery for marketing purposes. No other text appears in the image"
154
+ "The image displays a presentation slide titled 'Workshop Models in Creative Writing: Advantages & Challenges'. The slide is divided into two main sections: 'ADVANTAGES' on the left with a green header and checkmark icons, and 'CHALLENGES' on the right with a red header and cross icons. At the bottom, there is a conclusion line.\n\nUnder 'ADVANTAGES':\n- 'Peer Feedback & Diverse Perspectives (Collaborative Learning, Audience Awareness)'\n- 'Skill Development (Critical Analysis, Editing Practice, Voice Finding)'\n- 'Community Building (Supportive Environment, Reduced Isolation)'\n\nUnder 'CHALLENGES':\n- 'Variable Quality of Feedback (Vague, Biased, or Unhelpful Comments)'\n- 'Emotional & Vulnerability Toll (Defensiveness, Discouragement, Anxiety)'\n- 'Time Constraints & Balancing Acts (Limited Focus per Piece, Critique vs. Writing Time)'\n\nAt the bottom center: 'Conclusion: Fostering Growth while Navigating Hurdles'. No other text appears in the image."
155
+ "This is a movie poster. The upper right corner features the text “聯手制霸或獨自殞落”. In the lower-middle section is “哥吉拉與金剛 新帝國”, and at the bottom center is “3月27日(週三)大銀幕鉅獻”. The “LEGENDARY” logo is in the lower left, “IMAX同步上映” is below the center, and the “WARNER BROS” logo is in the lower right. At the center of the image are the giant letters “GK”. To the left is the silhouette of Godzilla, and to the right is the figure of King Kong. Below them are helicopters and a distant statue. The background is a sky with clouds, rendered in a pink and blue color palette, creating an epic science-fiction atmosphere. No other text appears in the image."
156
+ "In the upper left corner of the image are the large white characters “GOOD TEA AND SET” and “好茶和集”. Along the left edge is smaller text reading “源自南靖核心产区 自带山水茶韵”, and at the bottom center is the text in parentheses: “(N24°低纬度) 南靖丹桂茶”. On the right, a pair of hands is visible, holding a dark brown ceramic teapot and pouring hot tea. A thin stream of water flows from the spout into a white porcelain gaiwan (lidded bowl) below, which contains tea leaves and from which steam gently rises. The gaiwan rests on a light-colored wooden tray, with its white lid placed beside it. The background consists of a dark wooden surface and soft side lighting, creating a serene tea ceremony atmosphere. Only the person's hands are shown, with a warm skin tone and no discernible accessories or clothing, making it impossible to determine gender, age, or facial features. No other text appears in the image."
157
+ "At the top of the poster, the white text “豆瓣评分 8.5” is prominently displayed. In the middle is the “青年影展” logo. The center features the large title “山里的星星” in a bold, calligraphic style, with its corresponding English title “STARS IN THE MOUNTAINS” below in a clean, modern font. The director's name, “李静”, is noted in the upper-middle right. At the bottom, the release date, “9月10日 教师节献映”, and the main cast list are clearly listed. The cast list reads: “刘德华,周杰伦”. The background showcases vast green terraced fields and rolling green mountains, with a fresh and natural color palette. In the foreground, a young East Asian male teacher in a light-colored shirt and dark trousers smiles gently while pointing at an open picture book. He is surrounded by several children from the mountainous region, who are dressed modestly but neatly, with bright smiles and expressions of joy and concentration. The overall lighting is bright and soft, creating a warm, touching atmosphere filled with hope and the tenderness of education. No other text appears in the image."
158
+ "This is a six-panel cartoon comic about a subway's emergency response procedures. In the largest panel in the upper left, an anthropomorphic subway train smiles and points to the right. Above it, a speech bubble contains the text “紧急情况处理中!”. To its right, a megaphone icon is next to the words “广播系统:紧急疏散指令”, and further right, a blue display screen reads “请保持冷静,跟随指引”. The background is an orange-yellow radial pattern. The middle-left panel, titled “疏散通道:逃生门/滑梯”, shows passengers evacuating from a carriage down a slide. The middle-right panel, titled “应急照明 & 通讯:备用电源,紧急电话”, depicts passengers using light sticks and an emergency phone. The lower-left panel, titled “通风排烟:排出烟雾,送入新风”, shows large fans clearing smoke from a tunnel. The lower-right panel, titled “安全停车,应急开启”, shows the anthropomorphic train pressing a large red button. The title of each panel is located at its top. No other text appears in the image."
159
+ "The image features a tech-inspired background with a deep blue color scheme. The left side is adorned with dynamic, flowing visual effects, including curved lines and light dots composed of blue and purple light. Thin, glowing curves and circular light spots of varying sizes, with colors graduating from light blue to purplish-pink, are distributed from the upper left to the left edge. In the middle of the left side, the characters “目录” are displayed in a large, bold, white sans-serif font. On the right, a rectangular box with a thin white border is divided into four sections in a 2x2 grid. The top-left section is titled “01 自我评估” with the text “我很棒” below it. The top-right section is “02 职业认知” with “认真工作,努力生活” below it. The bottom-left section is “03 职业决策” with “坚定目标,不退缩” below it. The bottom-right section is “04 计划实施” with “脚踏实地,勇往直前” below it. All numbers and titles are in bold white font, while the descriptive text is in a smaller, regular white font. The image contains no human figures or features. The overall atmosphere is modern, professional, and futuristic. No other text appears in the image"
160
+ ---
161
+
162
+ ## Subtask 3: General Image Rewriting
163
+
164
+ When the image lacks human subjects or text, or primarily features landscapes, still lifes, or abstract compositions, cover these elements:
165
+
166
+ 1. **Core visual components**:
167
+ - Subject type, quantity, form, color, material, state (static/moving), and distinctive details.
168
+ - Spatial layering (foreground, midground, background) and relative positions/distances between objects.
169
+ - Lighting and color (light source direction, contrast, dominant hues, highlights/reflections/shadows).
170
+ - Surface textures (smooth, rough, metallic, fabric-like, transparent, frosted, etc.).
171
+ 2. **Scene and atmosphere**:
172
+ - Setting type (natural landscape, urban architecture, interior space, staged still life, etc.).
173
+ - Time and weather (morning mist, midday sun, post-rain dampness, snowy night silence, golden-hour warmth, etc.).
174
+ - Emotional tone (cozy, lonely, mysterious, high-tech, vibrant, etc.).
175
+ 3. **Visual relationships among multiple objects**:
176
+ - Functional connections (e.g., teapot and cup, utensils and food).
177
+ - Dynamic interactions (e.g., wind blowing curtains, water hitting rocks).
178
+ - Scale and proportion (e.g., towering skyscrapers, boulders vs. people, macro close-ups).
179
+
180
+ **Example Output**:
181
+ "A rugged mountain landscape under a clear blue sky with scattered white clouds. Snow-capped peaks dominate the background, with steep rocky slopes and visible glaciers. In the foreground, a rocky trail with scattered boulders and dry golden grass leads toward the mountains. Two red wooden trail markers stand on the right side of the path, one pointing left and the other pointing right; neither contains any visible text or inscriptions. No people, animals, or man-made structures beyond the trail markers are present. The lighting suggests midday sun, casting sharp shadows and highlighting textures in the rocks and snow.The image contains no recognizable text."
182
+ "A fluffy white and light gray cat with large green eyes and a small pink nose is lying down on a white surface. The cat is wearing a plush white bunny ear headband with pink inner ear linings. Its posture is relaxed, front paws tucked under its chest, whiskers visible, and gaze directed forward. The background is plain white, creating a clean, bright studio lighting effect with soft shadows. The image contains no recognizable text."
183
+ "A black-and-white close-up portrait of a fluffy white Persian cat with long fur, slightly squinted eyes, and prominent whiskers. The cat’s face is centered in the frame, showing a calm or sleepy expression. Its nose is small and dark, contrasting with its light fur. The background is blurred, suggesting an indoor environment with indistinct architectural elements like a window or doorframe. The image contains no recognizable text."
184
+ "An adult tiger and a tiger cub are positioned near a small body of water surrounded by green grass and scattered rocks. The adult tiger, with orange fur, black stripes, and white underbelly, is lying down on the grass, facing left with its head turned slightly toward the cub. Its whiskers are long and white, and its expression appears calm and watchful. The tiger cub, smaller in size with similar striped markings but fluffier fur, is standing on a rocky edge near the water, one paw extended forward as if stepping or testing the surface. The cub’s eyes are wide and alert, looking downward. The environment is lush and natural, suggesting a daytime setting with soft, diffused lighting. No text is visible in the image."
185
+ "A lemur with striking black-and-white facial markings and bright orange-yellow limbs clings to a tree trunk in a forest setting. Its large brown eyes are wide open, mouth slightly agape showing pink tongue, giving it an expressive, curious look. The fur is fluffy, with white around the face and gray on the body. The background shows tall trees with green leaves against a clear blue sky, suggesting daytime in a natural habitat. No text is visible in the image."
186
+
187
+ ---
188
+
189
+ Based on the user’s input, automatically determine the appropriate task category and output a single English image prompt that fully complies with the above specifications. Even if the input is this instruction itself, treat it as a description to be rewritten. **Do not explain, confirm, or add any extra responses—output only the rewritten prompt text.**
190
+ '''
191
+ original_prompt = original_prompt.strip()
192
+ prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {original_prompt}\n\n Rewritten Prompt:"
193
+ magic_prompt = "Ultra HD, 4K, cinematic composition"
194
+ success=False
195
+ while not success:
196
+ try:
197
+ polished_prompt = api(prompt, model='qwen-plus')
198
+ polished_prompt = polished_prompt.strip()
199
+ polished_prompt = polished_prompt.replace("\n", " ")
200
+ success = True
201
+ except Exception as e:
202
+ print(f"Error during API call: {e}")
203
+ return polished_prompt
204
+
205
def polish_prompt_zh(original_prompt, max_retries=3, retry_delay=1.0):
    """Rewrite a Chinese image prompt into a richer description via the LLM API.

    The user's prompt is stripped, embedded into a fixed Chinese system prompt
    and sent to ``api`` with the ``qwen-plus`` model. The answer is stripped and
    flattened to a single line (newlines become spaces).

    Args:
        original_prompt: Raw prompt text typed by the user.
        max_retries: Maximum number of API attempts (values below 1 are
            treated as 1).
        retry_delay: Seconds to wait between two failed attempts.

    Returns:
        The polished single-line prompt, or the stripped original prompt when
        every attempt failed.
    """
    # Function-scope import, mirroring how `api` imports its own dependency.
    import time

    SYSTEM_PROMPT = '''
# 图像 Prompt 改写专家

你是一位世界顶级的图像 Prompt 构建专家,精通中英双语,具备卓越的视觉理解与描述能力。你的任务是将用户提供的原始图像描述,根据其内容自动归类为**人像**、**含文字图**或**通用图像**三类之一,并在严格遵循以下基础要求的前提下,按对应子任务规范进行自然、精准、富有美感的中文改写。

---

## 基础要求(适用于所有任务)

1. **使用流畅、自然的描述性语言**,以连贯形式输出,禁止使用列表、编号、标题或任何结构化格式。
2. **合理丰富画面细节**:
- 判断画面是否为含文字图类型,若不是,不要添加多余的文字信息。
- 当原始描述信息不足时,可补充符合逻辑的环境、光影、质感或氛围元素,提升画面吸引力;当原始描述信息充足时,只做相应的修改;当原始描述信息过多或冗余时,在保留原意的情况下精简;
- 所有补充内容必须与已有信息风格统一、逻辑自洽,原有的内容和概念不得修改;
- 在简洁场景中保持克制,避免冗余扩展。
3. **严禁修改任何专有名词**:包括人名、品牌名、地名、IP 名称、电影/游戏标题、标语原文、网址、电话号码等,必须原样保留。
4. **完整呈现所有文字信息**:
- 若图像包含文字,**图像中显示的文字内容均使用中文双引号包含起来**,以便与其他内容区分。
- 若图像包含文字,须准确描述其内容、位置、排版方向(横排/竖排/换行)、字体风格、颜色、大小及呈现方式(如印刷、刺绣、霓虹灯等);
- 若图像内容里面暗示了存在相关的文字/数字信息,必须明确补充**具体的文字/数字内容**,并且使用双引号包含起来,拒绝出现“名单”,“列表”等模糊的文字暗示内容,补充内容不要过长。
- 若图像无任何文字,必须明确说明:“图像中未出现任何可识别文字”。
5. **明确指定整体艺术风格**,例如:写实摄影、动漫插画、电影海报、赛博朋克概念图、水彩手绘、3D 渲染、游戏 CG 等。

---

## 子任务一:人像图像改写

当画面以人物为核心主体时,请确保:

1. **指出人物基本信息**:种族、性别、大致年龄,脸型、五官特征、表情、肤色、肤质、妆容等;
2. **指出服装,发型与配饰**:上衣、下装、鞋履、外套等类型及面料质感;发色、发型、头饰、耳环、项链、戒指等;
3. **指出姿态与动作**:身体姿势、手势、视线方向、与道具的互动;
4. **指出背景与环境**:具体场景(如咖啡馆、街道、室内)、背景物体、光照(方向、强度、色温)、天气、整体氛围;
5. **指出其他对象细节**:若存在人以外的物品(如杯子、书本、宠物),需描述其数量、颜色、材质、位置及其与人物的空间或功能关系;
6. **控制输出顺序**: 针对人像场景,先描述人种,性别,年龄,再描述服装及饰品信息,再描述人物脸部及皮肤信息,再描述动作姿势,再描述背景相关信息。人像场景中输出先后顺序按照上述说明。
7. **内容篇幅保持克制**:人像场景下,改写/扩写的内容篇幅保持简洁,输出控制在150字以内。

**示例输出**:
“一位东亚女性,约20-30岁,身着米白色中式立领长裙,七分袖设计,左侧胸前有花卉刺绣装饰,盘扣为浅金色,腰间系有同色系细带。她发色乌黑,发型为低盘发髻,佩戴小巧耳饰,妆容淡雅,唇色自然红润,面部轮廓柔和,眼神低垂望向右下方,表情宁静。右手持一把米白色椭圆形团扇。背景为浅米色墙面,上方有模糊的绿植与阳光斑驳光影,整体光线柔和明亮,氛围温婉静谧。”
“一位东亚女性,约25-30岁,坐在木质圆桌旁,身穿红色无袖V领上衣和白色下装,发色深棕,发型为半扎发并饰有白色蕾丝发饰,佩戴金色圆环耳环和一枚花朵造型戒指。她面容清秀,五官柔和,皮肤白皙,妆容自然。她面带微笑,眼神温柔注视镜头,左手持小勺盛着奶油状甜点,右手轻抬。桌上摆放一杯琥珀色饮品、一杯带红色吸管的橙黄色饮料、一块吃剩的蛋糕及餐具。背景为暖色调咖啡馆或手作店,木制洞洞板货架陈列毛线球、罐装物品与编织篮。环境光线柔和,氛围温馨舒适。”
“一位东亚女性,约20-30岁,她仰头望向天空,神情宁静。她的发色为深棕色,齐刘海自然垂落,皮肤白皙带有细微雀斑,眼妆使用了金黄色眼影,睫毛纤长,唇色为自然粉红,嘴唇微张。背景模糊,呈现蓝绿色调,似户外自然环境,光线柔和,营造出梦幻氛围。”

---

## 子任务二:含文字图改写

当画面包含可识别文字时,请确保:

1. **忠实还原所有文字内容**:
- 明确指出文字所在位置(如招牌、屏幕、衣物、包装、海报等);
- 准确转录全部可见文字(含标点、大小写、换行、排版方向);
- 描述字体风格(如手写体、衬线体、书法体、像素风等)、颜色、大小、清晰度及是否有描边/阴影;
- 非中文文字(如英文、日文、韩文等)须保留原文并注明语种。
2. **说明文字与载体的关系**:
- 呈现方式(印刷、LED 屏、霓虹灯、刺绣、涂鸦等);
- 构图作用(标题、标语、品牌标识、装饰等);
- 与人物或其他物体的空间关系(如手持、张贴、投影等)。
3. **补充环境与氛围**:
- 场景类型(室内/室外、商业街、展览馆等);
- 光照对文字可读性的影响(反光、背光、夜间照明等);
- 整体色调与艺术风格(复古、极简、赛博朋克等)。
4. **在信息图/知识类场景中适度补充文字**:
- 若prompt中文字信息不完整但暗示存在文字,则补充布局及精确且精简的典型文案。必须明确列出具体的文字内容,拒绝“名单,列表,搭配文字”等模糊的文字暗示描述,而要将其细化为具体的文字内容。
- 若用户已提供详细文字,则以忠实保留为主,仅作必要润色;
- 文字内容必须与画面内容一一对应,拒绝模糊的描述。

**示例输出**:
“这是一张电影海报,右上角写着“聯手制霸或獨自殞落”。中部偏下位置有“哥吉拉與金剛 新帝國”的字样,底部居中显示“3月27日(週三)大銀幕鉅獻”。左下角有“LEGENDARY”标识,中部下方有“IMAX同步上映”,右下角有“WARNER BROS”标识。图像中央有巨大的“GK”字母,左侧是哥斯拉的剪影,右侧是金刚的形象,下方有直升机和远处的雕像,整体背景为天空和云层,色调为粉色和蓝色,营造出一种史诗般的科幻氛围。图像中未出现其他文字。”
“图像左上角有白色大字“GOOD TEA AND SET”和“好茶和集”,左侧边缘有小字“源自南靖核心产区 自带山水茶韵”,底部中央有括号文字“(N24°低纬度) 南靖丹桂茶”。画面右侧可见一双手正持深褐色陶壶倾倒热茶,壶嘴流出细长水流注入下方白色瓷盖碗,碗内有茶叶,蒸汽袅袅升腾。盖碗置于浅木色托盘上,旁放白色盖子。背景为深色木质桌面与柔和侧光,营造静谧茶道氛围。人物仅露出双手,肤色偏暖,无明显配饰或衣着细节,无法判断性别、年龄或面部特征。图像中未出现其他文字。”
“海报顶部醒目地显示白色文字“豆瓣评分 8.5”,中间位置印有“青年影展”标志。中央为大幅标题“山里的星星”,采用粗体书法风格,下方对应英文“STARS IN THE MOUNTAINS”,字体简洁现代。右中部偏上处标注导演姓名“李静”。底部清晰列出上映日期“9月10日 教师节献映”及主要演员名单。演员名单为:“刘德华,周杰伦”,背景展现一望无际的绿色梯田与层叠起伏的青山,色调清新自然。前景中一位年轻的东亚男老师身穿浅色衬衫和深色长裤,面带温和笑容,正低头指向手中打开的图画书;周围环绕着数名穿着朴素、笑容灿烂的山区孩子,孩子们肤色微黑,衣着简朴但整洁,神情专注而喜悦。整体画面光线明亮柔和,氛围温暖动人,充满希望与教育温情。图像中未出现其他文字。”
“这是一幅由六个分格组成的卡通漫画,内容关于地铁在紧急情况下的应对措施。左上角最大的分格中,一辆拟人化的地铁列车面带微笑,伸出右手食指指向右方。列车上方有一个对话框,内有文字“紧急情况处理中!”。列车右侧有一个喇叭图标,旁边是文字“广播系统:紧急疏散指令”。再往右是一个蓝色显示屏,上面写着“请保持冷静,跟随指引”。背景为橙黄色放射状图案。中间左侧的分格标题为“疏散通道:逃生门/滑梯”,画面显示车厢内乘客正通过打开的车门沿着滑梯向下滑,地面上有绿色箭头指示方向。中间右侧的分格标题为“应急照明 & 通讯:备用电源,紧急电话”,画面中有三名乘客,其中两人举着发光棒,一人正在使用墙上的紧急电话。左下角的分格标题为“通风排烟:排出烟雾,送入新风”,画面展示隧道内多个大型风扇正在运转,将灰色烟雾排出。右下角的分格标题为“安全停车,应急开启”,画面中拟人化地铁列车用手指按下一个红色的大按钮,按钮上方有三个矩形指示灯。每个分格的标题都位于该分格的顶部。图像中未出现其他文字。”
“图像整体呈现深蓝色调的科技感背景,左侧有由蓝紫色光线构成的弧形线条与光点装饰,营造出动态流动的视觉效果。左上角至左侧边缘区域分布着多条细长的发光曲线和若干大小不一的圆形光斑,颜色从浅蓝渐变至紫粉,部分光点带有微弱的辉光效果。图像左侧中部位置以大号白色字体显示“目录”二字,字体为无衬线粗体,清晰醒目。右侧区域有一个白色细边框矩形框,内部分为四个区块,呈2x2网格布局。每个区块上方是编号与标题,下方是说明文字。具体文字内容如下:右上角第一个区块文字为“01 自我评估”,其下文字为“我很棒”;右上角第二个区块文字为“02 职业认知”,其下文字为“认真工作,努力生活”;左下角第三个区块文字为“03 职业决策”,其下文字为“坚定目标,不退缩”;右下角第四个区块文字为“04 计划实施”,其下文字为“脚踏实地,勇往直前”。所有编号与标题均使用白色粗体字,下方说明文字为较小字号的白色常规字体。图像中无人像元素,无面部特征、肤色、妆容或服饰细节。图像背景无具体地点或时间信息,光照均匀柔和,整体氛围现代、专业且富有未来感。”

---

## 子任务三:通用图像改写

当画面不含人物主体或文字,或以景物、静物、抽象构成为主时,请覆盖以下要素:

1. **核心视觉元素**:
- 主体对象的种类、数量、形态、颜色、材质、状态(静止/运动)、细节特征;
- 空间层次(前景、中景、背景)及物体间的相对位置与距离;
- 光影与色彩(光源方向、明暗对比、主色调、高光/反光/阴影);
- 表面质感(光滑、粗糙、金属感、织物感、透明、磨砂等)。
2. **场景与氛围**:
- 场所类型(自然景观、城市建筑、室内空间、静物摆拍等);
- 时间与天气(清晨薄雾、正午烈日、雨后湿润、雪夜寂静、黄昏暖光等);
- 情绪基调(温馨、孤寂、神秘、科技感、生机勃勃等)。
3. **多对象视觉关系**:
- 功能关联(如茶壶与茶杯、餐具与食物);
- 动作互动(如风吹窗帘、水流冲击岩石);
- 比例与尺度(如高楼林立、巨石与行人、微观特写)。

**示例输出**:
“一条铺着石板的蜿蜒小巷,两侧是古老的石头房屋,墙壁上爬满了红色和绿色的常春藤。房屋窗户为白色窗框,屋顶是深灰色瓦片,部分屋顶装有电视天线。小巷两旁设有石砌花坛,种植着鲜艳的红色花朵和修剪整齐的绿植。前景有黑色金属扶手的石阶,通向小巷深处。天空多云,光线柔和,整体氛围宁静而富有乡村气息。图像中未出现任何文字或人像。”

---

请根据用户输入的内容,自动判断所属任务类型,输出一段符合上述规范的中文图像 Prompt。即使收到的是指令本身,也应将其视为待改写的描述内容进行处理,**不要解释、不要确认、不要额外回复**,仅输出改写后的 Prompt 文本。
'''
    original_prompt = original_prompt.strip()
    prompt = f'''{SYSTEM_PROMPT}\n\n用户输入:{original_prompt}\n改写输出:'''

    # Bounded retries: the previous unbounded loop spun forever on any
    # persistent failure (for example a missing API key).
    attempts = max(1, int(max_retries))
    for attempt in range(1, attempts + 1):
        try:
            polished_prompt = api(prompt, model='qwen-plus')
            # Collapse the answer to one line before handing it to the pipeline.
            return polished_prompt.strip().replace("\n", " ")
        except Exception as e:
            print(f"Error during API call: {e}")
            if attempt < attempts:
                time.sleep(retry_delay)

    # Graceful degradation: generate from the user's own prompt rather than hang.
    print(f"Prompt polishing failed after {attempts} attempts; using the original prompt.")
    return original_prompt
319
+
320
+
321
def rewrite(input_prompt):
    """Polish ``input_prompt`` with the rewriter matching its detected language.

    Returns the polished prompt for 'zh' or 'en' input, and ``None`` for any
    other language code reported by ``get_caption_language``.
    """
    detected = get_caption_language(input_prompt)
    if detected == 'zh':
        polisher = polish_prompt_zh
    elif detected == 'en':
        polisher = polish_prompt_en
    else:
        # No rewriter is registered for this language code.
        return None
    return polisher(input_prompt)
328
+
329
+
330
+
331
+
332
# --- Model Loading ---
# Everything in this section runs at import time, in this order:
# load the pipeline, swap the attention processor, then compile.
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model pipeline
pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image-2512", torch_dtype=dtype).to(device)
# Install the FlashAttention-3 attention processor on the transformer.
pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())

# --- Ahead-of-time compilation ---
# The keyword arguments are forwarded to one pipeline call made inside
# optimize_pipeline_, so "prompt" is only a placeholder input.
optimize_pipeline_(pipe, prompt="prompt")

# --- UI Constants and Helpers ---
# Largest signed 32-bit integer; used as the upper bound for seeds.
MAX_SEED = np.iinfo(np.int32).max
345
+
346
def get_image_size(aspect_ratio):
    """Map an aspect-ratio label such as "16:9" to a ``(width, height)`` tuple.

    Labels that are not in the table fall back to the square 1:1 size.
    """
    known_sizes = (
        ("1:1", (1328, 1328)),
        ("16:9", (1664, 928)),
        ("9:16", (928, 1664)),
        ("4:3", (1472, 1104)),
        ("3:4", (1104, 1472)),
        ("3:2", (1584, 1056)),
        ("2:3", (1056, 1584)),
    )
    # Compare with ``==`` (not a dict lookup) so any input type is accepted.
    for label, size in known_sizes:
        if aspect_ratio == label:
            return size
    # Default to 1:1 if something goes wrong
    return 1328, 1328
365
+
366
+ # --- Main Inference Function (with hardcoded negative prompt) ---
367
@spaces.GPU(duration=120)
def infer(
    prompt,
    seed=42,
    randomize_seed=False,
    aspect_ratio="16:9",
    guidance_scale=4.0,
    num_inference_steps=50,
    prompt_enhance=True,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Run one text-to-image generation with the module-level Qwen-Image pipeline.

    Args:
        prompt: User prompt; replaced by ``rewrite(prompt)`` when
            ``prompt_enhance`` is true.
        seed: Seed for the torch generator (ignored when ``randomize_seed``).
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` instead.
        aspect_ratio: Label resolved to a pixel size by ``get_image_size``.
        guidance_scale: Passed to the pipeline as ``true_cfg_scale``.
        num_inference_steps: Number of denoising steps.
        prompt_enhance: Whether to polish the prompt before generation.
        progress: Gradio progress tracker (tracks tqdm).

    Returns:
        ``(image, seed)``: the first generated image and the seed actually used.
    """
    # The negative prompt is fixed; it is not exposed in the UI.
    negative_prompt = "低分辨率,低画质,肢体畸形,手指畸形,画面过饱和,蜡像感,人脸无细节,过度光滑,画面具有AI感。构图混乱。文字模糊,扭曲。"

    seed = random.randint(0, MAX_SEED) if randomize_seed else seed

    # Resolve the output resolution and seed the generator for reproducibility.
    width, height = get_image_size(aspect_ratio)
    generator = torch.Generator(device=device).manual_seed(seed)

    print(f"Calling pipeline with prompt: '{prompt}'")
    if prompt_enhance:
        prompt = rewrite(prompt)
    print(f"Actual Prompt: '{prompt}'")
    print(f"Negative Prompt: '{negative_prompt}'")
    print(f"Seed: {seed}, Size: {width}x{height}, Steps: {num_inference_steps}, Guidance: {guidance_scale}")

    pipeline_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "width": width,
        "height": height,
        "num_inference_steps": num_inference_steps,
        "generator": generator,
        "true_cfg_scale": guidance_scale,
        # Use a fixed default for distilled guidance
        "guidance_scale": 1.0,
    }
    output = pipe(**pipeline_kwargs)
    image = output.images[0]

    return image, seed
413
+
414
+ # --- Examples and UI Layout ---
415
# Sample prompts offered in the UI through gr.Examples.
examples = [
    "一位身着淡雅水粉色交领襦裙的年轻女子背对镜头而坐,俯身专注地手持毛笔在素白宣纸上书写“通義千問”四个遒劲汉字。古色古香的室内陈设典雅考究,案头错落摆放着青瓷茶盏与鎏金香炉,一缕熏香轻盈升腾;柔和光线洒落肩头,勾勒出她衣裙的柔美质感与专注神情,仿佛凝固了一段宁静温润的旧时光。",
    "Realistic still life photography style: A single, fresh apple resting on a clean, soft-textured surface. The apple is slightly off-center, softly backlit to highlight its natural gloss and subtle color gradients—deep crimson red blending into light golden hues. Fine details such as small blemishes, dew drops, and a few light highlights enhance its lifelike appearance. A shallow depth of field gently blurs the neutral background, drawing full attention to the apple. Hyper-detailed 8K resolution, studio lighting, photorealistic render, emphasizing texture and form.",
    "一位东亚女性,约20-30岁,身材娇小,皮肤白皙如瓷,呈现冷白皮质感,水润光滑,面部轮廓柔和,眼神清澈灵动,眼妆自然清透,睫毛纤长卷翘,唇色为浅粉色,微微上扬的嘴角带着俏皮可爱的笑意。她拥有一头深黑色长发,发丝蓬松柔顺,自然垂落肩头,碎发轻拂脸颊,增添灵动感,发尾微卷,随性散落。身着浅色高质感休闲连衣裙,材质似丝绸或雪纺,搭配一顶贝雷帽,帽檐微微压低,凸显偶像气质。手腕佩戴多条精致手链,金属与珍珠元素交织,正自然展示于镜头前。背景为少女心爆棚的饰品店,店内装修精致,陈列琳琅满目,暖光灯与柔和自然光交织,角落一棵圣诞树点缀着彩灯与装饰物,整体氛围温馨浪漫,画面呈日常快照风格,构图随意却充满生活美感,8K高清摄影。",
    "一位东亚女性,约20岁,身着白色高定蕾丝连衣裙,裙摆轻盈飘动,露出修长双腿与黑色细跟高跟鞋,发色乌黑,长发自然披肩,肌肤白皙如凝脂,唇色为水润朱红,眼神温柔含光,略带腼腆地望向镜头。她坐在咖啡馆窗边,右手轻扶杯沿,杯中是一杯带有爱心拉花的深棕色咖啡,桌旁放一本翻开的纸质书与一束淡粉色康乃馨。窗外阳光斜洒,照亮她半边脸庞,营造出温暖柔和的氛围。背景为暖色调木质窗框与浅米色窗帘,左侧贴有“圣诞快乐”字样贴纸,窗外可见一棵装饰精美的圣诞树,枝头挂满彩灯与小饰品,整体画面采用超广角拍摄,无畸变,32K高清摄影,呈现出静谧而浪漫的午后时光。图像中未出现其他文字。",
    "一位年轻的东亚女性,约20-25岁,开怀大笑,双眼弯如月牙,神情明媚愉悦。她肤色白皙,面部轮廓柔和,妆容清新自然,唇色鲜亮。深棕色大波浪卷发蓬松丰盈,随意披散于肩头。上身穿着明黄色细肩带背心,下搭浅蓝色牛仔短裤,整体穿搭休闲活力。背景是一面色彩斑斓的大型街头涂鸦墙,图案鲜明、笔触奔放,阳光从前方斜照,光线充足明亮,营造出自由、热烈而充满街头艺术气息的氛围。",
    "一位东亚女性,约19岁,身形纤瘦,高鼻梁,黑色长发自然垂落。她身处温馨的咖啡馆内,木质桌面上摆放着一杯拉花咖啡、一块抹茶蛋糕和几张照片卡片。她身穿质感软糯的彩色条纹针织毛衣,纹理细腻,色彩柔和,凸显温暖氛围。她以手肘轻撑桌面,一手托着脸颊,姿态放松自然,脸上带着清甜微笑,眼神灵动而平静,目光或看向镜头或微微偏移,神情慵懒随性。阳光透过发丝洒在面部,肌肤呈现自然状态,无明显妆感。画面为俯视视角,整体光线柔和但略不均匀,存在轻微过曝与运动模糊,保留写实摄影风格的细微噪点,高光不过度溢出,阴影保留细节,构图随意,如iPhone随手抓拍,呈现出真实、松弛又治愈的少女日常瞬间。",
    "一只美洲豹潜伏在热带雨林的河岸边,压低健壮的身躯,深黄色皮毛上布满比普通豹子更大更黑的斑点,下颌线条强健有力。它目光专注地锁定水中动静,墨绿色河面清晰倒映出它的轮廓。背景是茂密潮湿的蕨类植物与交错缠绕的藤蔓,整体光线昏暗,氛围紧张而原始。图像中无任何文字、人像或人工标识。",
    "一头雄性盘羊伫立在崎岖裸露的岩石山坡上,灰褐色皮毛粗硬浓密,身躯魁梧结实,肌肉线条分明。它最引人注目的是那对巨大、厚重且向外螺旋盘旋的角,彰显其野性力量。盘羊眼神警觉,目光锐利地扫视四周环境。背景为陡峭险峻的高山地貌,山体嶙峋,植被稀疏低矮,阳光充沛,整体画面凸显高山荒野的苍劲氛围与盘羊顽强的生命力。",
    "夜空下,璀璨银河如一条发光的河流横贯天际,无数繁星闪烁其间。下方是广袤无垠的沙漠,几座巨大的沙丘在星光映照下轮廓分明,线条柔和流畅。前景中一棵枯死的胡杨树挺立,枝干伸展成极具张力的剪影。整体画面色调深邃,光影对比鲜明,氛围辽阔、静谧,透出宇宙的浩瀚与苍凉。"
]

# Page CSS: centres the main column (elem_id "col-container") and caps its width.
css = """
#col-container {
margin: 0 auto;
max-width: 1024px;
}
"""
433
+
434
# Gradio UI definition. Components are laid out in the order they are created.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown('<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_logo.png" alt="Qwen-Image Logo" width="400" style="display: block; margin: 0 auto;">')
        gr.Markdown("[Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series. Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image) to run locally with ComfyUI or diffusers.")
        # Prompt box and run button share one row.
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0, variant="primary")

        result = gr.Image(label="Result", show_label=False, type="pil")

        with gr.Accordion("Advanced Settings", open=False):
            # Negative prompt UI element is removed here

            # NOTE: the UI defaults below (seed 0, randomized) differ from the
            # defaults in infer's signature (seed 42, not randomized).
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )

            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            with gr.Row():
                aspect_ratio = gr.Radio(
                    label="Aspect ratio (width:height)",
                    choices=["1:1", "16:9", "9:16", "4:3", "3:4", "3:2", "2:3"],
                    value="16:9",
                )
                prompt_enhance = gr.Checkbox(label="Prompt Enhance", value=True)

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.0,
                    maximum=10.0,
                    step=0.1,
                    value=4.0,
                )

                num_inference_steps = gr.Slider(
                    label="Number of inference steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=50,
                )

        # Example prompts; only the prompt box is wired as an input here.
        gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)

    # Run inference on button click or when the prompt is submitted.
    # The inputs list must follow the positional parameter order of infer.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            # negative_prompt is no longer an input from the UI
            seed,
            randomize_seed,
            aspect_ratio,
            guidance_scale,
            num_inference_steps,
            prompt_enhance,
        ],
        outputs=[result, seed],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
optimization.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ from typing import Callable
3
+ from typing import ParamSpec
4
+ import spaces
5
+ import torch
6
+ from torch.utils._pytree import tree_map
7
+ from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights
8
+
9
# ParamSpec used to type the arguments that optimize_pipeline_ forwards to the pipeline.
P = ParamSpec('P')


# Named symbolic dimensions for torch.export, one per sequence-length axis.
TRANSFORMER_IMAGE_SEQ_LENGTH_DIM = torch.export.Dim('image_seq_length')
TRANSFORMER_TEXT_SEQ_LENGTH_DIM = torch.export.Dim('text_seq_length')

# Per-argument dynamic-shape spec: {argument name: {dimension index: Dim}}.
# It is merged over an all-None spec inside optimize_pipeline_, so arguments
# not listed here carry no dynamic-dimension annotation.
TRANSFORMER_DYNAMIC_SHAPES = {
    'hidden_states': {
        1: TRANSFORMER_IMAGE_SEQ_LENGTH_DIM,
    },
    'encoder_hidden_states': {
        1: TRANSFORMER_TEXT_SEQ_LENGTH_DIM,
    },
    'encoder_hidden_states_mask': {
        1: TRANSFORMER_TEXT_SEQ_LENGTH_DIM,
    },
    # Tuple argument: one spec per element (image frequencies, text frequencies).
    'image_rotary_emb': ({
        0: TRANSFORMER_IMAGE_SEQ_LENGTH_DIM,
    }, {
        0: TRANSFORMER_TEXT_SEQ_LENGTH_DIM,
    }),
}


# Options passed as the second argument of spaces.aoti_compile.
INDUCTOR_CONFIGS = {
    'conv_1x1_as_mm': True,
    'epilogue_fusion': False,
    'coordinate_descent_tuning': True,
    'coordinate_descent_check_all_directions': True,
    'max_autotune': True,
    'triton.cudagraphs': True,
}
41
+
42
+
43
def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kwargs):
    """Ahead-of-time compile the transformer blocks of ``pipeline`` in place.

    The pipeline is run once with ``*args``/``**kwargs`` to record the inputs
    seen by the first transformer block. That block is exported and compiled,
    and the compiled artifact is then bound to every block together with that
    block's own weights.
    """

    @spaces.GPU(duration=1500)
    def compile_transformer():
        # Only capture what the first `transformer_block` sees.
        with spaces.aoti_capture(pipeline.transformer.transformer_blocks[0]) as captured:
            pipeline(*args, **kwargs)

        # Start from "nothing dynamic" for every captured kwarg, then overlay
        # the arguments that do have dynamic dimensions.
        shape_spec = tree_map(lambda t: None, captured.kwargs)
        shape_spec.update(TRANSFORMER_DYNAMIC_SHAPES)

        # Optionally quantize it.
        # quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())

        # Only export the first transformer block.
        exported_program = torch.export.export(
            mod=pipeline.transformer.transformer_blocks[0],
            args=captured.args,
            kwargs=captured.kwargs,
            dynamic_shapes=shape_spec,
        )
        return spaces.aoti_compile(exported_program, INDUCTOR_CONFIGS)

    compiled = compile_transformer()
    # Reuse the single compiled archive for each block, paired with that
    # block's own state dict.
    for block in pipeline.transformer.transformer_blocks:
        block.forward = ZeroGPUCompiledModel(
            compiled.archive_file,
            ZeroGPUWeights(block.state_dict()),
        )
qwenimage/__init__.py ADDED
File without changes
qwenimage/qwen_fa3_processor.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Paired with a good language model. Thanks!
3
+ """
4
+
5
+ import torch
6
+ from typing import Optional, Tuple
7
+ from diffusers.models.transformers.transformer_qwenimage import apply_rotary_emb_qwen
8
+
9
# Resolve the FlashAttention-3 kernel once at import time. Any failure is
# recorded instead of raised, so this module can still be imported; the error
# is reported later by _ensure_fa3_available().
try:
    from kernels import get_kernel
    _k = get_kernel("kernels-community/vllm-flash-attn3")
    _flash_attn_func = _k.flash_attn_func
except Exception as e:
    # None marks the kernel as unavailable; keep the cause for the error message.
    _flash_attn_func = None
    _kernels_err = e
16
+
17
+
18
def _ensure_fa3_available():
    """Raise ``ImportError`` if the FA3 kernel was not loaded at import time."""
    if _flash_attn_func is not None:
        return
    message = (
        "FlashAttention-3 via Hugging Face `kernels` is required. "
        "Tried `get_kernel('kernels-community/vllm-flash-attn3')` and failed with:\n"
        f"{_kernels_err}"
    )
    raise ImportError(message)
25
+
26
# Register the FA3 kernel as a torch custom op named "flash::flash_attn_func".
# mutates_args=() declares that no input tensor is modified in place.
@torch.library.custom_op("flash::flash_attn_func", mutates_args=())
def flash_attn_func(
    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal: bool = False
) -> torch.Tensor:
    """Run the loaded FA3 kernel on q/k/v and return only the attention output."""
    # The kernel returns (output, softmax_lse); the second value is discarded.
    outputs, lse = _flash_attn_func(q, k, v, causal=causal)
    return outputs

@flash_attn_func.register_fake
def _(q, k, v, **kwargs):
    """Fake implementation: return an uninitialised tensor shaped like ``q``."""
    # The real op exposes a single output, so the fake returns a single
    # tensor with q's shape and dtype, made contiguous.
    meta_q = torch.empty_like(q).contiguous()
    return meta_q  # no softmax_lse here: the op above does not return it
40
+
41
+
42
class QwenDoubleStreamAttnProcessorFA3:
    """
    Attention processor that runs Qwen double-stream attention through FlashAttention-3.

    The text and image streams are projected separately, concatenated as
    [text, image] along the sequence axis, attended jointly with the
    ``flash_attn_func`` custom op, and split back into the two streams.

    Limitations:
    - ``attention_mask`` is rejected with ``NotImplementedError``.
    - ``encoder_hidden_states_mask`` is accepted but never read.
    - Attention is always non-causal.
    """

    _attention_backend = "fa3"  # informational tag; nothing in this class reads it

    def __init__(self):
        # Fail at construction time if the FA3 kernel could not be loaded.
        _ensure_fa3_available()

    @torch.no_grad()
    def __call__(
        self,
        attn,  # attention module providing the projections, norms and .heads used below
        hidden_states: torch.FloatTensor,  # image stream
        encoder_hidden_states: torch.FloatTensor = None,  # text stream (required)
        encoder_hidden_states_mask: torch.FloatTensor = None,  # unused in FA3 path
        attention_mask: Optional[torch.FloatTensor] = None,  # must be None
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # (img_freqs, txt_freqs)
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        """Compute joint attention and return ``(image_output, text_output)``.

        Raises:
            ValueError: if ``encoder_hidden_states`` is ``None``.
            NotImplementedError: if ``attention_mask`` is given.
            ImportError: if the FA3 kernel is unavailable.
        """
        if encoder_hidden_states is None:
            raise ValueError("QwenDoubleStreamAttnProcessorFA3 requires encoder_hidden_states (text stream).")
        if attention_mask is not None:
            # Masks cannot be honoured on this path, so refuse them rather
            # than silently ignoring them.
            raise NotImplementedError("attention_mask is not supported in this FA3 implementation.")

        _ensure_fa3_available()

        # The values are unused, but the three-way unpack requires a 3-D input.
        _batch, _image_len, _ = hidden_states.shape
        text_len = encoder_hidden_states.shape[1]
        head_count = attn.heads

        def split_heads(projected):
            # Split the last dimension into (heads, head_dim).
            return projected.unflatten(-1, (head_count, -1))

        # Image-stream projections.
        image_q = split_heads(attn.to_q(hidden_states))
        image_k = split_heads(attn.to_k(hidden_states))
        image_v = split_heads(attn.to_v(hidden_states))

        # Text-stream projections.
        text_q = split_heads(attn.add_q_proj(encoder_hidden_states))
        text_k = split_heads(attn.add_k_proj(encoder_hidden_states))
        text_v = split_heads(attn.add_v_proj(encoder_hidden_states))

        # Optional query/key normalisation; each norm applies only if present.
        q_norm = getattr(attn, "norm_q", None)
        if q_norm is not None:
            image_q = q_norm(image_q)
        k_norm = getattr(attn, "norm_k", None)
        if k_norm is not None:
            image_k = k_norm(image_k)
        added_q_norm = getattr(attn, "norm_added_q", None)
        if added_q_norm is not None:
            text_q = added_q_norm(text_q)
        added_k_norm = getattr(attn, "norm_added_k", None)
        if added_k_norm is not None:
            text_k = added_k_norm(text_k)

        # Rotary position embedding on queries and keys of both streams.
        if image_rotary_emb is not None:
            image_freqs, text_freqs = image_rotary_emb
            image_q = apply_rotary_emb_qwen(image_q, image_freqs, use_real=False)
            image_k = apply_rotary_emb_qwen(image_k, image_freqs, use_real=False)
            text_q = apply_rotary_emb_qwen(text_q, text_freqs, use_real=False)
            text_k = apply_rotary_emb_qwen(text_k, text_freqs, use_real=False)

        # Joint sequence: text tokens first, image tokens after.
        joint_q = torch.cat([text_q, image_q], dim=1)
        joint_k = torch.cat([text_k, image_k], dim=1)
        joint_v = torch.cat([text_v, image_v], dim=1)

        attended = flash_attn_func(joint_q, joint_k, joint_v, causal=False)

        # Merge the head dimensions back together and restore the query dtype.
        attended = attended.flatten(2, 3).to(joint_q.dtype)

        # Undo the concatenation: the first text_len positions are text.
        text_out = attended[:, :text_len, :]
        image_out = attended[:, text_len:, :]

        # Output projections.
        image_out = attn.to_out[0](image_out)
        if len(attn.to_out) > 1:
            image_out = attn.to_out[1](image_out)  # dropout if present

        text_out = attn.to_add_out(text_out)

        return image_out, text_out
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/diffusers.git
2
+ transformers
3
+ accelerate
4
+ safetensors
5
+ sentencepiece
6
+ dashscope
7
+ kernels