SleepVeryHard committed 9 days ago

Commit

ac9d6fd

verified ·

1 Parent(s): 9865576

bump

Browse files

Files changed (24) hide show

.gitattributes +12 -0
examples/chroma.jpg +3 -0
examples/json.jpg +3 -0
examples/json_comic.jpg +3 -0
examples/long.jpg +3 -0
examples/long_thoughts.jpg +3 -0
examples/long_thoughts_v2.jpg +0 -0
examples/md_comic.jpg +3 -0
examples/min_structured_json.jpg +3 -0
examples/min_structured_md.jpg +3 -0
examples/short.webp +0 -0
formats.md +284 -0
model_BF16.gguf +3 -0
pics/example.jpg +3 -0
pics/nsfw_inaccuracy.png +0 -0
pics/original.jpg +3 -0
pics/retrieval_from_tags.png +0 -0
pics/splash.jpg +3 -0
pics/zero_shot_guess.png +0 -0
scripts/__pycache__/prompts.cpython-310.pyc +0 -0
scripts/caption_distributed.py +319 -0
scripts/gradio_interface.py +635 -0
scripts/prompts.py +220 -0
scripts/transformers_ver.py +67 -0

.gitattributes CHANGED Viewed

@@ -38,3 +38,15 @@ ToriiGate-0.5_Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
 mmproj_Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
 ToriiGate-0.5_Q4_K_L.gguf filter=lfs diff=lfs merge=lfs -text
 ToriiGate-0.5_Q6_K_L.gguf filter=lfs diff=lfs merge=lfs -text

 mmproj_Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
 ToriiGate-0.5_Q4_K_L.gguf filter=lfs diff=lfs merge=lfs -text
 ToriiGate-0.5_Q6_K_L.gguf filter=lfs diff=lfs merge=lfs -text
+examples/chroma.jpg filter=lfs diff=lfs merge=lfs -text
+examples/json_comic.jpg filter=lfs diff=lfs merge=lfs -text
+examples/json.jpg filter=lfs diff=lfs merge=lfs -text
+examples/long_thoughts.jpg filter=lfs diff=lfs merge=lfs -text
+examples/long.jpg filter=lfs diff=lfs merge=lfs -text
+examples/md_comic.jpg filter=lfs diff=lfs merge=lfs -text
+examples/min_structured_json.jpg filter=lfs diff=lfs merge=lfs -text
+examples/min_structured_md.jpg filter=lfs diff=lfs merge=lfs -text
+model_BF16.gguf filter=lfs diff=lfs merge=lfs -text
+pics/example.jpg filter=lfs diff=lfs merge=lfs -text
+pics/original.jpg filter=lfs diff=lfs merge=lfs -text
+pics/splash.jpg filter=lfs diff=lfs merge=lfs -text

examples/chroma.jpg ADDED Viewed

Git LFS Details

SHA256: 3735cef47dc761c78d5e034265335df81bcfd7f0554d6eb714465062f54817f6
Pointer size: 131 Bytes
Size of remote file: 264 kB

examples/json.jpg ADDED Viewed

Git LFS Details

SHA256: 5e5ba7a86cc70383af43234fe1306ac5882fd46a2539250bb16a3398bf77b294
Pointer size: 131 Bytes
Size of remote file: 151 kB

examples/json_comic.jpg ADDED Viewed

Git LFS Details

SHA256: 4ecbb4e0338e2865cc17f7d1b3d72bcb3bd1ca5d3dac0c7d50fb3689d4eb73f6
Pointer size: 131 Bytes
Size of remote file: 158 kB

examples/long.jpg ADDED Viewed

Git LFS Details

SHA256: 73cb70f4334affa777ea22ea50804ab562766a265222ba91c26842251135ff36
Pointer size: 131 Bytes
Size of remote file: 213 kB

examples/long_thoughts.jpg ADDED Viewed

Git LFS Details

SHA256: 7ee4ca01627a93e7c4673e9721b2b86f2c6860a660d5a92160cba8f4afcfdc62
Pointer size: 131 Bytes
Size of remote file: 137 kB

examples/long_thoughts_v2.jpg ADDED Viewed

examples/md_comic.jpg ADDED Viewed

Git LFS Details

SHA256: 0e9f996f16c8d472a5bf249ccc0c875def22f94dd043c142cf111c07d03b2b06
Pointer size: 131 Bytes
Size of remote file: 264 kB

examples/min_structured_json.jpg ADDED Viewed

Git LFS Details

SHA256: 0e6103522edb0ee3ace2db6e45d39564ecca0d99228b99ad055784929a524da7
Pointer size: 131 Bytes
Size of remote file: 210 kB

examples/min_structured_md.jpg ADDED Viewed

Git LFS Details

SHA256: d39ee8704022c162636350523e6745a9dfdfcf25cb4dcd2c8bf2e834832f7f3f
Pointer size: 131 Bytes
Size of remote file: 216 kB

examples/short.webp ADDED Viewed

formats.md ADDED Viewed

	@@ -0,0 +1,284 @@

+# Formats
+The model supports 9 main format variants. They can be divided into 4 categories:
+## 1. Detailed Structured Descriptions with Multiple Sections
+- Long Thoughts
+- Long Thoughts v2
+- Chroma-style
+These create the most detailed descriptions across various criteria with maximum informativeness. They can later be refactored into captions for training a specific model, or their parts can be used for labeling.
+The first two are intended for use only with named characters.
+## 2. Structured Formats for Ready-to-Use Prompts
+- Minimalistic Structured Markdown
+- Minimalistic Structured Json
+- Json
+These can be directly used as prompts after minimal processing. The existing structure allows selecting individual parts for individual prompting, or leaving them in their current form.
+It is recommended to parse Json output into plain text or Markdown.
+The `Minimalistic Structured Markdown` variant has two additional reasoning sections at the beginning; when using it, they should be excluded with regular expressions. It can only be used with character names.
+## 3. Comic Descriptions
+- Markdown Comic
+- Json Comic
+Both formats are optimized for comic and manga descriptions, creating detailed descriptions of each frame.
+`Markdown Comic` has a reasoning block at the beginning to improve accuracy.
+## 4. Legacy Formats
+- Long
+- Short
+Long and short descriptions in plain text, without formatting or structure. Character names are supported.
+# Examples
+## Long Thoughts v2
+![long_thoughts_v2](examples/long_thoughts_v2.jpg)
+> # 1. Thoughts about characters
+> The image features two characters: Shirakami Fubuki and Sukonbu (a personification of Shirakami Fubuki). Shirakami Fubuki is identifiable by her distinct fox ears, white hair with a single braid, fox tail, and ahoge. Sukonbu is a small, chibi-like figure with fox ears, a star accessory on the chest, and a simple, rounded design.
+>
+> # 2. Key details
+> - The main focus is on the shadow cast on the sand, which shows a figure in a blue bikini with fox ears and a tail, making a double fox shadow puppet gesture with both hands.
+> - The shadow is clear and well-defined, contrasting sharply with the sandy background.
+> - The actual character (Fubuki) is not directly visible, only her shadows and a small plush-like Sukonbu figure in the lower left corner.
+> - The shadow hints at a playful and cute moment, referencing the double fox shadow puppet.
+> - The lighting suggests a sunny day with sharp shadows, possibly at sunset or sunrise due to the warm sand tone.
+>
+> # 3. Long description
+> This high-resolution image depicts a sandy beach scene from a birds-eye view. The sand is smooth and uniformly colored in warm beige tones. Dominating the center is a sharp, dark shadow of a fox-eared girl over a blue bikini with a frilled skirt bottom. The shadow shows the character standing on one leg and arms raised, hands forming the classic fox shadow puppet gesture with two fingers extended on each hand. The shadow includes a clearly visible fox tail, fox ears, and an ahoge (a single hair strand sticking up).
+>
+> In the bottom left corner, a small, chibi-style plush or figure resembling Sukonbu, the personification of Shirakami Fubuki, is sitting on the sand. Sukonbu has large black eyes, a small open mouth with a cheerful expression, blue markings on the face, and a star-shaped accessory on its chest. The plush's ears and tail are clearly visible, matching the fox theme.
+>
+> The image cleverly uses the shadow as a focal point, showing the silhouette of Shirakami Fubuki in her blue bikini, while the actual figure is not seen directly, creating a playful and meme-inspired visual. The lighting is bright and casts a crisp shadow on the sand, emphasizing the contours and playful pose of the fox-girl shadow puppet.
+>
+> # 4. Detailed description for each character
+> ## Shirakami Fubuki
+> Though not directly visible, Shirakami Fubuki's shadow is the central figure on the sand. The shadow shows her wearing a blue bikini top tied in the center with a bow and a frilled bikini skirt bottom. She has fox ears on top of her head and a prominent fox tail curling behind her. Her arms are raised, and she is making a double fox shadow puppet with her hands. The shadow also shows her signature ahoge on top of her head. Her posture is relaxed, standing with legs apart, and the shadow captures her playful and cute personality.
+>
+> ## Sukonbu (Shirakami Fubuki)
+> Sukonbu appears as a small, chibi plush or figure in the bottom left corner of the image. It has a round, white face with large black oval eyes, a small smiling mouth, and light blue markings on the cheeks and forehead. The plush has large fox ears with black inner parts and a yellow tuft of hair at the back. It wears a blue scarf or ribbon tied around its neck with a star-shaped accessory in the center. The tail is fluffy and visible behind it, matching the fox theme. Sukonbu looks cheerful and adds a cute contrast to the shadow figure.
+## Long Thoughts
+![long_thoughts](examples/long_thoughts.jpg)
+> # 1. Thoughts about characters
+> The main identifiable character here is Wakamo from Blue Archive, recognized by her distinctive fox ears, large fluffy tail, and long black hair with yellow eyes. The other figure is a roughly sketched man in a suit, likely representing Sensei (Blue Archive), though he is not fully detailed or named.
+>
+> # 2. General description
+> The image is a humorous, stylized cartoon drawing with a simple white background. It features a chibi-styled Wakamo sitting on a small wooden stool, holding a bouquet of yellow flowers. She has large fox ears, a fluffy black and red tail wagging excitedly, and wears a black dress with gold patterns and a red cape. Next to her stands a crudely drawn man in a beige suit, tightly bound in red string, holding a "Marriage Certificate" paper with a red stamp and a checkmark. The man's face is obscured by a scribble, and he smiles awkwardly. In the upper right corner, a small inset photo shows a real-life wedding scene with the same couple tied up in a similar red string bondage, smiling and posing happily.
+>
+> # 3. Detailed description for each character
+> ## Wakamo (Blue Archive)
+> Wakamo is portrayed in a chibi style with exaggerated large yellow eyes and a small open mouth showing a cheerful expression. Her long black hair flows down her back, adorned with a small yellow flower hair ornament on one side. Her fox ears are black with red tips and white inner fur, positioned upright. She wears a black dress with gold cloud-like patterns and a red cape draped over her shoulders. Her large, fluffy fox tail is black with a red tip and is animatedly wagging, indicated by motion lines. She holds a small bouquet of yellow flowers in her gloved hands and sits on a simple wooden stool.
+>
+> ## Sensei (Blue Archive) (sketch)
+> The man next to Wakamo is roughly sketched with minimal detail. He wears a beige suit with a white shirt and a tie, and a small yellow flower is tucked into his suit pocket. His face is obscured by a scribble, but he has a smiling mouth visible. He is tightly bound in red string crossing his torso and arms, holding a paper labeled "Marriage Certificate" with a red stamp and a checkmark. His posture is slightly bent forward, and the strings appear taut, emphasizing the bondage.
+>
+> # 4. Individual Parts
+> 1. Wakamo's large fox ears: black with red tips and white inner fur, upright on her head.
+> 2. Wakamo's long black hair: flowing behind her with a yellow flower ornament on the left side.
+> 3. Wakamo's yellow eyes: large and round with a happy expression.
+> 4. Wakamo's black dress: decorated with gold cloud patterns, short and fitted.
+> 5. Wakamo's red cape: draped over her shoulders, flowing behind her.
+> 6. Wakamo's fluffy fox tail: black with a red tip, wagging energetically with motion lines.
+> 7. Wakamo's small bouquet: yellow flowers held in both hands.
+> 8. Wakamo's small wooden stool: simple design, supporting her seated position.
+> 9. The man's beige suit: classic cut with a white shirt and tie underneath.
+> 10. Yellow flower in the man's suit pocket: matching the bouquet Wakamo holds.
+> 11. Man's face: scribbled out, obscuring identity, but smiling mouth is visible.
+> 12. Red string bondage: tightly wrapped around the man's torso, arms, and hands.
+> 13. Marriage Certificate paper: held by the man, with red stamp and a checkmark.
+> 14. Motion lines around Wakamo's tail: indicating wagging movement.
+> 15. White background: clean and simple, focusing attention on characters.
+> 16. Small inset photo in upper right corner: real-life photo of the same couple tied up in red string, smiling, with wedding attire and bouquet.
+>
+> # 5. Texts on image
+> - On the paper held by the man, the text reads: "Marriage Certificate" in handwritten style.
+> - The paper also features a red stamp and a checkmark symbol.
+>
+> # 6. Background and effects
+> The background is plain white with no additional elements, emphasizing the characters. The main drawing style is cartoonish with simple lines and flat colors. The inset photo in the upper right corner is a real-life photograph showing the same couple tied up in red string bondage, smiling and posing happily in wedding attire with a bouquet. The contrast between the stylized cartoon and the real photo adds a humorous meta-commentary. The image is clear and focused, with no blur or depth of field effects. The tail's wagging motion is conveyed through curved motion lines.
+## Chroma-style
+![chroma](examples/chroma.jpg)
+> ### 1. Regular Summary:
+>
+> This piece is a stunning digital painting, likely done by a professional artist judging by the detail and dynamic composition. The perspective is a dramatic dutch angle, looking up at Rosmontis from Arknights, a catgirl with long grey hair and green eyes, who's wielding a massive, futuristic-looking weapon that looks like a jetpack or some kind of energy cannon.  She's dressed in a short, sleeveless white dress with a black jacket draped over her shoulders, black gloves, and black boots. Her cat ears and tail are clearly visible. The background is a chaotic scene of crumbling buildings and debris, suggesting a post-apocalyptic or battle-ravaged city.  The color palette is mostly dark greys and blacks, contrasted by the bright blues and reds of the energy effects and weapon.  There's a sense of intense action and power; she looks ready to wreck some shit. The overall aesthetic is high fantasy meets sci-fi, with a lot of detail in the character design and the weapon's design. The artist clearly has skill and the work is very high quality.
+>
+> ### 2. Individual Parts:
+>
+> 1. **Rosmontis (Arknights):** The central figure, a catgirl wielding a large weapon. She's positioned slightly off-center, facing towards the viewer.
+> 2. **Large Weapon:** A futuristic-looking energy cannon or jetpack-like device, held by Rosmontis.  It's the most prominent object in the image, with glowing blue and red energy effects.
+> 3. **White Dress:** A short, sleeveless white dress worn by Rosmontis.
+> 4. **Black Jacket:** A black jacket draped over Rosmontis's shoulders.
+> 5. **Black Gloves:** Gloves worn by Rosmontis.
+> 6. **Black Boots:** Boots worn by Rosmontis.
+> 7. **Cat Ears:** Cat ears atop Rosmontis's head.
+> 8. **Cat Tail:** A cat tail extending from Rosmontis's back.
+> 9. **Crumbling Buildings:** The background consists of partially destroyed buildings, suggesting a post-apocalyptic setting.
+> 10. **Debris:** Scattered debris and rubble throughout the background.
+> 11. **Energy Effects:** Bright blue and red energy effects surrounding the weapon and Rosmontis, adding to the dynamic feel.
+> 12. **Ground:** The ground is cracked and broken, consistent with the background's overall state of destruction.
+> 13. **Lighting:** The lighting is dramatic, highlighting Rosmontis and the weapon against the darker background.
+> 14. **Perspective:** The image is shot from a low angle, looking up at Rosmontis, creating a sense of power and scale.
+> 15. **Color Palette:** The color palette is dark, with contrasting bright blues and reds.
+>
+>
+> ### 3. Midjourney-Style Summary:
+>
+> Rosmontis, Arknights character, catgirl, wielding massive energy weapon,  futuristic design, glowing blue and red energy,  short white dress, black jacket,  black gloves and boots,  cat ears and tail visible,  dramatic dutch angle,  crumbling city background,  post-apocalyptic setting,  intense action,  high fantasy/sci-fi aesthetic,  high-quality artwork,  dynamic composition,  dark color palette,  bright color accents,  sense of power and scale,  detailed character design,  detailed weapon design,  professional-level art.
+>
+> ### 4. DeviantArt Commission Request
+>
+> A dynamic action piece featuring Rosmontis from Arknights.  She's a catgirl with long grey hair and green eyes,  wearing a short, sleeveless white dress and a black jacket.  She's wielding a huge, futuristic energy weapon that resembles a jetpack or energy cannon,  with glowing blue and red energy effects. The background should be a chaotic scene of a ruined city, with crumbling buildings and debris.  The overall style should be high fantasy meets sci-fi, with a focus on dynamic action and detail. The perspective should be a dramatic dutch angle, looking up at Rosmontis from a low angle.  The lighting should be dramatic, highlighting Rosmontis and the weapon.  The color palette should be mostly dark greys and blacks, with bright blue and red accents.  High resolution, please.
+## Minimalistic Structured Markdown
+![min_structured_md](examples/min_structured_md.jpg)
+> # 1. Thoughts about characters
+> The girl with long white hair tied in a ponytail, pointy elf ears, and wearing a white and gold-trimmed outfit is clearly Frieren. The boy with short blue hair, blue eyes, and a blue cloak is Himmel from Sousou no Frieren.
+>
+> # 2. Key details
+> The two characters are sharing a gentle kiss, eyes closed, standing close to each other in front of a rustic stone and timber house with a red roof. The outdoor setting is bright with a blue sky, scattered clouds, and mountains in the background. The mood is romantic and serene. Petals float softly around them, adding a delicate, dreamy atmosphere.
+>
+> # 3. Structured description
+> ## General
+> The image depicts a romantic moment outdoors, with two characters kissing in front of a traditional stone and wood cottage with a red-tiled roof. The scene is bathed in warm natural light, with a mountain range and scattered clouds visible in the background. The grassy field and stone wall in the foreground add depth and frame the couple.
+>
+> ## Frieren
+> Frieren stands on the left, eyes closed, leaning in for the kiss. She has long white hair pulled into a ponytail, pointed elf ears, and wears a white dress with gold trim and a matching capelet. Her expression is peaceful and tender.
+>
+> ## Himmel (Sousou no Frieren)
+> Himmel stands on the right, also with eyes closed, reciprocating the kiss. He has short blue hair, a blue cloak with a high collar, and a black undershirt visible beneath. His posture is gentle and affectionate.
+>
+> ## Image effects
+> The image has a soft, slightly grainy texture reminiscent of traditional animation or painted art. The lighting is warm and natural, casting gentle shadows and highlights that emphasize the characters and the background scenery. Petals floating in the air add a subtle dynamic element.
+## Minimalistic Structured Json
+![min_structured_json](examples/min_structured_json.jpg)
+> {
+>
+> "General": "Three girls from Nijisanji pose closely together against a plain white background, each making a distinct hand sign. The image is brightly lit and sharply detailed, focusing on their upper bodies and expressive faces.",
+>
+> "Ange Katrina": "On the left, Ange has short, layered red hair with a small gold triangular hairclip. Her blue eyes and slightly parted lips show a gentle, curious expression with a faint blush. She wears a red jacket with wide white cuffs over a high-collared white shirt adorned with a large blue bow and gold triangular decorations. Her right hand forms a peace sign near her face.",
+>
+> "Lize Helesta": "Centered, Lize has long white hair with blue streaks and blunt bangs, accented by a blue feather hair ornament on the left side. Her purple eyes and confident, slightly smug smile with a blush accompany her raised hands making a double 'rock on' gesture. She wears a white blouse with a high collar, a large blue bow, and frilled blue cuffs under a sleeveless dark vest.",
+>
+> "Inui Toko": "On the right, Inui sports long dark brown hair styled in low twintails with black dog ears topped by a cream maid headdress. Her heterochromia features one yellow and one red eye, and she has a small red flower hairpin. She wears a black kimono with wide sleeves, a white frilled collar, and a blue obi tied at the back. A small black-and-white dog plush or accessory peeks behind her shoulder. She makes a peace sign with her left hand, her mouth open slightly revealing a small fang, and her cheeks flushed."
+>
+> }
+## Json
+![json](examples/json.jpg)
+> {
+>
+>   "character_1": "Hoshimi Miyabi is depicted sleeping upright on a couch, her head resting on the shoulder of another girl. She has long black hair with fox ears, and her attire consists of a white shirt, a black necktie, and a dark grey pleated skirt. Her expression is peaceful, her eyes closed. She's wearing black thigh-high stockings with a lace trim at the top.",
+>
+>   "character_2": "Belle is sitting on the couch, holding a smartphone in her hands. She has short, blue hair and green eyes. She's wearing a dark grey long-sleeved shirt with Japanese text on it, and a black skirt. Her expression is calm and she's looking at the phone. She's wearing black thigh-high stockings.",
+>
+>   "background": "The setting appears to be an indoor space, possibly a living room or common area. The background includes a brick wall, a chalkboard, a small potted cactus on a table, and various other indistinct objects. The couch is orange and has cushions. The overall lighting suggests an evening or nighttime setting.",
+>
+>   "image_effects": "The image has a soft, slightly muted color palette, giving it a calm and relaxed atmosphere. The style is consistent with modern anime artwork.",
+>
+>   "texts": "None",
+>
+>   "atmosphere": "The overall atmosphere is intimate and peaceful. The scene depicts a quiet moment of companionship between the two girls, with a sense of calm and relaxation. The soft lighting and muted colors contribute to this feeling."
+>
+> }
+## Markdown Comic
+![md_comic](examples/md_comic.jpg)
+> # 1. Thoughts about characters
+> The three girls depicted are clearly identifiable based on the given tags and their visual traits. The girl with brown hair, blue eyes, long hair, rabbit ears, and wearing a black jacket with a blue scarf is Amiya (Arknights). The girl with short pinkish hair, blue eyes, fox ears, and a white jacket with a red cross is Sussurro (Arknights). The blonde-haired girl with yellow eyes, fox ears, a blue hairband, and a white and black outfit is Suzuran (Arknights).
+>
+> # 2. Key details
+> - The comic humorously contrasts the characters' appearances at two different ages: "In 1097 years" and "In 1102 years."
+> - The first two frames show the characters standing side by side with neutral expressions.
+> - The third frame zooms in on Sussurro’s face with a serious, slightly annoyed expression against a black background, emphasizing her mood.
+> - The final frame shows Sussurro drinking milk directly from a carton, with a speech bubble saying "too late."
+> - A mysterious hooded figure with a milk carton stands in the background of the last frame, adding to the scene’s humor.
+>
+> # 3. Comic format
+> The comic is a 4koma (four-panel comic) arranged in a 2x2 grid. The characters Amiya, Sussurro, and Suzuran (all Arknights operators) appear in all frames except the third, which is a close-up of Sussurro alone.
+>
+> # 4. Details for each frame
+> ## 4.1 Frame 1 (top-left)
+> Three girls stand side by side against a white background with the text "In 1097 years" above them. From left to right: Amiya, Sussurro, and Suzuran. Amiya has long brown hair, rabbit ears, a black jacket with blue highlights, and a blue scarf. Sussurro has short pinkish hair, fox ears, and wears a white jacket with a red cross on the sleeve and a black choker. Suzuran has long blonde hair, fox ears, a blue hairband, and a white and black dress with a skirt. All three have neutral, slightly serious expressions.
+>
+> ## 4.2 Frame 2 (top-right)
+> The same three characters appear again, but the text above reads "In 1102 years." They look slightly older and more mature. Amiya’s hair is longer, and she wears a sleeveless white top with a blue skirt and her jacket hanging off her shoulders. Sussurro’s expression is unchanged, still serious. Suzuran looks more mature with a frilled collar and a more elaborate outfit, standing with her hands on her hips. The background remains white.
+>
+> ## 4.3 Frame 3 (bottom-left)
+> A close-up of Sussurro’s face fills the frame against a black background. She has a slightly annoyed, sweat-dropping expression with narrowed blue eyes. Her fox ears are prominent, and her pinkish hair with an ahoge (hair antenna) curls upward. She wears her white jacket with the red cross visible on the sleeve.
+>
+> ## 4.4 Frame 4 (bottom-right)
+> Sussurro stands drinking milk directly from a carton, tilting her head back with closed eyes and a satisfied expression. She wears a sleeveless blue dress with a black collar and a black choker. Her fox ears and tail are visible. Next to her is a hooded figure (possibly a doctor or nurse) standing silently with a milk carton in hand. On the table in front of them is a bowl of cereal or some dry food and a small white container. Sussurro’s speech bubble says "too late."
+>
+> # 5. Extra comment
+> The comic uses a simple and clean art style with clear linework and soft colors. The humor revolves around the passage of time and Sussurro’s stubborn or impatient attitude, culminating in her drinking milk in a somewhat casual, relaxed manner despite the serious tone of the previous frames. The mysterious hooded figure adds an extra layer of comedic mystery to the final panel.
+## Json Comic
+![json_comic](examples/json_comic.jpg)
+> {
+>
+>   "comic_format": "Comic of 2 frames",
+>
+>   "1st_frame": "A man with brown hair and stubble is shown in a state of shock. He is wearing a jumpsuit with a logo on the chest and holding a blue stuffed toy. The door he is standing next to is broken, and there is a pregnancy test visible in the foreground. There is a poster on the wall with a character and the text 'Ha Ha Yee'.",
+>
+>   "2nd_frame": "A girl with black hair and black eyes is peeking through the broken door. She is smiling and looking at the man with a playful expression. She has a striped shirt visible under her jacket. The text 'Here's Mommy!' is displayed below her.",
+>
+>   "character_1": "The man appears surprised and anxious, with facial hair and wearing a jumpsuit. He is holding a stuffed toy.",
+>
+>   "character_2": "The girl has a mischievous smile, black hair, and is wearing a striped shirt under a jacket. She seems to be the source of the man's surprise.",
+>
+>   "texts": "The text 'Here's Mommy!' is present in the second frame.",
+>
+>   "meaning": "The comic parodies a scene from 'The Shining,' with a humorous twist involving a pregnancy test and the unexpected appearance of the girl."
+>
+> }
+## Long
+![long](examples/long.jpg)
+> The image features two characters from "Boku no Hero Academia," Todoroki Shouto and Bakugou Katsuki, standing back-to-back against a vibrant green background.
+>
+> Todoroki Shouto is on the left. He has striking multicolored hair, split between white and red, and heterochromia with one blue eye and one grey. His expression is calm, with a closed mouth and a slight smile. He wears a stylish letterman jacket with a floral print, showcasing intricate red and orange flowers. The jacket's sleeves are long, and he has a relaxed posture, with his head slightly tilted.
+>
+> On the right is Bakugou Katsuki, identifiable by his spiky blonde hair and intense red eyes. His expression is more animated, with an open mouth revealing sharp teeth, conveying a sense of determination or frustration. He is also wearing a letterman jacket, but with a different floral design featuring subtle white and grey flowers. His body language is assertive, leaning slightly forward.
+>
+> The background is a simple, solid green that makes the characters stand out prominently. The text "Todoroki & Bakugou" is boldly displayed in white, adding a dynamic element to the composition. The overall atmosphere is energetic and vibrant, capturing the contrasting personalities of the two characters.
+## Short
+![short](examples/short.webp)
+> The image features a chibi-style girl, Artoria Pendragon (Fate), with short, pale blonde hair in a bob, small braids, and striking yellow eyes. Her expression is serious, with a slight blush on her cheeks. She wears a dark purplish-grey dress with a low-cut neckline, revealing cleavage, and long sleeves with white detailing. A dark teal pleated skirt is visible underneath, along with black pantyhose and shoes. She holds a black sword with red markings in her right hand, ready for action. The background is a dark bluish-grey with dynamic red streaks, suggesting motion. The overall atmosphere is intense and dramatic, enhanced by the chibi art style. The artist's signature 'Yui2' is in the upper right corner.
+---
+Images belong to their authors and are used exclusively as examples.

model_BF16.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:061ffdbe2656986c014044941be9e259c76c5f1482cde7b855bfc1e8382afe36
+size 9695791968

pics/example.jpg ADDED Viewed

Git LFS Details

SHA256: 99aa3fbdd340eb048b2dd47afdbc5125f89af9dd27a179e5b787f0d749d255db
Pointer size: 131 Bytes
Size of remote file: 237 kB

pics/nsfw_inaccuracy.png ADDED Viewed

pics/original.jpg ADDED Viewed

Git LFS Details

SHA256: 5c80877a61b67b54c975ef7ec9669e9276d09c7a2c922be431c4b4de0d1850ee
Pointer size: 131 Bytes
Size of remote file: 232 kB

pics/retrieval_from_tags.png ADDED Viewed

pics/splash.jpg ADDED Viewed

Git LFS Details

SHA256: b6fe3bc257532dfaea1598c688067c05d6b52e46f11a5f84d66c85ba9ab9cd8e
Pointer size: 131 Bytes
Size of remote file: 142 kB

pics/zero_shot_guess.png ADDED Viewed

scripts/__pycache__/prompts.cpython-310.pyc ADDED Viewed

Binary file (9.89 kB). View file

scripts/caption_distributed.py ADDED Viewed

	@@ -0,0 +1,319 @@

+import json
+import base64
+import requests
+from pathlib import Path
+from typing import Dict, Any, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+from PIL import Image
+# Import prompt building functions from prompts.py
+from prompts import make_user_query, system_prompt, prompts_b
+# ==================== CONFIGURATION ====================
+# Captioning type (from prompts_b in prompts.py)
+C_TYPE = 'long_thoughts_v2'
+if C_TYPE not in prompts_b:
+    raise(f"{C_TYPE} not found in known formats!")
+# Content options
+USE_NAMES = True
+ADD_TAGS = False
+ADD_CHAR_LIST = False
+ADD_CHARS_TAGS = False
+ADD_CHARS_DESCR = False
+# Grounding requires image folder to contain JSON files with the same name with following format:
+# {
+#    "tags": [], # list of strings with tags
+#    "characters": [], # list of strings with character tags/names
+#    "char_p_tags": {"chars": {"Albedo": "girl", "horns", "black_hair",...}, "skins": {}},
+#    "char_descr": {"chars": {"Albedo": "Albedo is a curvy woman with..."}}, "skins": {}}
+# }
+# Output settings
+SUFFIX = "_lsv2_zs.txt"
+# API settings
+API_URL = "http://127.0.0.1:9001/v1/chat/completions"
+API_KEY = "not-needed"  # vllm typically doesn't require auth
+MODEL = "toriigate-0.5"  # or your local model name
+# Processing settings
+INPUT_FOLDER = "/path/to/files"
+#OUTPUT_FOLDER = "/path/to/output"
+OUTPUT_FOLDER = INPUT_FOLDER
+# Thread pool settings
+NUM_WORKERS = 16
+# Image settings
+MAX_PIXELS = 1.0  # Maximum resolution in megapixels (e.g., 1.0 = 1MP)
+# Request settings
+MAX_TOKENS = 2048
+TEMPERATURE = 0.5
+REQUEST_TIMEOUT = 60  # seconds
+# ==================== END CONFIGURATION ====================
+def encode_image_base64(image_path: str, max_pixels: float = MAX_PIXELS) -> str:
+    """Encode image to base64 string, resizing if necessary."""
+    img = Image.open(image_path)
+    # Check if resizing needed
+    current_pixels = img.width * img.height
+    max_pixels_count = max_pixels * 1_000_000
+    if current_pixels <= max_pixels_count:
+        # No resize needed
+        if img.mode != 'RGB':
+            img = img.convert('RGB')
+        with open(image_path, "rb") as f:
+            return base64.b64encode(f.read()).decode("utf-8")
+    # Calculate new dimensions while preserving aspect ratio
+    scale = (max_pixels_count / current_pixels) ** 0.5
+    new_width = int(img.width * scale)
+    new_height = int(img.height * scale)
+    # Resize with high quality
+    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+    # Encode resized image to base64
+    import io
+    buffer = io.BytesIO()
+    img.save(buffer, format='JPEG', quality=95)
+    return base64.b64encode(buffer.getvalue()).decode("utf-8")
+def load_json_item(json_path: Optional[Path]) -> tuple[Optional[Dict[str, Any]], bool]:
+    """
+    Load JSON metadata from file.
+    Returns (data, was_loaded) tuple. If file missing/None, returns (empty_template, False).
+    """
+    empty_template = {
+        "tags": [],
+        "characters": [],
+        "char_p_tags": {"chars": {}, "skins": {}},
+        "char_descr": {"chars": {}, "skins": {}}
+    }
+    if json_path is None or not json_path.exists():
+        #print(f"[WARN] JSON file not found: {json_path.name if json_path else 'N/A'}")
+        return empty_template, False
+    try:
+        with open(json_path, "r", encoding="utf-8") as f:
+            return json.load(f), True
+    except Exception as e:
+        print(f"[ERROR] Failed to load {json_path}: {e}")
+        return empty_template, False
+def find_image_path(image_name: str, folder: Path) -> Optional[Path]:
+    """Find image file with given name (supports jpg, png, etc.)."""
+    extensions = ['.jpg', '.jpeg', '.png', '.webp', '.bmp']
+    for ext in extensions:
+        path = folder / f"{image_name}{ext}"
+        if path.exists():
+            return path
+    return None
+def find_json_path(image_name: str, folder: Path) -> Optional[Path]:
+    """Find JSON file with given name."""
+    path = folder / f"{image_name}.json"
+    return path if path.exists() else None
+def prepare_messages(item: Dict[str, Any], image_data: str) -> list:
+    """Prepare OpenAI-style messages for the API."""
+    user_query = make_user_query(
+        item,
+        c_type=C_TYPE,
+        use_names=USE_NAMES,
+        add_tags=ADD_TAGS,
+        add_characters=ADD_CHAR_LIST,
+        add_char_tags=ADD_CHARS_TAGS,
+        add_descritpion=ADD_CHARS_DESCR,
+        underscores_replace=False
+    )
+    return [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": system_prompt}]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
+                {"type": "text", "text": user_query}
+            ]
+        }
+    ]
+def call_caption_api(messages: list) -> Optional[str]:
+    """Call the captioning API (no retries)."""
+    payload = {
+        "model": MODEL,
+        "messages": messages,
+        "max_tokens": MAX_TOKENS,
+        "temperature": TEMPERATURE,
+        "stream": False
+    }
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {API_KEY}"
+    }
+    try:
+        response = requests.post(
+            API_URL,
+            headers=headers,
+            json=payload,
+            timeout=REQUEST_TIMEOUT
+        )
+        response.raise_for_status()
+        result = response.json()
+        content = result['choices'][0]['message']['content']
+        return content
+    except requests.exceptions.RequestException as e:
+        print(f"[API ERROR] {e}")
+        return None
+    except (KeyError, IndexError) as e:
+        print(f"[PARSE ERROR] Failed to parse API response: {e}")
+        return None
+    return None
+def process_image(image_path: Path, json_path: Path) -> tuple[Optional[str], bool]:
+    """
+    Process a single image and return (caption, json_loaded) tuple.
+    If JSON missing, uses empty template.
+    """
+    # Load JSON metadata
+    item, json_loaded = load_json_item(json_path)
+    # Encode image (with resizing if needed)
+    try:
+        image_data = encode_image_base64(str(image_path), MAX_PIXELS)
+    except Exception as e:
+        print(f"[ERROR] Failed to encode image {image_path.name}: {e}")
+        return None, json_loaded
+    # Prepare messages
+    messages = prepare_messages(item, image_data)
+    # Call API (no retries)
+    caption = call_caption_api(messages)
+    return caption, json_loaded
+def get_base_name(filename: str) -> str:
+    """Get base name without extension."""
+    return Path(filename).stem
+def main():
+    """Main processing loop with progress bar."""
+    input_dir = Path(INPUT_FOLDER)
+    output_dir = Path(OUTPUT_FOLDER)
+    if not input_dir.exists():
+        print(f"Error: Input folder '{INPUT_FOLDER}' not found")
+        return
+    output_dir.mkdir(exist_ok=True)
+    # Find all image files
+    image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.webp', '*.bmp']
+    image_files = []
+    for ext_pattern in image_extensions:
+        image_files.extend(input_dir.glob(ext_pattern))
+    # Remove duplicates and sort
+    image_files = sorted(set(image_files))
+    if not image_files:
+        print(f"No image files found in '{INPUT_FOLDER}'")
+        return
+    print(f"Found {len(image_files)} images to process")
+    print(f"Configuration:")
+    print(f"  C_TYPE: {C_TYPE}")
+    print(f"  USE_NAMES: {USE_NAMES}")
+    print(f"  ADD_TAGS: {ADD_TAGS}")
+    print(f"  ADD_CHAR_LIST: {ADD_CHAR_LIST}")
+    print(f"  ADD_CHARS_TAGS: {ADD_CHARS_TAGS}")
+    print(f"  ADD_CHARS_DESCR: {ADD_CHARS_DESCR}")
+    print(f"  MODEL: {MODEL}")
+    print(f"  API_URL: {API_URL}")
+    print(f"  NUM_WORKERS: {NUM_WORKERS}")
+    print(f"  MAX_PIXELS: {MAX_PIXELS} MP")
+    print("-" * 50)
+    processed = 0
+    failed = 0
+    json_missing = 0
+    # Prepare tasks
+    tasks = []
+    for image_file in image_files:
+        base_name = get_base_name(image_file.name)
+        json_path = find_json_path(base_name, input_dir)
+        tasks.append((image_file, json_path))
+    # Process with thread pool and progress bar
+    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+        future_to_file = {
+            executor.submit(process_image, img_path, json_path): (img_path, json_path)
+            for img_path, json_path in tasks
+        }
+        for future in tqdm(as_completed(future_to_file), total=len(tasks), desc="Processing", unit="img"):
+            image_path, json_path = future_to_file[future]
+            output_file = output_dir / f"{get_base_name(image_path.name)}{SUFFIX}"
+            try:
+                caption, json_loaded = future.result()
+                if not json_loaded:
+                    json_missing += 1
+                if caption:
+                    # Save caption
+                    try:
+                        with open(output_file, "w", encoding="utf-8") as f:
+                            f.write(caption)
+                        processed += 1
+                    except Exception as e:
+                        tqdm.write(f"[ERROR] Failed to save {output_file.name}: {e}")
+                        failed += 1
+                else:
+                    tqdm.write(f"[ERROR] Captioning failed for {image_path.name}")
+                    failed += 1
+            except Exception as e:
+                tqdm.write(f"[ERROR] Task failed for {image_path.name}: {e}")
+                failed += 1
+    print("=" * 50)
+    print(f"Processing complete:")
+    print(f"  Processed: {processed}")
+    print(f"  JSON missing (warnings): {json_missing}")
+    print(f"  Failed: {failed}")
+    print(f"  Output folder: {OUTPUT_FOLDER}")
+if __name__ == "__main__":
+    main()

scripts/gradio_interface.py ADDED Viewed

	@@ -0,0 +1,635 @@

+import json
+import base64
+import io
+import requests
+from pathlib import Path
+from typing import Dict, Any, Optional, Tuple
+import gradio as gr
+from PIL import Image
+# Import prompt building functions from prompts.py
+from prompts import make_user_query, system_prompt, prompts_b
+# ==================== CONFIGURATION ====================
+# API settings
+# Default chat-completions endpoint; the URL can be edited in the UI.
+API_URL = "http://127.0.0.1:8000/v1/chat/completions"
+# Sent as the Bearer token of every captioning request.
+API_KEY = "not-needed"
+# Image settings
+MAX_PIXELS = 1.0  # Maximum resolution in megapixels (e.g., 4.0 = 4MP)
+# Request settings
+MAX_TOKENS = 4096
+TEMPERATURE = 0.5
+REQUEST_TIMEOUT = 5  # Timeout for the connection check (kept short)
+WORK_TIMEOUT = 300  # Timeout for the captioning request itself
+# Captioning type options (from prompts_b in prompts.py)
+CAPTION_TYPES = list(prompts_b.keys())
+# The first caption type is the UI default; fail at import time if none exist.
+DEFAULT_C_TYPE = CAPTION_TYPES[0] if CAPTION_TYPES else None
+if not DEFAULT_C_TYPE:
+    raise RuntimeError("No caption types available in prompts_b!")
+# ==================== END CONFIGURATION ====================
+def check_api_connection(api_url: str) -> Tuple[str, str]:
+    """
+    Check API connection and return model info.
+    Returns (status_message, model_name).
+    """
+    try:
+        # Try to get models endpoint
+        base_url = api_url.rstrip('/').split('/v1/')[0]
+        models_url = f"{base_url}/v1/models"
+        response = requests.get(models_url, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+        result = response.json()
+        if result and 'data' in result and len(result['data']) > 0:
+            model_name = result['data'][0].get('id', 'Unknown')
+            return "✅ Connected", model_name
+        else:
+            return "⚠️ Connected (no model info)", "Unknown"
+    except requests.exceptions.ConnectionError:
+        return "❌ Connection failed", "N/A"
+    except requests.exceptions.Timeout:
+        return "❌ Timeout", "N/A"
+    except Exception as e:
+        return f"❌ Error: {str(e)[:50]}", "N/A"
+def encode_image_base64(image: Image.Image, max_pixels: float = MAX_PIXELS) -> str:
+    """Encode image to base64 string, resizing if necessary."""
+    img = image
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+    # Check if resizing needed
+    current_pixels = img.width * img.height
+    max_pixels_count = max_pixels * 1_000_000
+    if current_pixels >= max_pixels_count:
+        # Calculate new dimensions while preserving aspect ratio
+        scale = (max_pixels_count / current_pixels) ** 0.5
+        new_width = int(img.width * scale)
+        new_height = int(img.height * scale)
+        # Resize with high quality
+        img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+        # No resize needed
+    # Encode resized image to base64
+    buffer = io.BytesIO()
+    img.save(buffer, format='JPEG', quality=100)
+    return base64.b64encode(buffer.getvalue()).decode("utf-8")
+def call_caption_api(messages: list, api_url: str = API_URL, model_name: str = "toriigate-0.5") -> Optional[str]:
+    """Call the captioning API."""
+    payload = {
+        "model": model_name,
+        "messages": messages,
+        "max_tokens": MAX_TOKENS,
+        "temperature": TEMPERATURE,
+        "stream": False
+    }
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {API_KEY}"
+    }
+    try:
+        response = requests.post(
+            api_url,
+            headers=headers,
+            json=payload,
+            timeout=WORK_TIMEOUT
+        )
+        response.raise_for_status()
+        result = response.json()
+        content = result['choices'][0]['message']['content']
+        return content
+    except requests.exceptions.RequestException as e:
+        return f"API Error: {e}"
+    except (KeyError, IndexError) as e:
+        return f"Parse Error: {e}"
+def empty_template() -> Dict[str, Any]:
+    """Return empty template for missing JSON data."""
+    return {
+        "tags": [],
+        "characters": [],
+        "char_p_tags": {"chars": {}, "skins": {}},
+        "char_descr": {"chars": {}, "skins": {}}
+    }
+def generate_caption(
+    image: Image.Image,
+    api_url: str,
+    model_name: str,
+    c_type: str,
+    use_names: bool,
+    add_tags: bool,
+    add_char_list: bool,
+    add_chars_tags: bool,
+    add_chars_descr: bool,
+    tags_text: str,
+    characters_text: str,
+    char1_name: str,
+    char1_tags: str,
+    char2_name: str,
+    char2_tags: str,
+    char3_name: str,
+    char3_tags: str,
+    char4_name: str,
+    char4_tags: str,
+    char5_name: str,
+    char5_tags: str,
+    char_descr1_name: str,
+    char_descr1_text: str,
+    char_descr2_name: str,
+    char_descr2_text: str,
+    char_descr3_name: str,
+    char_descr3_text: str,
+    char_descr4_name: str,
+    char_descr4_text: str,
+    char_descr5_name: str,
+    char_descr5_text: str
+) -> str:
+    """Generate caption for a single image."""
+    if image is None:
+        return "Please upload an image first."
+    # Build item dict from inputs
+    item = empty_template()
+    # Parse tags
+    if add_tags and tags_text.strip():
+        item["tags"] = [t.strip() for t in tags_text.split(',') if t.strip()]
+    # Parse characters
+    if add_char_list:
+        item["characters"] = [c.strip() for c in characters_text.split(',') if c.strip()]
+    # Auto-populate characters list from char tags/descriptions if not manually specified
+    if add_chars_tags or add_chars_descr:
+        auto_chars = []
+        if add_chars_tags:
+            char_entries = [
+                char1_name, char2_name, char3_name, char4_name, char5_name
+            ]
+            for name in char_entries:
+                if name and name.strip():
+                    auto_chars.append(name.strip())
+        if add_chars_descr:
+            descr_entries = [
+                char_descr1_name, char_descr2_name, char_descr3_name,
+                char_descr4_name, char_descr5_name
+            ]
+            for name in descr_entries:
+                if name and name.strip() and name.strip() not in auto_chars:
+                    auto_chars.append(name.strip())
+        # Only auto-populate if characters list is empty or not manually set
+        if auto_chars and (not add_char_list or not item["characters"]):
+            item["characters"] = auto_chars
+            add_char_list = True
+    # Parse character tags from structured inputs
+    if add_chars_tags:
+        chars_dict = {}
+        char_entries = [
+            (char1_name, char1_tags),
+            (char2_name, char2_tags),
+            (char3_name, char3_tags),
+            (char4_name, char4_tags),
+            (char5_name, char5_tags)
+        ]
+        for name, tags_str in char_entries:
+            if name is None:
+                continue
+            name = name.strip()
+            if name:
+                tags_list = [t.strip() for t in tags_str.split(',') if t.strip()] if tags_str and tags_str.strip() else []
+                chars_dict[name] = tags_list
+        if chars_dict:
+            item["char_p_tags"] = {"chars": chars_dict, "skins": {}}
+    # Parse character descriptions from structured inputs
+    if add_chars_descr:
+        descr_dict = {}
+        descr_entries = [
+            (char_descr1_name, char_descr1_text),
+            (char_descr2_name, char_descr2_text),
+            (char_descr3_name, char_descr3_text),
+            (char_descr4_name, char_descr4_text),
+            (char_descr5_name, char_descr5_text)
+        ]
+        for name, descr in descr_entries:
+            if name is None or descr is None:
+                continue
+            name = name.strip()
+            descr = descr.strip()
+            if name and descr:
+                descr_dict[name] = descr
+        if descr_dict:
+            item["char_descr"] = {"chars": descr_dict, "skins": {}}
+    # Encode image
+    image_data = encode_image_base64(image)
+    # Prepare messages
+    user_query = make_user_query(
+        item,
+        c_type=c_type,
+        use_names=use_names,
+        add_tags=add_tags,
+        add_characters=add_char_list,
+        add_char_tags=add_chars_tags,
+        add_description=add_chars_descr,
+        underscores_replace=False
+    )
+    messages = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": system_prompt}]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
+                {"type": "text", "text": user_query}
+            ]
+        }
+    ]
+    # Call API
+    return call_caption_api(messages, api_url, model_name)
+def create_ui():
+    """Create and return the Gradio interface."""
+    with gr.Blocks(title="ToriiGate Captioner", theme=gr.themes.Soft()) as app:
+        gr.Markdown("# 🖼️ ToriiGate Captioner")
+        # API URL row with status
+        with gr.Row():
+            api_url_input = gr.Textbox(
+                label="API URL",
+                value=API_URL,
+                interactive=True,
+                scale=4
+            )
+            api_status = gr.Textbox(
+                label="Status",
+                value="⏳ Waiting for input...",
+                interactive=False,
+                scale=1
+            )
+            model_name_display = gr.Textbox(
+                label="Model",
+                value="N/A",
+                interactive=False,
+                scale=1
+            )
+        with gr.Row():
+            # Left column - Image input
+            with gr.Column(scale=1):
+                image_input = gr.Image(
+                    label="Upload Image",
+                    type="pil",
+                    height=400
+                )
+                gr.Markdown("### Configuration")
+                # Caption type selector
+                c_type = gr.Dropdown(
+                    choices=CAPTION_TYPES,
+                    value=DEFAULT_C_TYPE,
+                    label="Caption Type",
+                    interactive=True
+                )
+                # Boolean options with conditional text inputs
+                with gr.Group():
+                    use_names = gr.Checkbox(
+                        value=True,
+                        label="Use Names (enable character names)"
+                    )
+                    add_tags = gr.Checkbox(
+                        value=False,
+                        label="Add Tags"
+                    )
+                    tags_text = gr.Textbox(
+                        label="Tags (comma-separated)",
+                        placeholder="e.g., 1girl, blue_hair, school_uniform",
+                        interactive=False
+                    )
+                    add_char_list = gr.Checkbox(
+                        value=False,
+                        label="Add Character List"
+                    )
+                    characters_text = gr.Textbox(
+                        label="Character Names (comma-separated)",
+                        placeholder="e.g., nishizono_mio, hoshimi_miyabi",
+                        interactive=False
+                    )
+                    add_chars_tags = gr.Checkbox(
+                        value=False,
+                        label="Add Character Tags"
+                    )
+                    with gr.Group(visible=False) as char_tags_group:
+                        gr.Markdown("**Add character names and their tags**")
+                        with gr.Accordion("Character 1", open=True):
+                            char1_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g., albedo",
+                                interactive=True
+                            )
+                            char1_tags = gr.Textbox(
+                                label="Tags (comma-separated)",
+                                placeholder="e.g., white_hair, green_eyes, horns",
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 2", open=False):
+                            char2_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g., hoshimi_miyabi",
+                                interactive=True
+                            )
+                            char2_tags = gr.Textbox(
+                                label="Tags (comma-separated)",
+                                placeholder="e.g., blue_hair, fox_ears",
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 3", open=False):
+                            char3_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g., nishizono_mio",
+                                interactive=True
+                            )
+                            char3_tags = gr.Textbox(
+                                label="Tags (comma-separated)",
+                                placeholder="e.g., brown_hair, glasses",
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 4", open=False):
+                            char4_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g.",
+                                interactive=True
+                            )
+                            char4_tags = gr.Textbox(
+                                label="Tags (comma-separated)",
+                                placeholder="e.g.",
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 5", open=False):
+                            char5_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g.",
+                                interactive=True
+                            )
+                            char5_tags = gr.Textbox(
+                                label="Tags (comma-separated)",
+                                placeholder="e.g.",
+                                interactive=True
+                            )
+                        char_tags_clear_btn = gr.Button(
+                            "🗑️ Clear All",
+                            variant="secondary",
+                            size="sm"
+                        )
+                    add_chars_descr = gr.Checkbox(
+                        value=False,
+                        label="Add Character Descriptions"
+                    )
+                    with gr.Group(visible=False) as char_descr_group:
+                        gr.Markdown("**Add character descriptions**")
+                        with gr.Accordion("Character 1", open=True):
+                            char_descr1_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g., albedo",
+                                interactive=True
+                            )
+                            char_descr1_text = gr.Textbox(
+                                label="Description",
+                                placeholder="e.g., Albedo is a curvy woman with...",
+                                lines=3,
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 2", open=False):
+                            char_descr2_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g., hoshimi_miyabi",
+                                interactive=True
+                            )
+                            char_descr2_text = gr.Textbox(
+                                label="Description",
+                                placeholder="e.g., Miyabi is a calm and collected...",
+                                lines=3,
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 3", open=False):
+                            char_descr3_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g., nishizono_mio",
+                                interactive=True
+                            )
+                            char_descr3_text = gr.Textbox(
+                                label="Description",
+                                placeholder="e.g., Mio is a cheerful girl with...",
+                                lines=3,
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 4", open=False):
+                            char_descr4_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g.",
+                                interactive=True
+                            )
+                            char_descr4_text = gr.Textbox(
+                                label="Description",
+                                placeholder="e.g.",
+                                lines=3,
+                                interactive=True
+                            )
+                        with gr.Accordion("Character 5", open=False):
+                            char_descr5_name = gr.Textbox(
+                                label="Name",
+                                placeholder="e.g.",
+                                interactive=True
+                            )
+                            char_descr5_text = gr.Textbox(
+                                label="Description",
+                                placeholder="e.g.",
+                                lines=3,
+                                interactive=True
+                            )
+                        char_descr_clear_btn = gr.Button(
+                            "🗑️ Clear All",
+                            variant="secondary",
+                            size="sm"
+                        )
+                generate_btn = gr.Button("🚀 Generate Caption", variant="primary", size="lg")
+            # Right column - Output
+            with gr.Column(scale=1):
+                output_text = gr.Textbox(
+                    label="Caption Output",
+                    lines=20,
+                    max_lines=50,
+                    interactive=False
+                )
+        # Toggle text inputs based on checkbox state
+        def toggle_input(is_checked: bool, input_component):
+            return gr.update(interactive=is_checked)
+        add_tags.change(
+            lambda x: toggle_input(x, tags_text),
+            inputs=add_tags,
+            outputs=tags_text
+        )
+        add_char_list.change(
+            lambda x: toggle_input(x, characters_text),
+            inputs=add_char_list,
+            outputs=characters_text
+        )
+        add_chars_tags.change(
+            fn=lambda x: gr.update(visible=x),
+            inputs=add_chars_tags,
+            outputs=char_tags_group
+        )
+        add_chars_descr.change(
+            fn=lambda x: gr.update(visible=x),
+            inputs=add_chars_descr,
+            outputs=char_descr_group
+        )
+        # API URL change handler
+        api_url_input.change(
+            fn=check_api_connection,
+            inputs=api_url_input,
+            outputs=[api_status, model_name_display]
+        )
+        # Wire up generate button
+        generate_btn.click(
+            fn=generate_caption,
+            inputs=[
+                image_input,
+                api_url_input,
+                model_name_display,
+                c_type,
+                use_names,
+                add_tags,
+                add_char_list,
+                add_chars_tags,
+                add_chars_descr,
+                tags_text,
+                characters_text,
+                char1_name,
+                char1_tags,
+                char2_name,
+                char2_tags,
+                char3_name,
+                char3_tags,
+                char4_name,
+                char4_tags,
+                char5_name,
+                char5_tags,
+                char_descr1_name,
+                char_descr1_text,
+                char_descr2_name,
+                char_descr2_text,
+                char_descr3_name,
+                char_descr3_text,
+                char_descr4_name,
+                char_descr4_text,
+                char_descr5_name,
+                char_descr5_text
+            ],
+            outputs=output_text
+        )
+        # Clear character tags button handler
+        def clear_char_tags():
+            return "", "", "", "", "", "", "", "", "", ""
+        char_tags_clear_btn.click(
+            fn=clear_char_tags,
+            inputs=[],
+            outputs=[
+                char1_name, char1_tags,
+                char2_name, char2_tags,
+                char3_name, char3_tags,
+                char4_name, char4_tags,
+                char5_name, char5_tags
+            ]
+        )
+        # Clear character descriptions button handler
+        def clear_char_descr():
+            return "", "", "", "", "", "", "", "", "", ""
+        char_descr_clear_btn.click(
+            fn=clear_char_descr,
+            inputs=[],
+            outputs=[
+                char_descr1_name, char_descr1_text,
+                char_descr2_name, char_descr2_text,
+                char_descr3_name, char_descr3_text,
+                char_descr4_name, char_descr4_text,
+                char_descr5_name, char_descr5_text
+            ]
+        )
+    return app
+if __name__ == "__main__":
+    app = create_ui()
+    app.launch(server_name="127.0.0.1", server_port=7860)

scripts/prompts.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import random
# Caption-format instructions, keyed by format name.
#
# make_user_query() inserts the selected value verbatim under the
# "# Captioning format:" heading, so every character below is part of the
# prompt the model receives.
# NOTE(review): the wording (including typos and the "6 parts" header of
# "long_thoughts_v2") is left untouched on purpose - presumably it matches
# the prompts the model was trained on; confirm before editing any text.
prompts_b = {
    # --- Markdown formats with a character-identification section ---
    "long_thoughts_v2": """Your answer must contain 6 parts:
<format>
# 1. Thoughts about characters
You need to think here and compare peoples/creatures that you see on the picture with given popular tags, or descriptions, or your memories for each characters to determine who is who.
# 2. Key details
Here you need to determine key details on comic and list them.
# 3. Long description
Here come up with a long and detailed description of image content. Be creative, mention all detailes you listed above and other important things.
# 4. Detailed description for each character
## Name 1
Detailed and long description for the first character
## Name 2
Same for each one (if present)
</format>
""",
    "long_thoughts": """Your answer must contain 6 parts:
<format>
# 1. Thoughts about characters
You need to think here and compare peoples/creatures that you see on the picture  with given popular tags, or descriptions, or your memories for each characters to determine who is who.
If no characters are listed in input - just write here "No named characters"
# 2. General description
A one-two paragraph summary of the image. Mention all individual parts/objects/characters/positions/interactions/etc.
# 3. Detailed description for each character
## Character name 1 (put here the name if any)
In very detail write about features, poses, look, used objects, interactions, and other things for character on the picture.
## Character name 2 (put here the name if any)
Same for each character.
...
# 4. Individual Parts
List the individual things you see in the image and their relative positions to other parts. Use a numbered list of between 5 and 20 items depending on image complexity.
# 5. Texts on image
Mention every texts that you notice on image, including types (a speech bubble, watermark, banner, etc.) and content.
# 6. Background and effects
Give some info about objects on background, describe the location (if seen). Then mention effects (style, camera angle, clarity/blurrines, effects like depth of field, strange angle/forshortening, etc.)
</format>
""",
    # --- JSON-style caption ---
    "json": """Use json-style caption for given image with following structure:
{"character" : "Description for character or object. Name (if defined), main details, features, position, pose, etc.",
/or in case of multiple
"character_1" : "Description for first"
"character_2" : "Description for second ",
"character_N"...
/or if there are no characters
"main content" : "long and detailed description of main content of image that might be the main focus if characters are missing",
/
"background" : "Detailed descritpion of background and it's content",
"image_effects" : "If there are some visual effects like fisheye distortion, chromatic aberration, glitches, messy drawing or anything else - write about it. If it's just a general anime art - omit this field."
"texts" : "Speech bubbles, bars, marks, signs etc. with texts if present, else None",
"atmosphere" : "...",
}
In special cases you can add extra keys.
""",
    # --- Free-form natural text ---
    "long": """Make a caption for given image with natural text. Use 2 to 5 paragraphs. Make your description long and vivid, mentioning all the details.
""",
    # --- Compact structured markdown ---
    "min_structured_md": """Your answer must contain 3 parts:
<format>
# 1. Thoughts about characters
You need to think here and compare peoples/creatures that you see on the picture  with given popular tags, or descriptions, or your memories for each characters to determine who is who.
If no characters are listed in input - just write here "No named characters"
# 2. Key details
Here you need to write about the key details on image, prefere using regular text.
# 3. Structured description
## General
Write about general composition, content of image, background and all things that are not related to characters directly.
## Character name 1 (put here the name if any)
Write about datails and content related to specific character, including features, poses, look, used objects, interactions, and other things.
## Character name 2 (put here the name if any)
Same for each character.
## Image effects
Mention image effect, style, camera angle
</format>
In general stick to shorter descriptions.
""",
    # --- Comic formats (JSON and markdown) ---
    "json_comic": """Use json-style caption to describe to comin, stick to following structure:
{
"comic_format": "menation the format, for example Comic of N frames",
"1st_frame": "Main description of the content for fist frame",
"2nd_frame": "Same for the second",
...
"Nth_ftame": "...",
"character_1": "Describe the characters in comic",
...
"character_N": "Separate description for each",
"meaning": "Try to guess general mood, vibe and meaning of the comic"
}
""",
    "md_comic": """Use markdown format to describe to comic, 5 parts are recommended:
<format>
# 1. Thoughts about characters
You need to think here and compare peoples/creatures that you see on the picture with given popular tags, or descriptions, or your memories for each characters to determine who is who.
# 2. Key details
Here you need to determine key details on comic and list them.
# 3. Comic format
In this section come up with the description of comic format, how many pages there are, horisontal/vertical orientation and other things. Optionally you can list main characters here.
# 4. Details for each frame
## 4.1 Frame 1 (position)
Description for each frame, includding characters, objects, interactions, texts/speech bubbles and other things. Be detailed but not overdoo.
## 4.2 Frame 2 (position)
Same for each frame.
...
# 5. Extra comment
Here you should write general desciption and some other info about the image.
</format>
""",
    # --- Compact JSON (note: this template starts with a blank line) ---
    "min_structured_json": """
Use json-style caption for given image with following structure:
{"General" : "Here you need to come up with general/common information about picture, overall composition. Stick to shorter phrases and tags instead of long purple prose. Avoid bullets and markdown, write in plain text.",
"character_1 (put here the name if any)" : "Description of first character."
"character_2 (if present" : "Description for second ",
"character_N"
...
"image_effects" : "Mention here effects on image if there are any distinct."
"texts" : "Speech bubbles, bars, marks, signs etc. with texts if present, else None",
"watermarks" : "If present",
}
Prefere shorter description and tags.
""",
    # --- Four-part summary format ---
    "chroma-style": """Your task is to describe the picture in very detail using a structure of 4 parts.
### 1. Regular Summary:
[A one-paragraph summary of the image. The paragraph should mention all individual parts/things/characters/etc.]
### 2. Individual Parts:
[List the individual things you see in the image and their relative positions to other parts. Use a numbered list of between 5 and 30 items depending on image complexity.]
### 3. Midjourney-Style Summary:
[A summary that has higher concept density by using comma-separated partial sentences instead of proper sentence structure.]
### 4. DeviantArt Commission Request
[Write a description as if you're commissioning this *exact* image via someone who is currently taking requests.]
""",
    # --- Short caption ---
    "short": """The caption for image should be quite short without long purple prose and slop. Cover main objects and details.
""",
}
# Per-format boolean flag, keyed like prompts_b.
# The True entries are exactly the formats whose template opens with a
# "# 1. Thoughts about characters" section.
# NOTE(review): nothing in this part of the file reads this table; how the
# callers use the flag should be confirmed at the call sites.
prompts_names_only = {
    "long_thoughts_v2": True,
    "long_thoughts": True,
    "json": False,
    "long": False,
    "json_comic": False,
    "md_comic": True,
    "min_structured_md": True,
    "min_structured_json": False,
    "chroma-style": False,
    "short": False,
}
def _join_tags(tags, underscores_replace):
    """Join tags as ``a, b`` with underscores turned into spaces, or as raw ``a b``."""
    if not underscores_replace:
        return ' '.join(tags)
    # Tags of three characters or fewer keep their underscores
    # (presumably emoticon-like tags - confirm).
    return ', '.join(t.replace('_', ' ') if len(t) > 3 else t for t in tags)


def _display_name(name, underscores_replace):
    """Return a character name, with underscores turned into spaces on request."""
    return name.replace('_', ' ') if underscores_replace else name


def _character_traits(item, key):
    """Return the ``(chars, skins)`` mappings stored under ``item[key]``; empty dicts if absent."""
    traits = item.get(key) or {}
    return traits.get('chars') or {}, traits.get('skins') or {}


def make_user_query(item, c_type, use_names, add_tags, add_characters, add_char_tags, add_description, underscores_replace = False):
    """Build the text part of the user message for one image.

    Args:
        item: Mapping with optional keys ``'tags'`` (list of booru tags),
            ``'characters'`` (list of character names/tags), ``'char_p_tags'``
            and ``'char_descr'`` (each a dict with ``'chars'`` and ``'skins'``
            sub-dicts keyed by character name). Missing keys are treated as
            empty. ``item`` is never modified.
        c_type: Key into ``prompts_b`` selecting the caption format.
        use_names: When false, the model is told not to guess character names
            and all character options below are ignored.
        add_tags: Include the (shuffled) booru tags.
        add_characters: Include the character list; when false (and
            ``use_names`` is true) the model is asked to recognise characters
            on its own.
        add_char_tags: Include popular tags per character / skin.
        add_description: Include text descriptions per character / skin.
            Only used when ``add_char_tags`` is false.
        underscores_replace: Replace ``_`` with spaces and separate tags with
            ``', '`` instead of a single space.

    Returns:
        The assembled prompt string.

    Raises:
        KeyError: If ``c_type`` is not a key of ``prompts_b``.
    """
    # Shuffle a copy: shuffling item['tags'] in place would silently reorder
    # the caller's data on every call.
    tags = list(item.get('tags') or [])
    random.shuffle(tags)

    parts = ['# Captioning format:\n', prompts_b[c_type], '\n']
    if add_tags:
        parts.append(f"# Booru tags for the image\n[{_join_tags(tags, underscores_replace)}]\n\n")

    if not use_names:
        parts.append("# Characters on picture:\nAvoid to guess names for characters.\n")
        return ''.join(parts)

    # Character names
    if add_characters:
        chars_tags = item.get('characters') or []
        if underscores_replace:
            # Names get the same underscore treatment as in the traits
            # section below, so both sections spell a character identically.
            chars_string = ', '.join(_display_name(c, True) for c in chars_tags)
        else:
            chars_string = ' '.join(chars_tags)
        parts.append(
            "# Characters on picture:\nHere are names/tags for characters from the picture, "
            f"make sure to use them: [{chars_string}].\n\n"
        )

        tag_chars, tag_skins = _character_traits(item, 'char_p_tags')
        descr_chars, descr_skins = _character_traits(item, 'char_descr')
        # NOTE(review): the section is gated on popular tags being present even
        # when only descriptions are requested - kept as in the original;
        # confirm whether descriptions should be gated on 'char_descr' instead.
        if tag_chars and (add_char_tags or add_description):
            parts.append("# Known traits for characters\n")
            if add_char_tags:
                parts.append("Here are popular tags for each characters on picture:\n")
                for c_name, c_tags in tag_chars.items():
                    parts.append(
                        f"{_display_name(c_name, underscores_replace)}: "
                        f"[{_join_tags(c_tags, underscores_replace)}]\n"
                    )
                if tag_skins:
                    parts.append("Extra tags for characters skins:\n")
                    for c_name, c_tags in tag_skins.items():
                        parts.append(
                            f"{_display_name(c_name, underscores_replace)}: "
                            f"[{_join_tags(c_tags, underscores_replace)}]\n"
                        )
            else:
                # Reached only with add_description set: tags take precedence
                # when both flags are given.
                parts.append("Here are general descriptions for each characters on the picture:\n")
                for c_name, c_descr in descr_chars.items():
                    parts.append(f"## {_display_name(c_name, underscores_replace)}\n{c_descr}\n\n")
                if descr_skins:
                    parts.append("Here are also descriptions for specific skin of characters:\n")
                    for c_name, c_descr in descr_skins.items():
                        parts.append(f"## {_display_name(c_name, underscores_replace)}\n{c_descr}\n\n")
    else:
        parts.append("# Characters on picture:\nTry to recognize the characters in the picture and use their names.\n")
    parts.append('\n')
    return ''.join(parts)
# System message text shared by the captioning scripts.
system_prompt = (
    "You are image captioning expert. "
    "Describe user's picture according to requested format and instructions."
)

scripts/transformers_ver.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from prompts import make_user_query, system_prompt
+from transformers import (
+    Qwen3_5ForConditionalGeneration,
+    AutoProcessor,
+)
+from PIL import Image
+import torch
# Checkpoint location and the device everything is placed on.
MODEL_PATH = "M:/ai/qwen3.5_mm_trainer/Qwen3.5-4B-Base_k2"
DEVICE = 'cuda'

# Load the model in bfloat16 with the "sdpa" attention implementation.
model = Qwen3_5ForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    device_map=DEVICE,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
)

# Matching processor, loaded from the same checkpoint directory.
processor = AutoProcessor.from_pretrained(
    MODEL_PATH,
    padding_side="right",
    min_pixels=256 * 32 * 32,
)

# Arguments forwarded to make_user_query() by prepare_messages().
C_TYPE = 'long_thoughts_v2'  # caption format name
USE_NAMES = True
ADD_TAGS = False
ADD_CHAR_LIST = False
ADD_CHARS_TAGS = False
ADD_CHARS_DESCR = False
def prepare_messages(item):
    """Return the chat messages (system turn + user turn) for one item.

    The user turn holds an image placeholder followed by the text built by
    ``make_user_query`` from ``item`` and the module-level prompt flags.
    """
    query_text = make_user_query(
        item, C_TYPE, USE_NAMES, ADD_TAGS, ADD_CHAR_LIST, ADD_CHARS_TAGS, ADD_CHARS_DESCR
    )
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_prompt}],
    }
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": query_text}],
    }
    return [system_turn, user_turn]
# Caption a single image: build the chat prompt, run generation, print the result.
img = Image.open('test_image.png')
images = [img]

# An empty item: make_user_query() falls back to its defaults for every field.
msgs = prepare_messages({})
texts = [
    processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
]

inputs = processor(text=texts, images=images, return_tensors="pt")
inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}

with torch.no_grad():
    generate_ids = model.generate(**inputs, max_new_tokens=1024)

# Drop the prompt tokens so only the newly generated continuation is decoded.
prompt_length = inputs["input_ids"].shape[1]
generated_texts = processor.batch_decode(
    generate_ids[:, prompt_length:],
    skip_special_tokens=True,
)
print(generated_texts[0])