import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BLIP for caption generation
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# CLIP for image-text similarity
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def danbooru_tagging(image):
    # Placeholder: returns a fixed tag string; no real DeepDanbooru model is loaded.
    return "1girl, bodysuit, sitting, wooden floor, solo"


def generate_blip_caption(image):
    raw_image = Image.open(image).convert("RGB")
    inputs = blip_processor(raw_image, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs)
    caption = blip_processor.decode(out[0], skip_special_tokens=True)
    return caption
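
# Optional tweak (assumption, not in the original): blip_model.generate() accepts
# the standard transformers generation kwargs, so caption length can be tuned, e.g.:
#     out = blip_model.generate(**inputs, max_new_tokens=60)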


def generate_clip_prompt(image, detail_level):
    raw_image = Image.open(image).convert("RGB")
    inputs = clip_processor(images=raw_image, return_tensors="pt").to(device)
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)
    # Placeholder: the features above are computed but not yet used; the base
    # prompt is hardcoded.
    base_prompt = "a woman in a bodysuit on wooden floor"
    if detail_level >= 4:
        return base_prompt + ", cinematic lighting, ultra detailed, HDR"
    elif detail_level == 3:
        return base_prompt + ", moody atmosphere"
    elif detail_level == 2:
        return base_prompt + ", minimal shadows"
    else:
        return base_prompt
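

# A hedged sketch of how the CLIP image features could actually choose the base
# prompt instead of hardcoding it: embed candidate descriptions with CLIP's text
# encoder and return the closest one by cosine similarity. The helper name and
# candidate list are assumptions for illustration.
def pick_base_prompt(image_features, candidates=("a woman in a bodysuit on wooden floor",
                                                 "a portrait photo of a person",
                                                 "an empty interior scene")):
    text_inputs = clip_processor(text=list(candidates), return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    sims = (image_features @ text_features.T)[0]  # cosine similarity per candidate
    return candidates[int(sims.argmax())]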


def get_output(image, output_type, style, detail_level, tags, model_choice):
    # Note: `style` and `tags` come from the UI but are not used yet.
    if model_choice == "BLIP":
        if output_type == "Detailed Description":
            return generate_blip_caption(image)
        elif output_type == "Short Caption":
            return generate_blip_caption(image).split(",")[0]
        elif output_type == "Model Training Data":
            return generate_blip_caption(image).lower().replace(" ", "_")
        elif output_type == "AI Prompt":
            return generate_clip_prompt(image, detail_level)
        elif output_type == "Tag List":
            # BLIP has no tagger; fall back to the tagging placeholder.
            return danbooru_tagging(image)
    elif model_choice == "CLIP":
        return generate_clip_prompt(image, detail_level)
    elif model_choice == "DeepDanbooru":
        return danbooru_tagging(image)
    elif model_choice == "NSFW Detector":
        return "(Simulated NSFW classifier: result not implemented)"
    return "[No valid selection made]"


with gr.Blocks(css="style.css") as app:
    gr.Markdown("# NSFW Image to Text Generator ✨")

    with gr.Row():
        with gr.Column():
            img = gr.Image(type="filepath", label="Upload Image")
            output_type = gr.Dropdown(
                ["Detailed Description", "Short Caption", "Tag List", "AI Prompt", "Model Training Data"],
                label="Output Type", value="Detailed Description")
            style = gr.Dropdown(
                ["Neutral", "Erotic", "Artistic", "Technical", "Literary", "BDSM", "Fetish"],
                label="Style", value="Neutral")
            detail = gr.Slider(1, 5, step=1, value=3, label="Detail Level")
            tags = gr.Textbox(label="Custom Tags (comma separated)")
            model_choice = gr.Radio(
                ["CLIP", "BLIP", "DeepDanbooru", "NSFW Detector"],
                label="AI Model", value="BLIP")
            btn_generate = gr.Button("Generate Text")

        with gr.Column():
            output = gr.Textbox(label="Generated Output", lines=8)
            # Placeholder buttons; handlers are wired below.
            btn_enhance = gr.Button("Enhance")
            btn_shorten = gr.Button("Shorten")
            btn_rewrite = gr.Button("Rewrite")

    btn_generate.click(get_output,
                       inputs=[img, output_type, style, detail, tags, model_choice],
                       outputs=output)
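
    # A minimal sketch wiring the three placeholder buttons, which had no
    # handlers in the original. The string transforms are assumed stand-ins,
    # not a defined feature set.
    btn_enhance.click(lambda t: t + ", ultra detailed, high quality" if t else t,
                      inputs=output, outputs=output)
    btn_shorten.click(lambda t: t.split(",")[0] if t else t,
                      inputs=output, outputs=output)
    btn_rewrite.click(lambda t: ", ".join(reversed(t.split(", "))) if t else t,
                      inputs=output, outputs=output)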

    with gr.Tab("Prompt Tools"):
        prompt_input = gr.Textbox(label="Prompt Builder")
        btn_optimize = gr.Button("Optimize Prompt")
        btn_random = gr.Button("Randomize")
        optimized_output = gr.Textbox(label="Optimized Prompt")
        btn_optimize.click(lambda p: p + ", ultra detailed", inputs=prompt_input, outputs=optimized_output)
        btn_random.click(lambda: "a cyberpunk alley at night", outputs=optimized_output)

    with gr.Tab("Training Data"):
        btn_tags = gr.Button("Generate Tags")
        tags_out = gr.Textbox(label="Training Tags")
        btn_tags.click(lambda: "1girl, solo, black bodysuit, sitting", outputs=tags_out)

        caption_mode = gr.Dropdown(
            ["Basic Caption", "Detailed Description", "Booru Style", "Natural Language"],
            label="Caption Generation")
        btn_caption = gr.Button("Generate Caption")
        caption_out = gr.Textbox(label="Training Caption")
        btn_caption.click(lambda mode: {
            "Basic Caption": "A woman posing for a photo",
            "Detailed Description": "A woman in a futuristic city wearing a sleek bodysuit.",
            "Booru Style": "1girl, bodysuit, city, night",
            "Natural Language": "She stands still beneath neon lights, calm yet focused."
        }.get(mode, ""), inputs=caption_mode, outputs=caption_out)

        trigger_word = gr.Textbox(label="Trigger Word")
        trigger_class = gr.Textbox(label="Class")
        btn_lora = gr.Button("Prepare LoRA Training Data")
        lora_out = gr.Textbox(label="LoRA Output")
        btn_lora.click(lambda t, c: f"LoRA: {t}, class: {c}", inputs=[trigger_word, trigger_class], outputs=lora_out)
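
        # Note (assumption): DreamBooth/LoRA-style pipelines typically expect one
        # caption per training image, often "photo of <trigger> <class>"; the
        # f-string above only displays the two fields and writes nothing to disk.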


app.launch()