"""Gradio demo that turns an uploaded image into captions, prompts and tags.

The app loads a BLIP captioning model and a CLIP model at import time,
exposes a main "generate" panel plus two helper tabs ("Prompt Tools" and
"Training Data"), and launches the Gradio server when the module runs.
Several features are placeholders that return fixed strings; they are
marked as such below.
"""

import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel
from torchvision import transforms  # noqa: F401  (unused here; kept as in the original file)

# Device selection: use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BLIP models (image captioning).
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# CLIP models (image embeddings).
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Messages returned to the output textbox instead of a generated text.
INVALID_SELECTION_MESSAGE = "[Keine gültige Auswahl getroffen]"
NO_IMAGE_MESSAGE = "[Kein Bild hochgeladen]"

# Fixed prompt and per-detail-level suffixes used by generate_clip_prompt.
_BASE_PROMPT = "a woman in a bodysuit on wooden floor"
_HIGH_DETAIL_SUFFIX = ", cinematic lighting, ultra detailed, HDR"
_DETAIL_SUFFIXES = {
    3: ", moody atmosphere",
    2: ", minimal shadows",
}

# Canned captions served by the "Training Data" tab, keyed by caption mode.
_SAMPLE_CAPTIONS = {
    "Basic Caption": "A woman posing for a photo",
    "Detailed Description": "A woman in a futuristic city wearing a sleek bodysuit.",
    "Booru Style": "1girl, bodysuit, city, night",
    "Natural Language": "She stands still beneath neon lights, calm yet focused.",
}


def _load_rgb(image_path):
    """Open the image at *image_path* and return an in-memory RGB copy.

    ``Image.open`` keeps the underlying file open until the image is closed,
    so the file is opened in a ``with`` block and only the converted copy
    (which no longer depends on the file) is returned.
    """
    with Image.open(image_path) as source:
        return source.convert("RGB")


# Placeholder for DeepDanbooru.
def danbooru_tagging(image):
    """Return a fixed tag string; *image* is ignored until tagging is implemented."""
    return "1girl, bodysuit, sitting, wooden floor, solo"


def generate_blip_caption(image):
    """Return the BLIP caption for the image file at path *image*.

    Raises whatever ``PIL.Image.open`` raises when the path cannot be read
    as an image.
    """
    raw_image = _load_rgb(image)
    inputs = blip_processor(raw_image, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs)
    return blip_processor.decode(out[0], skip_special_tokens=True)


def generate_clip_prompt(image, detail_level):
    """Return a prompt string whose suffix depends on *detail_level*.

    Levels 4 and above get the most elaborate suffix, 3 and 2 get their own
    suffix, and anything else returns the bare base prompt.
    """
    raw_image = _load_rgb(image)
    inputs = clip_processor(images=raw_image, return_tensors="pt").to(device)
    # NOTE(review): the image embedding is computed but does not influence the
    # returned prompt yet. The call is kept so unreadable files still fail the
    # same way; no_grad avoids building an autograd graph for inference.
    with torch.no_grad():
        clip_model.get_image_features(**inputs)

    if detail_level >= 4:
        return _BASE_PROMPT + _HIGH_DETAIL_SUFFIX
    return _BASE_PROMPT + _DETAIL_SUFFIXES.get(detail_level, "")


def get_output(image, output_type, style, detail_level, tags, model_choice):
    """Produce the text for the main output box from the current UI selection.

    Args:
        image: File path of the uploaded image, or ``None`` when nothing has
            been uploaded.
        output_type: Selected entry of the "Output Type" dropdown.
        style: Selected style; currently not used.
        detail_level: Slider value forwarded to ``generate_clip_prompt``.
        tags: Custom tag text; currently not used.
        model_choice: Selected entry of the "AI Model" radio group.

    Returns:
        The generated text, or a bracketed message when no image was
        uploaded for a model that reads it or the selection is not handled.
    """
    # BLIP and CLIP open the file; without this guard a click with no upload
    # fails inside Image.open(None) instead of telling the user what is wrong.
    if image is None and model_choice in ("BLIP", "CLIP"):
        return NO_IMAGE_MESSAGE

    if model_choice == "BLIP":
        if output_type == "Detailed Description":
            return generate_blip_caption(image)
        if output_type == "Short Caption":
            return generate_blip_caption(image).split(",")[0]
        if output_type == "Model Training Data":
            return generate_blip_caption(image).lower().replace(" ", "_")
        if output_type == "AI Prompt":
            return generate_clip_prompt(image, detail_level)
        if output_type == "Tag List":
            # "Tag List" is offered in the dropdown but had no branch, so it
            # used to end in the invalid-selection message. Route it to the
            # tagger. NOTE(review): confirm this is the intended source.
            return danbooru_tagging(image)
    elif model_choice == "CLIP":
        return generate_clip_prompt(image, detail_level)
    elif model_choice == "DeepDanbooru":
        return danbooru_tagging(image)
    elif model_choice == "NSFW Detector":
        return "(Simulierter NSFW-Klassifikator: Ergebnis nicht implementiert)"

    return INVALID_SELECTION_MESSAGE


with gr.Blocks(css="style.css") as app:
    gr.Markdown("# NSFW Image to Text Generator ✨")

    with gr.Row():
        # Left column: inputs and the generate button.
        with gr.Column():
            img = gr.Image(type="filepath", label="Upload Image")
            output_type = gr.Dropdown(
                [
                    "Detailed Description",
                    "Short Caption",
                    "Tag List",
                    "AI Prompt",
                    "Model Training Data",
                ],
                label="Output Type",
                value="Detailed Description",
            )
            style = gr.Dropdown(
                [
                    "Neutral",
                    "Erotic",
                    "Artistic",
                    "Technical",
                    "Literary",
                    "BDSM",
                    "Fetish",
                ],
                label="Style",
                value="Neutral",
            )
            detail = gr.Slider(1, 5, step=1, value=3, label="Detail Level")
            tags = gr.Textbox(label="Custom Tags (comma separated)")
            model_choice = gr.Radio(
                ["CLIP", "BLIP", "DeepDanbooru", "NSFW Detector"],
                label="AI Model",
                value="BLIP",
            )
            btn_generate = gr.Button("Generate Text")

        # Right column: generated text and post-processing buttons.
        with gr.Column():
            output = gr.Textbox(label="Generated Output", lines=8)
            # These three buttons have no click handlers attached.
            gr.Button("Enhance")
            gr.Button("Shorten")
            gr.Button("Rewrite")

    btn_generate.click(
        get_output,
        inputs=[img, output_type, style, detail, tags, model_choice],
        outputs=output,
    )

    with gr.Tab("Prompt Tools"):
        prompt_input = gr.Textbox(label="Prompt Builder")
        btn_optimize = gr.Button("Optimize Prompt")
        btn_random = gr.Button("Randomize")
        optimized_output = gr.Textbox(label="Optimized Prompt")

        btn_optimize.click(
            lambda p: p + ", ultra detailed",
            inputs=prompt_input,
            outputs=optimized_output,
        )
        # Placeholder: always yields the same prompt.
        btn_random.click(lambda: "a cyberpunk alley at night", outputs=optimized_output)

    with gr.Tab("Training Data"):
        btn_tags = gr.Button("Generate Tags")
        tags_out = gr.Textbox(label="Training Tags")
        # Placeholder: always yields the same tag string.
        btn_tags.click(lambda: "1girl, solo, black bodysuit, sitting", outputs=tags_out)

        caption_mode = gr.Dropdown(
            ["Basic Caption", "Detailed Description", "Booru Style", "Natural Language"],
            label="Caption Generation",
        )
        btn_caption = gr.Button("Generate Caption")
        caption_out = gr.Textbox(label="Training Caption")
        # An unset or unknown mode yields an empty caption.
        btn_caption.click(
            lambda mode: _SAMPLE_CAPTIONS.get(mode, ""),
            inputs=caption_mode,
            outputs=caption_out,
        )

        trigger_word = gr.Textbox(label="Trigger Word")
        trigger_class = gr.Textbox(label="Class")
        btn_lora = gr.Button("Prepare LoRA Training Data")
        lora_out = gr.Textbox(label="LoRA Output")
        btn_lora.click(
            lambda t, c: f"LoRA: {t}, class: {c}",
            inputs=[trigger_word, trigger_class],
            outputs=lora_out,
        )

app.launch()