File size: 3,464 Bytes
a650c88
 
 
 
 
 
ba7578e
a650c88
 
 
 
 
 
 
 
ba7578e
 
a650c88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba7578e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a650c88
ba7578e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import subprocess
import sys

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Best-effort install of flash-attn at startup.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is set for the pip process only. It is
# merged into a copy of the current environment: passing a bare dict as `env`
# would replace the whole environment and drop PATH, HOME, proxy settings, etc.
# pip is invoked through the running interpreter so the package is installed
# for the same Python that executes this script, and the command is passed as
# an argument list so no shell is involved.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # as before, a non-zero pip exit status is not treated as fatal
)

# Initialize Florence model: pick the GPU when torch reports one, otherwise CPU,
# then load the model (moved to that device, in eval mode) and its processor.
device = "cuda" if torch.cuda.is_available() else "cpu"
florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)

def generate_caption(image):
    """Generate a detailed caption for an image with the Florence-2 model.

    Args:
        image: A PIL image, an array accepted by ``Image.fromarray``, or
            ``None``.

    Returns:
        The value stored under the ``<MORE_DETAILED_CAPTION>`` key of the
        post-processed generation, or ``""`` when ``image`` is ``None``.
    """
    if image is None:
        return ""
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    # Normalize to RGB so grayscale, palette and alpha-channel inputs reach the
    # processor in the same three-channel layout as ordinary photos.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Single source of truth for the task token: it is used as the prompt, as
    # the post-processing task, and as the key of the parsed result.
    task = "<MORE_DETAILED_CAPTION>"

    inputs = florence_processor(text=task, images=image, return_tensors="pt").to(device)
    generated_ids = florence_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Special tokens are kept in the decoded text; it is handed to
    # post_process_generation together with the task and the image size.
    generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = florence_processor.post_process_generation(
        generated_text,
        task=task,
        image_size=(image.width, image.height),
    )
    return parsed_answer[task]

# Custom stylesheet passed to gr.Blocks. The selectors below (.container,
# .header, .generate-btn, .output-box) are class names referenced by the
# layout, either through `elem_classes` or in the header HTML.
# NOTE(review): the header text (#ffffff) and output box (#1a1a1a) colors are
# hard-coded; presumably tuned for a dark background — confirm readability
# when the page renders in light mode.
css = """
.container { max-width: 900px; margin: auto; padding-top: 2rem; }
.header { text-align: center; margin-bottom: 2rem; }
.header h1 { font-size: 2.5rem; font-weight: 800; color: #ffffff; margin-bottom: 0.5rem; }
.header p { color: #a0a0a0; font-size: 1.1rem; }
.generate-btn { 
    background: linear-gradient(90deg, #4776E6 0%, #8E54E9 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: bold !important;
}
.generate-btn:hover { transform: scale(1.02); transition: 0.2s; }
.output-box { border-radius: 10px !important; background-color: #1a1a1a !important; }
"""

# Page assembly: a centered column holding the header, a two-column row
# (upload + button on the left, generated prompt on the right), an examples
# widget, and the click handler that connects the button to generate_caption.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="indigo"),
    css=css,
) as demo:
    with gr.Column(elem_classes="container"):
        # Title and tagline, wrapped in a div carrying the "header" class.
        gr.HTML(
            value="""
            <div class="header">
                <h1>✨ Image to Prompt Studio</h1>
                <p>Upload an image to generate a highly detailed AI prompt using Florence-2.</p>
            </div>
            """,
        )

        with gr.Row():
            # Left side: image upload (delivered to the callback as a PIL image)
            # and the button that starts captioning.
            with gr.Column(scale=1):
                input_img = gr.Image(type="pil", label="Upload Image")
                submit_btn = gr.Button(
                    value="Generate Prompt",
                    elem_classes="generate-btn",
                    variant="primary",
                )

            # Right side: text area that receives the generated prompt.
            with gr.Column(scale=1):
                output_text = gr.Textbox(
                    elem_classes="output-box",
                    show_copy_button=True,
                    placeholder="Your prompt will appear here...",
                    lines=8,
                    label="Generated Prompt",
                )

        # Optional example images; the list is empty until image paths are added.
        gr.Examples(inputs=input_img, examples=[])

    # Event wiring: a button click sends the uploaded image to generate_caption
    # and writes the returned text into the prompt box.
    submit_btn.click(generate_caption, inputs=[input_img], outputs=[output_text])

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)