File size: 2,989 Bytes
e975d72
 
070ac5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# app.py

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# 1. Device setup: prefer CUDA when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Load the Florence-2 model and its processor.
# trust_remote_code=True is required: Florence-2 ships custom modeling code
# in its Hub repo rather than being a native transformers architecture.
# eval() disables dropout/batch-norm training behavior for inference.
florence_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base",
    trust_remote_code=True
).to(device).eval()
florence_processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base",
    trust_remote_code=True
)

# 3. Image caption generation function
def generate_caption(image):
    """Generate a detailed caption for *image* with Florence-2.

    Args:
        image: A PIL ``Image`` or a numpy array (``gr.Image`` may deliver
            either, depending on its ``type`` setting).

    Returns:
        str: The caption text parsed from the model output.
    """
    # Normalize numpy arrays (Gradio's default image payload) to PIL.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    inputs = florence_processor(
        text="<MORE_DETAILED_CAPTION>",
        images=image,
        return_tensors="pt"
    ).to(device)
    # inference_mode() skips autograd bookkeeping entirely — lower memory
    # use and faster generation; the original tracked gradients needlessly.
    with torch.inference_mode():
        generated_ids = florence_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
    # Keep special tokens: post_process_generation relies on the task
    # marker tokens still being present in the decoded string.
    generated_text = florence_processor.batch_decode(
        generated_ids,
        skip_special_tokens=False
    )[0]
    parsed_answer = florence_processor.post_process_generation(
        generated_text,
        task="<MORE_DETAILED_CAPTION>",
        image_size=(image.width, image.height)
    )
    prompt = parsed_answer["<MORE_DETAILED_CAPTION>"]
    print("Generation completed!:", prompt)
    return prompt

# 4. Gradio ์ธํ„ฐํŽ˜์ด์Šค (์บ๋ฆฌ์ปค์ณ ๋ฒ„ํŠผ ํฌํ•จ)
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange") as demo:
    gr.Markdown("## ๐Ÿ–ผ๏ธ ์ด๋ฏธ์ง€ ์„ค๋ช… ์ƒ์„ฑ๊ธฐ")
    gr.Markdown(
        "โš  ํ˜„์žฌ CPU ๋ชจ๋“œ๋กœ ์‹คํ–‰ ์ค‘์ด๋ฏ€๋กœ ์†๋„๊ฐ€ ๋А๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์–‘ํ•ด ๋ถ€ํƒ๋“œ๋ฆฝ๋‹ˆ๋‹ค."
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Input Image", type="pil")
        with gr.Column():
            caption_output = gr.Textbox(
                label="Output Prompt",
                lines=6,                # ์ด์ „๋ณด๋‹ค 2๋ฐฐ ๋†’์ด
                show_copy_button=True
            )
            # ์˜ค๋ฅธ์ชฝ ํ•˜๋‹จ '์บ๋ฆฌ์ปค์ณ ๋งŒ๋“ค๊ธฐ' ๋ฒ„ํŠผ
            gr.HTML("""
            <div style="margin-top:10px; text-align:center;">
              <a href="https://huggingface.co/spaces/VIDraft/stable-diffusion-3.5-large-turboX" target="_blank">
                <button style="
                  padding:10px 20px;
                  background-color:#ff9900;
                  color:white;
                  border:none;
                  border-radius:10px;
                  font-size:16px;
                  box-shadow:2px 2px 8px rgba(0,0,0,0.3);
                  cursor:pointer;
                ">
                  ๐ŸŽจ ์บ๋ฆฌ์ปค์ณ ๋งŒ๋“ค๊ธฐ
                </button>
              </a>
            </div>
            """)

    image_input.upload(fn=generate_caption, inputs=image_input, outputs=caption_output)

if __name__ == "__main__":
    # debug=True surfaces tracebacks in the Gradio UI/console during development.
    demo.launch(debug=True)