File size: 9,198 Bytes
e7ae0df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125dd3f
e7ae0df
 
 
 
 
 
 
125dd3f
e7ae0df
 
 
 
 
125dd3f
e7ae0df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbe0610
e7ae0df
 
 
 
 
 
 
fbe0610
e7ae0df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbe0610
e7ae0df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# -*- coding: utf-8 -*-
# 財政部財政資訊中心 江信宗

import gradio as gr
from groq import Groq
import base64
import os
import io
import json
from PIL import Image
from zhconv_rs import zhconv
import traceback

# Stylesheet handed to gr.Blocks(css=...) further down. Each class defined
# here is attached to a component through its elem_classes argument
# (e.g. "input-background" on the question box, "gen-button" on submit).
custom_css = """

.center-aligned {

    text-align: center !important;

    color: #ff4081;

    text-shadow: 2px 2px 4px rgba(0,0,0,0.1);

}

.input-background {

    background-color: #B7E0FF !important;

    padding: 15px !important;

    border-radius: 10px !important;

    margin: 0 !important;

}

.input-background textarea {

    font-size: 18px !important;

    background-color: #ffffff;

    border: 1px solid #f0f8ff;

    border-radius: 8px;

}

.image-background {

    border-radius: 10px !important;

    border: 2px solid #B7E0FF !important;

}

.lng-background {

    background-color: #FFF5CD !important;

    padding: 5px !important;

    border-radius: 10px !important;

    margin: 0 !important;

}

.api-background {

    background-color: #FFCFB3 !important;

    padding: 5px !important;

    border-radius: 10px !important;

    margin: 0 !important;

}

.script-background {

    background-color: #FEF9D9 !important;

    padding: 15px !important;

    border-radius: 10px !important;

    margin: 0 !important;

}

.script-background textarea {

    font-size: 18px !important;

    background-color: #ffffff;

    border: 1px solid #f0f8ff;

    border-radius: 8px;

}

.model-background {

    background-color: #FFF4B5 !important;

    padding: 5px !important;

    border-radius: 10px !important;

    margin: 0 !important;

}

.model-background textarea {

    font-size: 18px !important;

    background-color: #ffffff;

    border: 1px solid #f0f8ff;

    border-radius: 8px;

}

.gen-button {

    border-radius: 10px !important;

    border: none !important;

    background-color: #ff4081 !important;

    color: white !important;

    font-weight: bold !important;

    transition: all 0.3s ease !important;

    margin: 0 !important;

}

.gen-button:hover {

    background-color: #f50057 !important;

    transform: scale(1.05);

}

.clear-button {

    color: white !important;

    background-color: #000000 !important;

    padding: 5px !important;

    border-radius: 10px !important;

    margin: 0 !important;

}

.clear-button:hover {

    background-color: #000000 !important;

    transform: scale(1.05);

}

"""

# Model identifiers offered in the model dropdown; the first entry is the
# dropdown's default value. The selected string is forwarded unchanged as the
# `model` argument of the chat-completions request.
MODELS = [
    "llama-3.2-90b-vision-preview",
    "llama-3.2-11b-vision-preview",
    "llava-v1.5-7b-4096-preview"
]

def compress_image(image, max_size=(800, 800), quality=95):
    """Downscale an image and return it encoded as JPEG bytes.

    Args:
        image: Either a file path (str), opened with PIL, or an already
            loaded PIL image.
        max_size: (width, height) bound handed to ``Image.thumbnail``.
        quality: JPEG quality handed to ``Image.save``.

    Returns:
        bytes: The JPEG-encoded image.

    The image is always converted to RGB first: the JPEG encoder rejects
    modes such as RGBA, LA and P (common for PNG/GIF uploads) with
    ``OSError``. ``convert`` returns a new image, so the caller's image is
    never resized in place by ``thumbnail``.
    """
    if isinstance(image, str):
        # Context manager releases the file handle once the pixels are copied.
        with Image.open(image) as opened:
            img = opened.convert("RGB")
    else:
        img = image.convert("RGB")
    img.thumbnail(max_size)
    buffered = io.BytesIO()
    img.save(buffered, format="JPEG", quality=quality)
    return buffered.getvalue()

def encode_image(image):
    """Return *image* as a base64-encoded JPEG string.

    Args:
        image: A PIL image, which is encoded at its current size, or any
            other value accepted by ``compress_image`` (e.g. a file path),
            which is downscaled by that function first.

    Returns:
        str: Base64 text of the JPEG bytes.

    Raises:
        ValueError: If *image* is None (nothing was uploaded).
    """
    if image is None:
        raise ValueError("image must not be None")
    if isinstance(image, Image.Image):
        buffered = io.BytesIO()
        # JPEG cannot store alpha/palette modes (RGBA, LA, P ...); saving them
        # raises OSError, so normalise to RGB. convert() returns a new image.
        image.convert("RGB").save(buffered, format="JPEG", quality=95)
        jpeg_bytes = buffered.getvalue()
    else:
        jpeg_bytes = compress_image(image)
    return base64.b64encode(jpeg_bytes).decode('utf-8')

def create_client(api_key=None):
    """Return a Groq client for *api_key*.

    When *api_key* is empty or None, the value of the ``YOUR_API_KEY``
    environment variable is used instead.
    """
    resolved_key = api_key if api_key else os.getenv("YOUR_API_KEY")
    return Groq(api_key=resolved_key)

def analyze_input(text_input, Quick_Input, Language):
    """Build the prompt text that accompanies the image.

    Args:
        text_input: Text typed by the user; None is treated as empty.
        Quick_Input: Selected preset. "自行輸入" (or any unknown value) uses
            the typed text as the prompt; the other presets use a canned
            prompt.
        Language: "English" selects the English canned prompt; any other
            value selects the Traditional Chinese one.

    Returns:
        str: The stripped prompt. For the "描述圖片" and "圖像推理" presets the
        user's text, when non-empty, is appended after a single space in
        both languages (previously it was silently dropped for English).
        The "圖片文字檢索" preset never appends the typed text.
    """
    # The textbox may deliver None (e.g. after being cleared), so guard strip().
    extra = (text_input or "").strip()
    english = Language == "English"

    if Quick_Input == "描述圖片":
        if english:
            prompt = "Take a close look at the image and describe it in as much detail as possible. Be sure to mention the main subject, the background, the colors used, the mood or feeling it evokes, and any specific elements that stand out."
        else:
            prompt = "仔細觀察圖片,依據圖片意境為圖片命名,並盡可能詳細的描述其內容。務必提及主體、背景、使用的顏色、所引發的情緒或感受,以及任何突出的特定元素。必須用「繁體中文」回覆我。"
    elif Quick_Input == "圖像推理":
        if english:
            prompt = "Let's work this out in a step by step way to be sure we have the right answer. Deduce from the image and provide a quick answer."
        else:
            prompt = "讓我們一步一步地解決這個問題,以確保我們得到正確的答案。根據圖片進行推理並提供答案,必須用「繁體中文」回覆我。"
    elif Quick_Input == "圖片文字檢索":
        # Text extraction takes no supplementary instructions.
        if english:
            return "What does the text in this photo say?"
        return "根據圖片中寫得文字內容是什麼?詳細列出所有文字,必須用「繁體中文」回覆我。"
    else:
        # "自行輸入" and any unrecognised preset: the typed text is the prompt.
        return extra

    if extra:
        prompt += " " + extra
    return prompt

def process_image_and_text(image, text_input, Quick_Input, Language, model, api_key):
    """Ask the selected vision model about the uploaded image.

    Args:
        image: Uploaded image; None when nothing has been uploaded.
        text_input: Question typed by the user; None is treated as empty.
        Quick_Input: Selected quick-input preset.
        Language: Answer language; falls back to "English" when empty.
        model: Model identifier sent with the chat-completions request.
        api_key: API key typed by the user, forwarded to ``create_client``.

    Returns:
        str: The model's answer passed through ``zhconv(..., "zh-tw")``, or
        an error message. Failures are reported as text (and as a Gradio
        warning) rather than raised, so they show up in the output box.
    """
    # The textbox may deliver None, so guard before stripping.
    text_input = (text_input or "").strip()
    if Quick_Input == "自行輸入" and not text_input:
        return "錯誤:請輸入問題或選擇快速輸入選項!!"
    if image is None:
        # Without this check the encoder fails with an opaque AttributeError.
        return "錯誤:請先上傳圖片!!"
    if not Language:
        Language = "English"
    # Only announce the analysis once the inputs have passed validation.
    gr.Info("圖片正在分析中,請稍待片刻......")
    try:
        # Client creation and image encoding can fail too (missing API key,
        # unreadable image), so they belong inside the reporting boundary.
        client = create_client(api_key)
        base64_image = encode_image(image)
        Input_Text = analyze_input(text_input, Quick_Input, Language)
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": Input_Text},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            model=model,
            temperature=1,
        )
        return zhconv(chat_completion.choices[0].message.content.strip(), "zh-tw")
    except Exception:
        error_traceback = traceback.format_exc()
        gr.Warning(f"發生錯誤:{error_traceback}")
        return f"發生錯誤:{error_traceback}"

# Page layout: left column holds the image upload, model choice, API key and
# clear button; right column holds the question, presets, submit and output.
with gr.Blocks(theme=gr.themes.Monochrome(), css=custom_css) as iface:
    gr.Markdown("""

    # 🐹 Large Multimodal Models - 財政部財政資訊中心 🐹

    > ### **※ 玩轉視覺推理與分析,探索我們的未來藍圖,系統布署:江信宗,Meta開源模型:Llama-3.2-90B-Vision。**

    """, elem_classes="center-aligned")

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="上傳圖片", elem_classes="image-background")
            model_select = gr.Dropdown(choices=MODELS, label="選擇多模態模型", value=MODELS[0], elem_classes="model-background")
            api_key = gr.Textbox(label="請輸入您的 API Key", type="password", placeholder="API authentication key", elem_classes="api-background")
            clear_button = gr.Button("清除回答及圖片", variant="secondary", elem_classes="clear-button")
            gr.Markdown("""

            ### **※ 可推理分析影片每秒每幀搜尋特定人事物所在時間軸。**

            """, elem_classes="center-aligned")

        with gr.Column(scale=1):
            text_input = gr.Textbox(label="請輸入您的問題", placeholder="Enter your question ...", autofocus=True, elem_classes="input-background", max_lines=5)
            with gr.Row():
                Quick_Input = gr.Dropdown(
                    choices=["自行輸入", "描述圖片", "圖像推理", "圖片文字檢索"],
                    value="自行輸入",
                    label="快速輸入",
                    interactive=True,
                    elem_classes="lng-background",
                )
                Language = gr.Dropdown(
                    choices=["繁體中文", "English"],
                    value="繁體中文",
                    label="回答語言",
                    interactive=True,
                    elem_classes="lng-background",
                )
            submit_button = gr.Button("傳送", variant="primary", elem_classes="gen-button")
            output = gr.Textbox(label="模型推理分析結果", elem_classes="script-background", max_lines=40)

    submit_button.click(
        fn=process_image_and_text,
        inputs=[image_input, text_input, Quick_Input, Language, model_select, api_key],
        outputs=[output]
    )

    def clear_outputs():
        """Reset the image, the question box and the answer box.

        The two textboxes are reset to an empty string rather than None so
        the submit handler always receives a str for the question.
        """
        return None, "", ""

    clear_button.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[image_input, text_input, output]
    )

if __name__ == "__main__":
    # With SPACE_ID in the environment, launch with defaults; otherwise
    # launch with share=True and show_api=False.
    running_in_space = "SPACE_ID" in os.environ
    launch_options = {} if running_in_space else {"share": True, "show_api": False}
    iface.launch(**launch_options)