Spaces:

DeepLearning101
/

Multimodal-Playground

Sleeping

File size: 10,994 Bytes

import gradio as gr
import requests
import mimetypes
import json, os
import asyncio
import aiohttp
import subprocess

# --- 1. Environment setup ---
# Upgrade pip (normally this only needs to run once, when the Space starts).
def upgrade_pip():
    """Upgrade pip for the interpreter that is running this script.

    Runs ``<python> -m pip install --upgrade pip`` and prints a success or
    failure message. Failures are reported but never raised, so a broken pip
    cannot stop the application from starting.
    """
    # Local import: ``os.sys`` only resolves because the ``os`` module happens
    # to import ``sys`` internally; ``sys.executable`` is the documented name.
    import sys

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: pip exited with a non-zero status.
        # OSError: the interpreter process could not be launched at all.
        print("pip 升級失敗")
    else:
        print("pip 升級成功")

# Upgrade pip once, as soon as this module is executed.
upgrade_pip()

# Backend credentials and endpoint are read from the environment. A missing
# variable becomes an empty string and surrounding whitespace is stripped.
LLM_API = os.getenv("LLM_API", "").strip()
LLM_URL = os.getenv("LLM_URL", "").strip()
# Fixed identifier passed to the backend as the requesting user.
USER_ID = "HuggingFace Space"

# --- 2. File upload function (revised) ---
async def upload_file(LLM_URL, LLM_API, file_path, user_id):
    """Upload a local file to the LLM server and return the parsed response.

    Args:
        LLM_URL: Base URL of the server; ``/files/upload`` is appended to it.
        LLM_API: API key, sent as a ``Bearer`` token.
        file_path: Path of the local file to upload.
        user_id: Value sent in the ``user`` form field.

    Returns:
        The JSON-decoded response body when the server answers 200 or 201.
        On any failure (missing file, other status code, network or decoding
        error) a dict of the form ``{"error": <message>}``. Exceptions are
        reported through that dict rather than raised.
    """
    if not os.path.exists(file_path):
        return {"error": f"File {file_path} not found"}

    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type is None:
        # Unknown extension: fall back to a generic binary content type.
        mime_type = 'application/octet-stream'

    filename = os.path.basename(file_path)
    print(f"正在上傳檔案: {filename} ({mime_type})")

    try:
        # The handle is passed to aiohttp as the request body, so it has to
        # stay open until the POST completes. The ``with`` block guarantees it
        # is closed afterwards, including when the request fails or raises.
        with open(file_path, 'rb') as file_obj:
            data = aiohttp.FormData()
            data.add_field('file', file_obj, filename=filename, content_type=mime_type)
            data.add_field('user', user_id)

            async with aiohttp.ClientSession() as session:
                async with session.post(
                    f"{LLM_URL}/files/upload",
                    headers={"Authorization": f"Bearer {LLM_API}"},
                    data=data
                ) as response:
                    response_text = await response.text()
                    print(f"上傳回應狀態: {response.status}")

                    if response.status not in (200, 201):
                        print(f"上傳失敗回應: {response_text}")
                        return {"error": f"Upload failed: {response.status} - {response_text}"}

                    return json.loads(response_text)

    except Exception as e:
        # Boundary handler: the caller expects an error dict, not an exception.
        print(f"上傳過程發生例外: {e}")
        return {"error": str(e)}

# --- 3. Chat request function (references the upload by file_id) ---
async def send_chat_message(LLM_URL, LLM_API, category, file_id):
    """Send a chat request that references an uploaded file and collect the answer.

    The request is made in ``streaming`` response mode. Every response line
    that starts with ``data: `` is parsed as JSON and its ``answer`` field, if
    present, is appended to the result.

    Args:
        LLM_URL: Base URL of the server; ``/chat-messages`` is appended to it.
        LLM_API: API key, sent as a ``Bearer`` token.
        category: Text sent as the ``query`` field.
        file_id: ID of a previously uploaded file, sent as ``upload_file_id``.

    Returns:
        The concatenated answer text, ``"No answer returned."`` when nothing
        was collected, or a string starting with ``Chat Error``,
        ``Stream Error`` or ``Request Exception`` describing the failure.
    """
    payload = {
        "inputs": {},
        "query": category,
        "conversation_id": "",
        "user": USER_ID,
        "response_mode": "streaming",
        "files": [
            {
                "type": "image",
                # Per the original author's note: when a file is referenced
                # by ID, the transfer method is normally "local_file".
                "transfer_method": "local_file",
                "upload_file_id": file_id
            }
        ]
    }

    print(f"發送請求中... (File ID: {file_id})")
    answer = ""

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{LLM_URL}/chat-messages",
                headers={
                    "Authorization": f"Bearer {LLM_API}",
                    "Content-Type": "application/json"
                },
                json=payload
            ) as response:

                if response.status != 200:
                    error_text = await response.text()
                    return f"Chat Error {response.status}: {error_text}"

                async for line_bytes in response.content:
                    line = line_bytes.decode("utf-8").strip()
                    if not line.startswith("data: "):
                        continue
                    try:
                        # Drop the 6-character "data: " prefix before parsing.
                        data = json.loads(line[6:])
                        if "answer" in data:
                            answer += data["answer"]
                        if "error" in data:
                            return f"Stream Error: {data}"
                    except (ValueError, TypeError):
                        # Best effort: skip lines that are not valid JSON or
                        # whose payload has an unexpected shape. Deliberately
                        # narrower than a bare ``except`` so task cancellation
                        # and KeyboardInterrupt are not swallowed here.
                        continue

    except Exception as e:
        return f"Request Exception: {str(e)}"

    return answer or "No answer returned."

# --- 4. Main processing logic ---
async def handle_input(file_path, category):
    """Upload the selected image, then ask the backend to parse it.

    Args:
        file_path: Local path of the image to process; a falsy value means
            nothing was provided.
        category: Document type text forwarded to the chat request.

    Returns:
        The text returned by ``send_chat_message``, or a message describing
        why processing stopped (no image, upload error, missing file ID).
    """
    # Guard: nothing to process without an image.
    if not file_path:
        return "請先上傳圖片"

    # Step 1: upload the file and make sure the upload succeeded.
    uploaded = await upload_file(LLM_URL, LLM_API, file_path, USER_ID)
    if "error" in uploaded:
        return f"上傳錯誤: {uploaded['error']}"

    # Step 2: send the chat request once an ID is available.
    uploaded_id = uploaded.get("id")
    if uploaded_id:
        return await send_chat_message(LLM_URL, LLM_API, category, uploaded_id)

    return f"錯誤: 上傳成功但未回傳 ID。回應: {uploaded}"

# UI components & data
# Each row is [image path, document type]. The document type must be one of
# the choices offered by the "文件類型" Radio component, because the row is
# loaded into that component when the example is clicked.
examples = [
    ['DEMO/Medical1.jpg', '診斷證明書'],
    ['DEMO/Medical2.jpg', '診斷證明書'],
    ['DEMO/passport.png', '護照'],
    ['DEMO/residence.png', '居留證'],
    ['DEMO/boarding-pass.png', '機票'],
    ['DEMO/taxi.jpg', '計程車乘車證明'],
    ['DEMO/etag.jpg', '通行明細 (etag)'],
    ['DEMO/qrcode.jpg', 'QRCODE發票'],
    ['DEMO/mthsr.JPG', '超商高鐵車票'],
    ['DEMO/thsr.jpg', '高鐵車票'],
    ['DEMO/mtra.jpg', '超商台鐵車票'],
    ['DEMO/tra.JPG', '台鐵車票'],
    # Label corrected from '身份證背面' to match the Radio choice '身份證反面'.
    ['DEMO/ID-back.png', '身份證反面'],
    ['DEMO/ID.png', '身份證正面'],
    ['DEMO/health.png', '健保卡'],
]

# Page heading, rendered as raw HTML at the top of the layout.
TITLE = """<h1>Multimodal Playground 💬 輸入各種單據並選擇種類，解析得到各種關鍵資訊 </h1>"""
# Secondary heading with author/site links. NOTE(review): not passed to any
# component in the layout visible in this file.
SUBTITLE = """<h2><a href='https://deep-learning-101.github.io' target='_blank'>deep-learning-101.github.io</a> | <a href='https://www.twman.org/AI' target='_blank'> AI </a> | <a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2>"""
# HTML block of reference links rendered below the title. Each line ends with
# <br>. Stray </b> closing tags that had no matching <b> have been removed so
# the markup is well-formed.
LINKS = """
<a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
<a href='https://blog.twman.org/2025/04/AI-Robot.html' target='_blank'>AI 陪伴機器人：2025 趨勢分析技術突破、市場潛力與未來展望</a> | <a href='https://blog.twman.org/2025/04/FinanceGenAI.html' target='_blank'>金融科技新浪潮：生成式 AI (GenAI) 應用場景、效益與導入挑戰</a><br>
<a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱：常見問題、挑戰與解決方案 (實戰經驗)</a>：<a href="https://deep-learning-101.github.io/agent" target="_blank">探討多種 AI 代理人工具的應用經驗與挑戰，分享實用經驗與工具推薦。</a><br>
<a href="https://blog.twman.org/2024/08/LLM.html" target="_blank">白話文手把手帶你科普 GenAI</a>：<a href="https://deep-learning-101.github.io/GenAI" target="_blank">淺顯介紹生成式人工智慧核心概念，強調硬體資源和數據的重要性。</a><br>
<a href="https://blog.twman.org/2024/09/LLM.html" target="_blank">大型語言模型直接就打完收工？</a>：<a href="https://deep-learning-101.github.io/1010LLM" target="_blank">回顧 LLM 領域探索歷程，討論硬體升級對 AI 開發的重要性。</a><br>
<a href="https://blog.twman.org/2024/07/RAG.html" target="_blank">檢索增強生成(RAG)不是萬靈丹之優化挑戰技巧</a>：<a href="https://deep-learning-101.github.io/RAG" target="_blank">探討 RAG 技術應用與挑戰，提供實用經驗分享和工具建議。</a><br>
<a href="https://blog.twman.org/2024/02/LLM.html" target="_blank">大型語言模型 (LLM) 入門完整指南：原理、應用與未來</a>：<a href="https://deep-learning-101.github.io/0204LLM" target="_blank">探討多種 LLM 工具的應用與挑戰，強調硬體資源的重要性。</a><br>
<a href="https://blog.twman.org/2023/04/GPT.html" target="_blank">解析探索大型語言模型：模型發展歷史、訓練及微調技術的 VRAM 估算</a>：<a href="https://deep-learning-101.github.io/GPU" target="_blank">探討 LLM 的發展與應用，強調硬體資源在開發中的關鍵作用。</a><br>
<a href="https://blog.twman.org/2024/11/diffusion.html" target="_blank">Diffusion Model 完全解析：從原理、應用到實作 (AI 圖像生成)</a>；<a href="https://deep-learning-101.github.io/diffusion" target="_blank">深入探討影像生成與分割技術的應用，強調硬體資源的重要性。</a><br>
<a href="https://blog.twman.org/2024/02/asr-tts.html" target="_blank">ASR/TTS 開發避坑指南：語音辨識與合成的常見挑戰與對策</a>：<a href="https://deep-learning-101.github.io/asr-tts" target="_blank">探討 ASR 和 TTS 技術應用中的問題，強調數據質量的重要性。</a><br>
<a href="https://blog.twman.org/2021/04/NLP.html" target="_blank">那些 NLP 踩的坑</a>：<a href="https://deep-learning-101.github.io/nlp" target="_blank">分享 NLP 領域的實踐經驗，強調數據質量對模型效果的影響。</a><br>
<a href="https://blog.twman.org/2021/04/ASR.html" target="_blank">那些語音處理踩的坑</a>：<a href="https://deep-learning-101.github.io/speech" target="_blank">分享語音處理領域的實務經驗，強調資料品質對模型效果的影響。</a><br>
<a href="https://blog.twman.org/2020/05/DeepLearning.html" target="_blank">手把手學深度學習安裝環境</a>：<a href="https://deep-learning-101.github.io/101" target="_blank">詳細介紹在 Ubuntu 上安裝深度學習環境的步驟，分享實際操作經驗。</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
<a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""

# Build the Gradio page: title, reference links, an input row (image +
# document type), a submit button, a result box, and clickable examples.
with gr.Blocks() as iface:
    gr.HTML(TITLE)
    gr.HTML(LINKS) # reference-links panel; comment this line out to hide it

    with gr.Row():
        # type='filepath' matters: handle_input receives a path on disk, which
        # upload_file then checks with os.path.exists and opens.
        file_input = gr.Image(label='圖片上傳', type='filepath')
        # The selected choice is forwarded as the `category` argument of
        # handle_input.
        category = gr.Radio(label="文件類型", choices=[
            "機票", "計程車乘車證明", "通行明細 (etag)", "QRCODE發票",
            "超商高鐵車票", "高鐵車票", "超商台鐵車票", "台鐵車票", 
            "診斷證明書", "身份證正面", "身份證反面", "健保卡", "護照", "居留證"
        ])

    submit_btn = gr.Button("解析", variant="primary")
    output_text = gr.Textbox(label="解析結果", lines=10)

    # Clicking the button runs handle_input(file_input, category) and writes
    # its return value into the result textbox.
    submit_btn.click(fn=handle_input, inputs=[file_input, category], outputs=output_text)

    # Example rows fill the same two inputs: [image path, document type].
    gr.Examples(
        examples=examples,
        inputs=[file_input, category],
        label="點擊範例直接測試"
    )

# Start the app; this runs unconditionally when the module is executed.
iface.launch()