# app.py - Microsoft Fara-7B Multi-Modal Demo
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from PIL import Image
import requests
from io import BytesIO

# Load the model (the first load takes roughly 5-10 minutes).
MODEL_NAME = "microsoft/Fara-7B"

print("正在加载模型，请稍候...")

# Keyword arguments forwarded to the model loader: allow the checkpoint's own
# code, request float16 weights, and pass device_map="auto" for placement.
_MODEL_LOAD_OPTIONS = {
    "trust_remote_code": True,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}

# The processor is loaded first, then the model, both from the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME, **_MODEL_LOAD_OPTIONS)

def chat_with_image(image: Image.Image, question: str, max_new_tokens: int = 200):
    """Answer a free-form question about an image using the loaded model.

    Args:
        image: The picture to ask about; ``None`` when nothing was uploaded.
        question: The user's question. ``None``, empty, and whitespace-only
            values are all treated as a missing question.
        max_new_tokens: Upper bound on the number of generated tokens. It is
            coerced to ``int`` because a UI slider may deliver a float.

    Returns:
        The model's answer as a string, or a user-facing message when an
        input is missing or processing fails (errors are reported as text
        rather than raised so the UI always has something to display).
    """
    if image is None:
        return "请上传一张图片。"
    # Guard against None as well as blank text so .strip() cannot raise here.
    if not question or not question.strip():
        return "请输入问题。"

    try:
        # Single-turn conversation: an image placeholder followed by the question.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question}
                ]
            }
        ]

        # Render the conversation to prompt text, ending with the generation prompt.
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Build the model inputs and move them to the model's device.
        inputs = processor(
            text=prompt,
            images=image,
            return_tensors="pt"
        ).to(model.device)

        # Greedy decoding; no gradients are needed for inference.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),
                do_sample=False,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )

        generated = outputs[0]
        # Decoder-only models return the prompt tokens followed by the
        # completion, so drop the prompt by length instead of searching the
        # decoded text for a template-specific marker such as "Assistant:".
        # Encoder-decoder models return only the completion, so leave it as is.
        if not getattr(model.config, "is_encoder_decoder", False):
            prompt_length = inputs["input_ids"].shape[-1]
            generated = generated[prompt_length:]

        return processor.decode(generated, skip_special_tokens=True).strip()

    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the output box.
        return f"处理出错: {str(e)}"

# Gradio interface
# Both examples use the same image and differ only in the question asked.
_EXAMPLE_IMAGE_URL = "https://tse2-mm.cn.bing.net/th/id/OIP-C.OkY4eWXcSyyit75R53WOBQAAAA?w=330&h=174&c=7&r=0&o=7&cb=ucfimg2&pid=1.7&rm=3&ucfimg=1"
_EXAMPLE_QUESTIONS = (
    "What animal is on the candy?",
    "Describe the scene in detail.",
)

with gr.Blocks(title="Fara-7B 多模态问答") as demo:
    gr.Markdown("# 🖼️ Microsoft Fara-7B 图像问答系统\n上传图片并提问，AI 将为你解答！")

    with gr.Row():
        # First column: the image, the question, the length limit and the submit button.
        with gr.Column():
            image_input = gr.Image(label="上传图片", type="pil")
            question_input = gr.Textbox(placeholder="例如：图中有什么动物？", label="你的问题")
            max_tokens = gr.Slider(minimum=50, maximum=500, step=10, value=200, label="最大生成长度")
            submit_btn = gr.Button(value="提交")
        # Second column: the model's answer.
        with gr.Column():
            output = gr.Textbox(lines=5, label="模型回答")

    # Clicking the button sends the three inputs to chat_with_image and shows its result.
    submit_btn.click(
        fn=chat_with_image,
        inputs=[image_input, question_input, max_tokens],
        outputs=output,
    )

    # Each example row fills the image and question inputs.
    gr.Examples(
        examples=[[_EXAMPLE_IMAGE_URL, example_question] for example_question in _EXAMPLE_QUESTIONS],
        inputs=[image_input, question_input],
    )

# Launch the app.
demo.launch()