# app.py - Microsoft Fara-7B Multi-Modal Demo
from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

# Load the model once at import time (the first load takes roughly 5-10 minutes).
MODEL_NAME = "microsoft/Fara-7B"

print("正在加载模型,请稍候...")
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)


def chat_with_image(image: Image.Image, question: str, max_new_tokens: int = 200) -> str:
    """Answer a free-form question about an image with the loaded model.

    Args:
        image: The picture to ask about; ``None`` when the user uploaded nothing.
        question: The user's question text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer text, or a user-facing message when the input is
        missing or when processing fails. This function never raises; any
        exception is converted into the returned message so the UI can show it.
    """
    if image is None:
        return "请上传一张图片。"
    # Guard against None as well as empty/whitespace-only text.
    if not question or not question.strip():
        return "请输入问题。"

    try:
        # Single-turn conversation: one image placeholder followed by the text.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question},
                ],
            }
        ]

        # Render the conversation through the model's chat template.
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Tokenize the text, preprocess the image, and move tensors to the model device.
        inputs = processor(
            text=prompt,
            images=image,
            return_tensors="pt",
        ).to(model.device)

        # Greedy decoding; gradients are not needed for inference.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                # The UI slider may deliver a float; generate() expects an int.
                max_new_tokens=int(max_new_tokens),
                do_sample=False,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )

        # generate() returns the prompt tokens followed by the continuation.
        # Drop the prompt by position instead of searching the decoded text
        # for a role marker, which depends on the template and could also
        # truncate an answer that happens to contain that marker.
        prompt_length = inputs["input_ids"].shape[-1]
        answer_ids = outputs[0][prompt_length:]
        return processor.decode(answer_ids, skip_special_tokens=True).strip()
    except Exception as e:
        # UI boundary: report the failure in the output box rather than crash.
        return f"处理出错: {e}"


# Gradio interface
with gr.Blocks(title="Fara-7B 多模态问答") as demo:
    gr.Markdown("# 🖼️ Microsoft Fara-7B 图像问答系统\n上传图片并提问,AI 将为你解答!")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="上传图片")
            question_input = gr.Textbox(label="你的问题", placeholder="例如:图中有什么动物?")
            max_tokens = gr.Slider(50, 500, value=200, step=10, label="最大生成长度")
            submit_btn = gr.Button("提交")
        with gr.Column():
            output = gr.Textbox(label="模型回答", lines=5)

    submit_btn.click(
        fn=chat_with_image,
        inputs=[image_input, question_input, max_tokens],
        outputs=output,
    )

    gr.Examples(
        examples=[
            [
                "https://tse2-mm.cn.bing.net/th/id/OIP-C.OkY4eWXcSyyit75R53WOBQAAAA?w=330&h=174&c=7&r=0&o=7&cb=ucfimg2&pid=1.7&rm=3&ucfimg=1",
                "What animal is on the candy?",
            ],
            [
                "https://tse2-mm.cn.bing.net/th/id/OIP-C.OkY4eWXcSyyit75R53WOBQAAAA?w=330&h=174&c=7&r=0&o=7&cb=ucfimg2&pid=1.7&rm=3&ucfimg=1",
                "Describe the scene in detail.",
            ],
        ],
        inputs=[image_input, question_input],
    )

demo.launch()