import os
import torch
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from langchain.tools import BaseTool
import gradio as gr
from langchain import PromptTemplate, LLMChain

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
# DEP_NAME = os.getenv("deployment name")

llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model_name='gpt-3.5-turbo')
# Azure variant (requires `from langchain.chat_models import AzureChatOpenAI`):
# llm = AzureChatOpenAI(deployment_name=DEP_NAME, openai_api_base=OPENAI_API_BASE,
#                       openai_api_key=OPENAI_API_KEY,
#                       openai_api_version="2023-03-15-preview",
#                       model_name="gpt-3.5-turbo")

# BLIP image-captioning model; can also point at a local copy of the weights.
image_to_text_model = "Salesforce/blip-image-captioning-large"
# image_to_text_model = "F:\\code\\Anaconda\\blip-image-captioning-large"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

processor = BlipProcessor.from_pretrained(image_to_text_model)
model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)


def describeImage(image_path):
    """Generate an English caption for a local image file with BLIP."""
    # To caption a remote image instead, fetch it first (needs `import requests`):
    # image_object = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    image_object = Image.open(image_path).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)

# img_url = 'https://img1.baidu.com/it/u=1919509102,1927615551&fm=253&fmt=auto&app=120&f=JPEG?w=889&h=500'
# description = describeImage(img_url)
# print(description)


class DescribeImageTool(BaseTool):
    name: str = "Describe Image Tool"
    description: str = "use this tool to describe an image."

    def _run(self, url: str):
        return describeImage(url)

    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")


tools = [DescribeImageTool()]

agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True
    )
)


def to_chinese(text):
    """Translate a piece of text into Chinese with a simple LLM chain."""
    # The template reads: "Translate the following text into Chinese".
    prompt = PromptTemplate(
        input_variables=["en"],
        template="翻译下面语句到中文\n{en}"
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(text)


def descImage(input_text, image_path):
    """Ask the agent about an image, then translate its answer into Chinese."""
    output = agent(f"{input_text}:\n{image_path}")
    print(output)
    desc = output['output']
    # Convert the English description to Chinese before returning it.
    return to_chinese(desc)

# Note: describeImage opens local paths, so pass a downloaded file rather than a URL here.
# ret = descImage("描述该图片", "https://lmg.jj20.com/up/allimg/4k/s/02/2109250006343S5-0-lp.jpg")  # prompt: "Describe this image"
# print(ret)


with gr.Blocks() as demo:
    with gr.Column():
        file = gr.Image(type='filepath')
        user_input = gr.Textbox(show_label=False, placeholder="请输入问题", lines=1)  # "Enter your question"
        with gr.Column():
            submitBtn = gr.Button("提交", variant="primary")  # "Submit"
        with gr.Column():
            output = gr.TextArea(show_label=False, placeholder="输出结果", lines=5)  # "Output"
    submitBtn.click(descImage, [user_input, file], output, show_progress=True)

demo.launch()
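
# Optional smoke test: a minimal sketch for verifying the captioning and
# translation steps independently of the agent and the UI. "sample.jpg" is a
# placeholder path, not a file shipped with this script; replace it with a
# real local image and uncomment before `demo.launch()` if you want it to run.
# caption = describeImage("sample.jpg")   # raw English caption from BLIP
# print(caption)
# print(to_chinese(caption))              # the same caption, translated by the LLM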