Spaces:

allinaigc
/

coder001

Paused

App Files Files Community

coder001 / st_openai_assistant.py

allinaigc

Upload 26 files

b862271 verified almost 2 years ago

raw

history blame contribute delete

12 kB

	'''
	使用OpenAI的官方Assistant，实现code_interpreter功能:
	1. 使用OpenAI官方的API教程：
	https://cookbook.openai.com/examples/assistants_api_overview_python （重要）
	https://platform.openai.com/docs/assistants/overview
	2. YT视频： https://www.youtube.com/watch?v=5rcjGjgJNQc
	3. 基础代码参考：openai_interpreter.py
	4. 函数化text_response和image_response和prompt。
	5. 中文字体显示还是有问题，现在妥协的方式是转换成英文来显示。不影响分析过程和结果。

	##TEMP

	import matplotlib
	import seaborn as sns
	SimHei = FontProperties(fname='SimHei.ttf')
	matplotlib.rcParams['font.sans-serif'] = ['SimHei']
	sns.set(rc={'axes.facecolor':'#FFF9ED','figure.facecolor':'#FFF9ED'}, palette='dark', font='SimHei')
	fontproperties = SimHei

	每次输出图表时，你都需要在matplotlib和seaborn中安装中文字体'SimHei'，安装的路径在：'/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/code_interpreter/SimHei.ttf'。然后，你需要在matplotlib和seaborn中设置字体为'SimHei'。

	'''
	##TODO:

	# -- coding: utf-8 --
	import csv
	import os
	import pathlib
	import re
	import tempfile
	import time
	import warnings
	from itertools import product
	from pathlib import Path
	from re import sub
	from tempfile import NamedTemporaryFile
	from time import sleep

	import matplotlib.pyplot as plt
	import numpy as np
	import openai
	import pandas as pd
	import requests
	import seaborn as sns
	from dotenv import load_dotenv
	from matplotlib.pyplot import style
	from rich import print
	from tqdm import tqdm, tqdm_notebook, trange
	import chatgpt ## 自定义的chatgpt函数，用于优化text response的回答。

	import streamlit as st
	import add_fonts

	# Silence every warning category for the whole process.
	warnings.filterwarnings('ignore')
	# Apply seaborn's default theme to subsequent plots.
	sns.set()

	# Load variables from a .env file (python-dotenv) before the token is read below.
	load_dotenv()
	### Set the OpenAI API key.
	# 'user_token' must exist in the environment; a missing key raises KeyError here, at import time.
	os.environ["OPENAI_API_KEY"] = os.environ['user_token']
	openai.api_key = os.environ['user_token']

	from openai import Client
	# Module-level client; presumably picks up OPENAI_API_KEY set above -- TODO confirm against the openai SDK.
	client = Client()

	## JSON output display.
	import json
	def show_json(name=None, obj=None):
	    """Print the JSON dump of ``obj`` as a Python object, then a dashed separator.

	    Args:
	        name: Optional label printed before the payload, followed by ``:``.
	            When ``None`` the label is omitted instead of printing ``None``.
	        obj: Object exposing ``model_dump_json()`` that returns a JSON string
	            (the call sites in this file pass OpenAI SDK response objects).

	    Raises:
	        AttributeError: If ``obj`` has no ``model_dump_json`` method
	            (including the default ``obj=None``).
	    """
	    # Round-trip through json so the printed value is a plain dict/list,
	    # not the SDK model's repr.
	    payload = json.loads(obj.model_dump_json())
	    if name is None:
	        print(payload)
	    else:
	        print(name, ":", payload)
	    print("--" * 40)

	def save_json(obj, path='message.json'):
	    """Write the JSON dump of ``obj`` to ``path`` as indented, UTF-8 JSON.

	    ``obj.model_dump_json()`` already returns a JSON string, so it is parsed
	    first and then re-serialized; dumping the string directly would produce
	    a single escaped JSON string literal rather than a JSON document.

	    Args:
	        obj: Object exposing ``model_dump_json()`` that returns a JSON string.
	        path: Destination file; defaults to ``message.json`` in the current
	            working directory. The file is overwritten if it exists.
	    """
	    payload = json.loads(obj.model_dump_json())
	    # ensure_ascii=False keeps non-ASCII text readable, hence the explicit encoding.
	    with open(path, 'w', encoding='utf-8') as file:
	        json.dump(payload, file, indent=4, ensure_ascii=False)

	### set up a function to wrap up the code-interpreter.
	def _wait_on_run(client, run, thread, poll_interval=0.5):
	    """Poll ``run`` until its status is neither "queued" nor "in_progress"; return the refreshed run."""
	    while run.status in ("queued", "in_progress"):
	        time.sleep(poll_interval)
	        run = client.beta.threads.runs.retrieve(
	            thread_id=thread.id,
	            run_id=run.id,
	        )
	    return run


	def _find_image_positions(messages):
	    """Return (message_index, content_index) pairs for every content part carrying an ``image_file``."""
	    return [
	        (i, j)
	        for i, message in enumerate(messages.data)
	        for j, part in enumerate(message.content)
	        if getattr(part, "image_file", None)
	    ]


	def _collect_assistant_texts(messages):
	    """Return the text values of all assistant messages, oldest message first."""
	    texts = []
	    # The list call below uses the API's default ordering (newest first per the
	    # OpenAI reference), so reverse it to get chronological order.
	    for message in reversed(messages.data):
	        if getattr(message, "role", None) != "assistant":
	            continue  # skip the user's own prompt
	        for part in message.content:
	            text = getattr(part, "text", None)
	            if text:
	                texts.append(text.value)
	    return texts


	def openai_assistant(prompt=None, filepath=None, username=None):
	    """Run one code-interpreter session on a file and collect the results.

	    Uploads ``filepath``, creates an assistant with the code_interpreter tool,
	    starts a thread containing ``prompt``, waits for the run to finish, then
	    gathers the generated images and assistant text and asks the project's
	    ``chatgpt`` helper to condense the text.

	    Args:
	        prompt: Content of the user message sent to the assistant.
	        filepath: Path of the local file to upload (opened in binary mode).
	        username: Currently unused; kept so existing callers keep working.

	    Returns:
	        A 5-tuple ``(messages, text_response, image_response, image_files,
	        final_answer)``:
	        - messages: the raw message list returned for the thread.
	        - text_response: all assistant text parts in chronological order,
	          separated by blank lines.
	        - image_response: always an empty list (local image loading is disabled).
	        - image_files: downloaded file-content responses, one per image part.
	        - final_answer: whatever ``chatgpt.chatgpt`` returns for the summary prompt.
	    """
	    client = Client()

	    # Close the local handle as soon as the upload request has completed.
	    with open(filepath, 'rb') as upload:
	        uploaded_file = client.files.create(
	            file=upload,
	            purpose='assistants',
	        )

	    ### Create an assistant.
	    assistant = client.beta.assistants.create(
	        name="AI Expert",
	        instructions="""
	你是一个强大的AI助手。当被问到一个问题时，你需要根据提供给你的文件中的信息来回答这个问题。如果我没有告诉你任何定制化的要求，那么请你按照以下的默认要求来回答：
	-------------------------------------------------------------------------
	1. 你需要用我提问的语言来回答。
	2. 如果要求你输出图表，那么图的解析度dpi需要设定为600。图尽量使用seaborn库。
	3. 图表上如果有非英文的文字，那么你需要将字体翻译为英文，然后显示。
	4. 你回答的文字内容必须尽可能的详细且通俗易懂。
	5. 回答时尽可能地展示分析所对应的图表，并提供分析结果。你需要按如下格式提供内容：
	5.1 提供详细且专业的分析结果，提供足够的分析依据。
	5.2 给出可能造成这一结果的可能原因有哪些？
	以上内容全部用1, 2, 3这样的序列号格式来表达。
	""",
	        tools=[{"type": "code_interpreter"}],
	        # Alternative model: "gpt-4-1106-preview".
	        # NOTE (from the original author): retrieval needs this model version or later.
	        model="gpt-3.5-turbo-1106",
	        file_ids=[uploaded_file.id],
	    )

	    ### The file id must also be attached to the user message.
	    thread = client.beta.threads.create(
	        messages=[
	            {
	                "role": "user",
	                "content": prompt,
	                "file_ids": [uploaded_file.id],
	            }
	        ],
	    )
	    run = client.beta.threads.runs.create(
	        thread_id=thread.id,
	        assistant_id=assistant.id,
	    )

	    ## NOTE: the run is asynchronous; without waiting, the thread would only
	    ## contain the user's prompt.
	    run = _wait_on_run(client, run, thread)
	    if run.status != "completed":
	        # Keep going (best effort, as before) but make the abnormal end visible.
	        print('run finished with status:', run.status)

	    ### Retrieve the messages from OpenAI.
	    messages = client.beta.threads.messages.list(thread_id=thread.id)

	    show_json(name='messages:',obj=messages)
	    print('--'*40)
	    save_json(obj=messages)

	    ### Find all image files inside the returned messages.
	    image_response = []  # kept for interface compatibility; never populated
	    imagefile_position = _find_image_positions(messages)
	    imagefile_count = len(imagefile_position)

	    print('--'*30)
	    print("总共有几张图片？：", imagefile_count)
	    print('--'*30)

	    print('start the image and text repsonse process!')

	    image_files = []
	    for x, y in imagefile_position:
	        print('x,y=', x, y)
	        try:
	            # Download the generated image by the file id stored in the message part.
	            image_file = client.files.content(
	                file_id=messages.data[x].content[y].image_file.file_id
	            )
	        except Exception as e:
	            # Best effort: one failed download must not abort the whole response.
	            print(f"An error occurred: {e}")
	        else:
	            image_files.append(image_file)
	            ## NOTE: writing the image to a local per-user directory is intentionally disabled.

	    ### Consolidate the text response.
	    final_msg = _collect_assistant_texts(messages)
	    text_response_num = len(final_msg)
	    text_response = "\n\n".join(final_msg)
	    print('final_msg:', final_msg)
	    print('总共有几个text response：', text_response_num)

	    ## Let the LLM helper polish the combined text into one conclusion.
	    import chatgpt
	    user_prompt = f"""首先，我会向你提供一段【文字内容】，这段文字中可能包括了一系列的多轮对话的内容。接着，我需要你根据这段文字中的内容整理成一段文字结论。你的回答风格需要很专业，包括：尽可能的包含统计数据、数字和专业的结论，不能有口语化的表达。【文字内容】如下{text_response}。"""
	    final_answer = chatgpt.chatgpt(user_prompt=user_prompt)

	    return messages, text_response, image_response, image_files, final_answer


	# filepath = '/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/code_interpreter/rawdata/iris.csv'
	# # # # # # # prompt = "analyze the dataset and plot something interesting."

	# prompt = "analyze the dataset and plot something interesting. give me at least 2 plots. please reply in Chinese instead of English."
	# # # # # # # messages, text_reponse, image_response = openai_assistant(filepath=filepath,username='joeshi') ## working.
	# messages, text_response, image_response, image_files, final_answer = openai_assistant(prompt, filepath=filepath,username='joeshi') ## probable multiple images and text responses.

	# print("--"*40)
	# print('final_answer:', final_answer)
	# print("--"*40)


	# # # '''temp work area'''
	# # # # msg_json = show_json(messages)
	# # show_json(name='messages:',obj=messages)

	# # # messages.data[0].content[0].image_file
	# # # messages.data[0].content[2].text.value

	# # # tt = text_response

	# # # import re
	# # # regex = re.compile(r'(\n\n)\|(\n1)')
	# # # def convert_markdown(text):
	# # # """
	# # # 将markdown格式的文字转换成可读性强的文字格式。
	# # # Args:
	# # # text: markdown格式的文字。
	# # # Returns:
	# # # 可读性强的文字。
	# # # """
	# # # result = re.sub(regex, lambda m: m.group(1) if m.group(1) else '\n1.', text)
	# # # return result
	# # # print(convert_markdown(str(text_response)))

	# # # messages.data[1].content[1].text.value

	# my_msg = []
	# for x in range(5):
	# for y in range(5):
	# try:
	# if messages.data[x].content[y].text:
	# print('x, y=', x, y)
	# my_msg.append(messages.data[x].content[y].text.value)
	# # else:
	# # continue
	# except:
	# pass
	# print(sorted(my_msg[:-1], reverse=True))

	# msg = str()
	# for i in range(len(final_msg)):
	# msg += final_msg[i]

	# print(msg)
	# type(msg)