Spaces:

binaychandra
/

datasenseapp

Sleeping

App Files Files Community

datasenseapp / lang_assistant /langhelper.py

binaychandra

Update lang_assistant/langhelper.py

6551d94 verified over 1 year ago

raw

history blame contribute delete

20.2 kB

	import langchain
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts import ChatPromptTemplate
	import openai
	from langchain.memory import ConversationBufferMemory
	import pandas as pd
	from langchain import memory
	import sqlite3
	import os
	import json
	from langchain.agents import create_pandas_dataframe_agent
	from langchain.agents.agent_types import AgentType
	import io
	import logging
	import requests
	import urllib3
	from langchain.prompts import ChatPromptTemplate
	from langchain.output_parsers import ResponseSchema
	from langchain.output_parsers import StructuredOutputParser
	from langchain.callbacks import LLMonitorCallbackHandler
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts.chat import SystemMessage, HumanMessagePromptTemplate, HumanMessage, AIMessage
	from langchain.agents import create_pandas_dataframe_agent

	#current_model_id = 'gpt-4o-mini'

	from dotenv import load_dotenv, find_dotenv
	_ = load_dotenv(find_dotenv()) # read local .env file

	current_model_id = os.getenv('model_id')

	def summary_extractor_from_df(df:pd.DataFrame)-> str:

	try:
	chatmodel = ChatOpenAI()
	template = "You are an AI assistant and your task is to summarize the workforce distribution based on gender delimited by triple backticks \
	and output should clearly indicate how much percentage one gender is higher than other one, and based on the findings make some comments on includsiveness \
	if this is good for company or not.. limit the output in 50 words```{json_genderdf}```"
	prompt_template = ChatPromptTemplate.from_template(template)

	user_message = prompt_template.format_messages(json_genderdf = df)
	response = chatmodel(user_message)
	response_content = response.content
	except Exception as e:
	response_content = f"Gender statistics reveal a balanced representation within our dataset, with males comprising 56% \
	and females 44%. This equitable distribution indicates a positive representation across genders, \
	fostering diversity and inclusivity within our data."
	return response_content

	def chat_response(text:str)->str:
	chatmodel = ChatOpenAI(max_tokens=50)
	response = chatmodel.predict(text)
	return response

	def chat_with_df(query, table_name = None):
	if table_name is None:
	df = pd.read_csv(r"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
	else:
	#get data
	# con = sqlite3.connect("database.db")
	# df = pd.read_sql_query(f"SELECT * from {table_name}", con)
	print(f"----------table_name : {table_name}-----------")
	filename = f"referencefiles/{table_name}.csv"
	df = pd.read_csv(filename)

	agent = create_pandas_dataframe_agent(
	ChatOpenAI(temperature=0, model=current_model_id),
	df,
	agent_type=AgentType.OPENAI_FUNCTIONS,
	verbose=True
	)
	try:
	res_query = agent.run(query)
	except Exception as e:
	print(str(e))
	res_query = f"ERROR : {str(e)[:150]}"

	return res_query

	def gen_chartdata(df_out, label, chart_type):

	print("Inside gen_chartdata.....")
	print(f"df_out --> {df_out}")
	data_dict = {
	"labels": None,
	"datasets": [
	{
	"label": "Default Label",
	"data": None,
	"type" : chart_type,
	"backgroundColor": ["rgba(250, 240, 230, 0.7)"],
	"hoverBackgroundColor": ["rgba(250, 240, 230, 1)"],
	"borderColor": ["rgba(250, 240, 230, 1)"]
	}
	]
	}
	print(f"df_out.iloc[:,0].to_list() ---> {df_out.iloc[:,0].to_list()}")
	print(f"data_dict[\"datasets\"][0][\"label\"] --> {data_dict['datasets'][0]['label']}")
	print(f"label : {label}")
	print(f"data_dict['datasets'][0]['data'] --> {data_dict['datasets'][0]['data']}")
	print(f"df_out.iloc[:,1].to_list() --> {df_out.iloc[:,1].to_list()}")

	data_dict["labels"] = df_out.iloc[:,0].to_list()
	data_dict["datasets"][0]["label"] = label
	data_dict["datasets"][0]["data"] = df_out.iloc[:,1].to_list()
	if chart_type == 'doughnut':
	data_dict["datasets"][0]["backgroundColor"] = ["rgba(120, 214, 198, 0.7)", "rgba(255, 105, 105, 0.7)", "rgba(150, 194, 145, 0.7)", "rgba(250, 240, 230, 0.7)"]
	data_dict["datasets"][0]["hoverBackgroundColor"] = ["rgba(120, 214, 198, 1)", "rgba(255, 105, 105, 1)", "rgba(150, 194, 145, 1)", "rgba(250, 240, 230, 1)"]
	data_dict["datasets"][0]["borderColor"] = ["rgba(120, 214, 198, 1)", "rgba(255, 105, 105, 1)", "rgba(150, 194, 145, 1)", "rgba(250, 240, 230, 1)"]

	# Serializing json
	json_object = json.dumps(data_dict, indent = 4)

	return json_object

	def generate_graphdata(query, table_name = None):
	if table_name is None:
	df = pd.read_csv(r"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
	else:
	#get data from sqlite db
	#con = sqlite3.connect("database.db")
	#df = pd.read_sql_query(f"SELECT * from {table_name}", con)
	print(f"----------table_name : {table_name}-----------")
	filename = f"referencefiles/{table_name}.csv"
	df = pd.read_csv(filename)

	jsondf = df.head(5).to_json(orient='records')

	prompt_template = ChatPromptTemplate.from_messages(
	[
	SystemMessage(
	content=(
	"You are a data manager and you have to assign the task to a pandas coder \
	who will be working on the steps provided by you. Assuming the pandas coder has already loaded \
	the dataset in pandas dataframe, your task is to properly provide the steps with \
	proper column names so that pandas coder can easily understand the ask and code till the processing of dataset\
	which can be used for graph creation. include steps only to process the data \
	THE STEPS SHOULD NOT CONTAIN ANY GRAPH CREATION OR PLOTTING INSTRUCTIONS.. \
	always select columns only that are mentioned \
	not any extra columns. also think through if output contains two columns if not then create the steps to \
	only include two appropriate columns in output. Also do not include any steps to select top few records.. \
	the output should be all records from the steps execution.")
	),
	HumanMessage(content=("\
	user_input = ```plot the distribution of males and females who survived```\
	use below dataset for reference delimited by <<<>>>\
	<<<[{\"PassengerId\":1,\"Survived\":0,\"Pclass\":3,\"Name\":\"Braund, Mr. Owen Harris\",\"Sex\":\"male\",\"Age\":22.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"A\/5 21171\",\"Fare\":7.25,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":2,\"Survived\":1,\"Pclass\":1,\"Name\":\"Cumings, Mrs. John Bradley (Florence Briggs Thayer)\",\"Sex\":\"female\",\"Age\":38.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"PC 17599\",\"Fare\":71.2833,\"Cabin\":\"C85\",\"Embarked\":\"C\"},{\"PassengerId\":3,\"Survived\":1,\"Pclass\":3,\"Name\":\"Heikkinen, Miss. Laina\",\"Sex\":\"female\",\"Age\":26.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"STON\/O2. 3101282\",\"Fare\":7.925,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":4,\"Survived\":1,\"Pclass\":1,\"Name\":\"Futrelle, Mrs. Jacques Heath (Lily May Peel)\",\"Sex\":\"female\",\"Age\":35.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"113803\",\"Fare\":53.1,\"Cabin\":\"C123\",\"Embarked\":\"S\"},{\"PassengerId\":5,\"Survived\":0,\"Pclass\":3,\"Name\":\"Allen, Mr. William Henry\",\"Sex\":\"male\",\"Age\":35.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"373450\",\"Fare\":8.05,\"Cabin\":null,\"Embarked\":\"S\"}]>>>\
	{format_instructions}\
	from these inputs extract the following information\
	steps: the steps to be performed for the execution of task\
	label: suggest label to be displayed on the graph for the required ask in less than 3 words\
	chart_type: extract what type of chart user want to see.. if not specified then suggest which one \
	would be suitable for the task.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\
	output text if graph cant be plotted or user required the answer in text format")),
	AIMessage(content=("{\n\t\"steps\": \"1. Filter the dataframe to include only the rows where \'Survived\' column is equal to 1.\\n\
	2. Group the filtered dataframe by \'Sex\' column.\\n\
	3. Count the number of occurrences of each unique value in the \'Sex\' column.\\n\
	4. Create a new dataframe with two columns: \'Sex\' and \'Count\', where \'Sex\' contains the unique values from the \'Sex\' column and \'Count\' contains the corresponding counts.\\n\
	5. Select top two records from the dataframe.\",\\n\
	6. Convert the dataset to Json and create the bar graph.\"t\
	\"label\": \"Distribution of Males and Females who Survived\",\\n\t\
	\"chart_type\": \"bar\"\\n}")),
	HumanMessage(content=("The output seems not correct. the provided steps include importing matplotlib and creating the bar diagram\
	which was clearly mentioned not to include. Also steps include selecting top two records where it was mentioned \
	not to select top few records for output. you should output all records unless asked otherwise. Also steps include \
	the conversion to json which should not have been included. the output should be only pandas dataframe..\
	can you correct these and provide the response.")),
	AIMessage(content=("{\n\t\"steps\": 1. Filter the dataframe to include only the rows where \'Survived\' column is equal to 1.\\n\
	2. Group the filtered dataframe by \'Sex\' column.\\n\
	3. Count the number of occurrences of each unique value in the \'Sex\' column.\\n\
	4. Create a new dataframe with two columns: \'Sex\' and \'Count\', where \'Sex\' contains the unique values from the \'Sex\' column and \'Count\' contains the corresponding counts.\\n\
	5. Return the new dataframe.\",\\n\t\
	\"label\": \"Distribution of Males and Females who Survived\",\\n\t\
	\"chart_type\": \"bar\"\\n}")),
	HumanMessage(content=("Now the output looks perfect. This was just the way I wanted. Thanks a lot..")),
	AIMessage(content=("Thanks for the feedback, I will double check these things next time..")),
	HumanMessage(content=("\
	user_input = ```draw the bar chart for distribution of pclass and for only sex=males```\
	use below dataset for reference delimited by <<<>>>\
	<<<[{\"PassengerId\":1,\"Survived\":0,\"Pclass\":3,\"Name\":\"Braund, Mr. Owen Harris\",\"Sex\":\"male\",\"Age\":22.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"A\/5 21171\",\"Fare\":7.25,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":2,\"Survived\":1,\"Pclass\":1,\"Name\":\"Cumings, Mrs. John Bradley (Florence Briggs Thayer)\",\"Sex\":\"female\",\"Age\":38.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"PC 17599\",\"Fare\":71.2833,\"Cabin\":\"C85\",\"Embarked\":\"C\"},{\"PassengerId\":3,\"Survived\":1,\"Pclass\":3,\"Name\":\"Heikkinen, Miss. Laina\",\"Sex\":\"female\",\"Age\":26.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"STON\/O2. 3101282\",\"Fare\":7.925,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":4,\"Survived\":1,\"Pclass\":1,\"Name\":\"Futrelle, Mrs. Jacques Heath (Lily May Peel)\",\"Sex\":\"female\",\"Age\":35.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"113803\",\"Fare\":53.1,\"Cabin\":\"C123\",\"Embarked\":\"S\"},{\"PassengerId\":5,\"Survived\":0,\"Pclass\":3,\"Name\":\"Allen, Mr. William Henry\",\"Sex\":\"male\",\"Age\":35.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"373450\",\"Fare\":8.05,\"Cabin\":null,\"Embarked\":\"S\"}]>>>\
	{format_instructions}\
	from these inputs extract the following information\
	steps: the steps to be performed for the execution of task\
	label: suggest label to be displayed on the graph for the required ask in less than 3 words\
	chart_type: extract what type of chart user want to see.. if not specified then suggest which one \
	would be suitable for the task.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\
	output text if graph cant be plotted or user required the answer in text format")),
	AIMessage(content=("{\"\n\t\"steps\": \"1. Filter the dataframe to include only rows where Sex is \'male\'\\n\
	2. Group the data by Pclass and count the number of occurrences\\n\
	3. Create a new dataframe with the Pclass and count columns\\n\
	4. Sort the dataframe by Pclass\\n\
	5. Output the resulting dataframe\",\n\t\
	\"label\": \"Distribution of Pclass for males\",\n\t\
	\"chart_type\": \"bar\"\n}")),
	HumanMessage(content=("The output looks perfect. This is exactly the response I was looking for.")),
	AIMessage(content=("Thanks! Now I am more confident about the requirement. ")),
	HumanMessagePromptTemplate.from_template(
	"user_input = ```{user_input}```\
	use sample dataset below for reference delimited by <<<>>>\
	<<<{reference_data}>>>\
	{format_instructions}\
	from these inputs extract the following information\
	steps: the steps to be performed for the execution of task. stritcly remember to not include any graph or plotting related instructions..\
	label: suggest label to be displayed on the graph for the required ask in less than 3 words\
	chart_type: extract what type of chart user want to see.. if not specified then suggest which one \
	would be suitable for the task.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\
	output text if graph cant be plotted or user required the answer in text format"
	),
	]
	)

	step_schema = ResponseSchema(name="steps",
	description="steps to be performed for the execution of task. stritcly remember to not include any graph or plotting related instructions..")
	label_schema = ResponseSchema(name="label",
	description="suggest label to be displayed on the graph for the required ask")
	chart_type_schema = ResponseSchema(name="chart_type",
	description="Suggest the type of chart to be created.. if user specified\
	any chart type, select that.. otherwise suggest what would be suitable for the user ask\
	From the user input.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\
	output text if graph cant be plotted or user required the answer in text format")

	response_schemas = [step_schema, label_schema, chart_type_schema]
	output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
	format_instructions = output_parser.get_format_instructions()

	customer_messages = prompt_template.format_messages(user_input=query,
	reference_data=jsondf,
	format_instructions=format_instructions)

	llm = ChatOpenAI(model = current_model_id, temperature = 0)
	response = llm(customer_messages)
	json_pandassteps_charttype = output_parser.parse(response.content)
	print("pandas steps and charttype" , json_pandassteps_charttype)
	agent = create_pandas_dataframe_agent(
	ChatOpenAI(temperature=0, model=current_model_id),
	df,
	agent_type=AgentType.OPENAI_FUNCTIONS,
	verbose=True,
	return_intermediate_steps=True
	)
	try:
	if json_pandassteps_charttype['chart_type'] != "text":
	agent_response = agent({
	'input': f"Remove any steps related to charts or plotting from the steps mentioned in triple backticks\
	and Execute the Python script combining all the steps after remoing the chart related steps\
	Please don't use any print statements. Output should be a pandas dataframe only \
	steps to be performed : {json_pandassteps_charttype['steps']}"
	})
	else:
	chart_type, chart_label, chart_json_data = "text", None, None
	print("The text block has been activated")
	text_to_display = chat_with_df(query, table_name)
	success = True
	except langchain.schema.output_parser.OutputParserException as e:
	success = False
	chart_type, chart_label, chart_json_data = "text", None, None
	text_to_display = "OutputParserException : " + str(e)[:25] + "...."
	print(f"OutputParserException Occurred --> {str(e)}")
	except Exception as e:
	success = False
	chart_type, chart_label, chart_json_data = "text", None, None
	text_to_display = "AgentException : " + str(e)[:25] + "...."
	print(f"Exception Occurred --> {str(e)}")
	else:
	graph_data_df = agent_response['intermediate_steps'][-1][-1]
	print("intermediate steps " , agent_response['intermediate_steps'])
	if isinstance(graph_data_df, pd.DataFrame):
	final_data = dataframe_sanitizer(graph_data_df)
	chart_label = json_pandassteps_charttype['label']
	chart_type = json_pandassteps_charttype['chart_type']
	chart_json_data = gen_chartdata(final_data, chart_label, chart_type)
	success = True
	text_to_display = None
	else:
	chart_type, chart_label, chart_json_data = "text", None, None
	text_to_display = graph_data_df
	success = True
	finally:
	final_response = dict(success=success,
	chart_type=chart_type,
	chart_label=chart_label,
	chart_json_data=chart_json_data,
	text_to_display=text_to_display)
	return final_response

	def dataframe_sanitizer(raw_dataframe):
	# Check if reset index is required or not
	new_df = raw_dataframe.copy()
	if new_df.index.name:
	new_df = new_df.reset_index()

	print("new_df --> ", new_df)
	# Rearrange the columns as object and int
	object_columns = [col for col, dtype in new_df.dtypes.items() if dtype not in ['int8','int16','int32','int64', 'float64', 'float32']]
	int_columns = [col for col, dtype in new_df.dtypes.items() if dtype in ['int8','int16','int32','int64', 'float64', 'float32']]
	reordered_columns = object_columns + int_columns
	print(f"object Columns , reordered columns : {object_columns}, {reordered_columns}")
	new_df_reordered = new_df[reordered_columns]
	print(f"New ordered columns : {new_df_reordered}")

	#Check if first column is object and second column is int
	try:
	if (new_df_reordered.shape[1] != 2) & (new_df_reordered.dtypes[1] not in ('int8','int16','int32','int64', 'float64', 'float32')):
	print("========Exception in shape and type==========")
	return_string = f"The output dataframe has columns count as {new_df_reordered.shape[1]}..\
	and the datatypes of columns present as {new_df_reordered.dtypes}..\
	so cant proceed further with plotting the same.."
	print(return_string)
	raise Exception(return_string)

	else:
	new_df_reordered[new_df_reordered.columns[0]] = new_df_reordered[new_df_reordered.columns[0]].astype(str)
	return new_df_reordered
	except Exception as e:
	print("Exception occurred" , str(e))