Spaces:
Sleeping
Sleeping
| import langchain | |
| from langchain.chat_models import ChatOpenAI | |
| from langchain.prompts import ChatPromptTemplate | |
| import openai | |
| from langchain.memory import ConversationBufferMemory | |
| import pandas as pd | |
| from langchain import memory | |
| import sqlite3 | |
| import os | |
| import json | |
| from langchain.agents import create_pandas_dataframe_agent | |
| from langchain.agents.agent_types import AgentType | |
| import io | |
| import logging | |
| import requests | |
| import urllib3 | |
| from langchain.prompts import ChatPromptTemplate | |
| from langchain.output_parsers import ResponseSchema | |
| from langchain.output_parsers import StructuredOutputParser | |
| from langchain.callbacks import LLMonitorCallbackHandler | |
| from langchain.chat_models import ChatOpenAI | |
| from langchain.prompts.chat import SystemMessage, HumanMessagePromptTemplate, HumanMessage, AIMessage | |
| from langchain.agents import create_pandas_dataframe_agent | |
| #current_model_id = 'gpt-4o-mini' | |
| from dotenv import load_dotenv, find_dotenv | |
| _ = load_dotenv(find_dotenv()) # read local .env file | |
| current_model_id = os.getenv('model_id') | |
| def summary_extractor_from_df(df:pd.DataFrame)-> str: | |
| try: | |
| chatmodel = ChatOpenAI() | |
| template = "You are an AI assistant and your task is to summarize the workforce distribution based on gender delimited by triple backticks \ | |
| and output should clearly indicate how much percentage one gender is higher than other one, and based on the findings make some comments on includsiveness \ | |
| if this is good for company or not.. limit the output in 50 words```{json_genderdf}```" | |
| prompt_template = ChatPromptTemplate.from_template(template) | |
| user_message = prompt_template.format_messages(json_genderdf = df) | |
| response = chatmodel(user_message) | |
| response_content = response.content | |
| except Exception as e: | |
| response_content = f"Gender statistics reveal a balanced representation within our dataset, with males comprising 56% \ | |
| and females 44%. This equitable distribution indicates a positive representation across genders, \ | |
| fostering diversity and inclusivity within our data." | |
| return response_content | |
| def chat_response(text:str)->str: | |
| chatmodel = ChatOpenAI(max_tokens=50) | |
| response = chatmodel.predict(text) | |
| return response | |
| def chat_with_df(query, table_name = None): | |
| if table_name is None: | |
| df = pd.read_csv(r"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv") | |
| else: | |
| #get data | |
| # con = sqlite3.connect("database.db") | |
| # df = pd.read_sql_query(f"SELECT * from {table_name}", con) | |
| print(f"----------table_name : {table_name}-----------") | |
| filename = f"referencefiles/{table_name}.csv" | |
| df = pd.read_csv(filename) | |
| agent = create_pandas_dataframe_agent( | |
| ChatOpenAI(temperature=0, model=current_model_id), | |
| df, | |
| agent_type=AgentType.OPENAI_FUNCTIONS, | |
| verbose=True | |
| ) | |
| try: | |
| res_query = agent.run(query) | |
| except Exception as e: | |
| print(str(e)) | |
| res_query = f"ERROR : {str(e)[:150]}" | |
| return res_query | |
| def gen_chartdata(df_out, label, chart_type): | |
| print("Inside gen_chartdata.....") | |
| print(f"df_out --> {df_out}") | |
| data_dict = { | |
| "labels": None, | |
| "datasets": [ | |
| { | |
| "label": "Default Label", | |
| "data": None, | |
| "type" : chart_type, | |
| "backgroundColor": ["rgba(250, 240, 230, 0.7)"], | |
| "hoverBackgroundColor": ["rgba(250, 240, 230, 1)"], | |
| "borderColor": ["rgba(250, 240, 230, 1)"] | |
| } | |
| ] | |
| } | |
| print(f"df_out.iloc[:,0].to_list() ---> {df_out.iloc[:,0].to_list()}") | |
| print(f"data_dict[\"datasets\"][0][\"label\"] --> {data_dict['datasets'][0]['label']}") | |
| print(f"label : {label}") | |
| print(f"data_dict['datasets'][0]['data'] --> {data_dict['datasets'][0]['data']}") | |
| print(f"df_out.iloc[:,1].to_list() --> {df_out.iloc[:,1].to_list()}") | |
| data_dict["labels"] = df_out.iloc[:,0].to_list() | |
| data_dict["datasets"][0]["label"] = label | |
| data_dict["datasets"][0]["data"] = df_out.iloc[:,1].to_list() | |
| if chart_type == 'doughnut': | |
| data_dict["datasets"][0]["backgroundColor"] = ["rgba(120, 214, 198, 0.7)", "rgba(255, 105, 105, 0.7)", "rgba(150, 194, 145, 0.7)", "rgba(250, 240, 230, 0.7)"] | |
| data_dict["datasets"][0]["hoverBackgroundColor"] = ["rgba(120, 214, 198, 1)", "rgba(255, 105, 105, 1)", "rgba(150, 194, 145, 1)", "rgba(250, 240, 230, 1)"] | |
| data_dict["datasets"][0]["borderColor"] = ["rgba(120, 214, 198, 1)", "rgba(255, 105, 105, 1)", "rgba(150, 194, 145, 1)", "rgba(250, 240, 230, 1)"] | |
| # Serializing json | |
| json_object = json.dumps(data_dict, indent = 4) | |
| return json_object | |
| def generate_graphdata(query, table_name = None): | |
| if table_name is None: | |
| df = pd.read_csv(r"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv") | |
| else: | |
| #get data from sqlite db | |
| #con = sqlite3.connect("database.db") | |
| #df = pd.read_sql_query(f"SELECT * from {table_name}", con) | |
| print(f"----------table_name : {table_name}-----------") | |
| filename = f"referencefiles/{table_name}.csv" | |
| df = pd.read_csv(filename) | |
| jsondf = df.head(5).to_json(orient='records') | |
| prompt_template = ChatPromptTemplate.from_messages( | |
| [ | |
| SystemMessage( | |
| content=( | |
| "You are a data manager and you have to assign the task to a pandas coder \ | |
| who will be working on the steps provided by you. Assuming the pandas coder has already loaded \ | |
| the dataset in pandas dataframe, your task is to properly provide the steps with \ | |
| proper column names so that pandas coder can easily understand the ask and code till the processing of dataset\ | |
| which can be used for graph creation. include steps only to process the data \ | |
| THE STEPS SHOULD NOT CONTAIN ANY GRAPH CREATION OR PLOTTING INSTRUCTIONS.. \ | |
| always select columns only that are mentioned \ | |
| not any extra columns. also think through if output contains two columns if not then create the steps to \ | |
| only include two appropriate columns in output. Also do not include any steps to select top few records.. \ | |
| the output should be all records from the steps execution.") | |
| ), | |
| HumanMessage(content=("\ | |
| user_input = ```plot the distribution of males and females who survived```\ | |
| use below dataset for reference delimited by <<<>>>\ | |
| <<<[{\"PassengerId\":1,\"Survived\":0,\"Pclass\":3,\"Name\":\"Braund, Mr. Owen Harris\",\"Sex\":\"male\",\"Age\":22.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"A\/5 21171\",\"Fare\":7.25,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":2,\"Survived\":1,\"Pclass\":1,\"Name\":\"Cumings, Mrs. John Bradley (Florence Briggs Thayer)\",\"Sex\":\"female\",\"Age\":38.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"PC 17599\",\"Fare\":71.2833,\"Cabin\":\"C85\",\"Embarked\":\"C\"},{\"PassengerId\":3,\"Survived\":1,\"Pclass\":3,\"Name\":\"Heikkinen, Miss. Laina\",\"Sex\":\"female\",\"Age\":26.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"STON\/O2. 3101282\",\"Fare\":7.925,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":4,\"Survived\":1,\"Pclass\":1,\"Name\":\"Futrelle, Mrs. Jacques Heath (Lily May Peel)\",\"Sex\":\"female\",\"Age\":35.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"113803\",\"Fare\":53.1,\"Cabin\":\"C123\",\"Embarked\":\"S\"},{\"PassengerId\":5,\"Survived\":0,\"Pclass\":3,\"Name\":\"Allen, Mr. William Henry\",\"Sex\":\"male\",\"Age\":35.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"373450\",\"Fare\":8.05,\"Cabin\":null,\"Embarked\":\"S\"}]>>>\ | |
| {format_instructions}\ | |
| from these inputs extract the following information\ | |
| steps: the steps to be performed for the execution of task\ | |
| label: suggest label to be displayed on the graph for the required ask in less than 3 words\ | |
| chart_type: extract what type of chart user want to see.. if not specified then suggest which one \ | |
| would be suitable for the task.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\ | |
| output text if graph cant be plotted or user required the answer in text format")), | |
| AIMessage(content=("{\n\t\"steps\": \"1. Filter the dataframe to include only the rows where \'Survived\' column is equal to 1.\\n\ | |
| 2. Group the filtered dataframe by \'Sex\' column.\\n\ | |
| 3. Count the number of occurrences of each unique value in the \'Sex\' column.\\n\ | |
| 4. Create a new dataframe with two columns: \'Sex\' and \'Count\', where \'Sex\' contains the unique values from the \'Sex\' column and \'Count\' contains the corresponding counts.\\n\ | |
| 5. Select top two records from the dataframe.\",\\n\ | |
| 6. Convert the dataset to Json and create the bar graph.\"t\ | |
| \"label\": \"Distribution of Males and Females who Survived\",\\n\t\ | |
| \"chart_type\": \"bar\"\\n}")), | |
| HumanMessage(content=("The output seems not correct. the provided steps include importing matplotlib and creating the bar diagram\ | |
| which was clearly mentioned not to include. Also steps include selecting top two records where it was mentioned \ | |
| not to select top few records for output. you should output all records unless asked otherwise. Also steps include \ | |
| the conversion to json which should not have been included. the output should be only pandas dataframe..\ | |
| can you correct these and provide the response.")), | |
| AIMessage(content=("{\n\t\"steps\": 1. Filter the dataframe to include only the rows where \'Survived\' column is equal to 1.\\n\ | |
| 2. Group the filtered dataframe by \'Sex\' column.\\n\ | |
| 3. Count the number of occurrences of each unique value in the \'Sex\' column.\\n\ | |
| 4. Create a new dataframe with two columns: \'Sex\' and \'Count\', where \'Sex\' contains the unique values from the \'Sex\' column and \'Count\' contains the corresponding counts.\\n\ | |
| 5. Return the new dataframe.\",\\n\t\ | |
| \"label\": \"Distribution of Males and Females who Survived\",\\n\t\ | |
| \"chart_type\": \"bar\"\\n}")), | |
| HumanMessage(content=("Now the output looks perfect. This was just the way I wanted. Thanks a lot..")), | |
| AIMessage(content=("Thanks for the feedback, I will double check these things next time..")), | |
| HumanMessage(content=("\ | |
| user_input = ```draw the bar chart for distribution of pclass and for only sex=males```\ | |
| use below dataset for reference delimited by <<<>>>\ | |
| <<<[{\"PassengerId\":1,\"Survived\":0,\"Pclass\":3,\"Name\":\"Braund, Mr. Owen Harris\",\"Sex\":\"male\",\"Age\":22.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"A\/5 21171\",\"Fare\":7.25,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":2,\"Survived\":1,\"Pclass\":1,\"Name\":\"Cumings, Mrs. John Bradley (Florence Briggs Thayer)\",\"Sex\":\"female\",\"Age\":38.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"PC 17599\",\"Fare\":71.2833,\"Cabin\":\"C85\",\"Embarked\":\"C\"},{\"PassengerId\":3,\"Survived\":1,\"Pclass\":3,\"Name\":\"Heikkinen, Miss. Laina\",\"Sex\":\"female\",\"Age\":26.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"STON\/O2. 3101282\",\"Fare\":7.925,\"Cabin\":null,\"Embarked\":\"S\"},{\"PassengerId\":4,\"Survived\":1,\"Pclass\":1,\"Name\":\"Futrelle, Mrs. Jacques Heath (Lily May Peel)\",\"Sex\":\"female\",\"Age\":35.0,\"SibSp\":1,\"Parch\":0,\"Ticket\":\"113803\",\"Fare\":53.1,\"Cabin\":\"C123\",\"Embarked\":\"S\"},{\"PassengerId\":5,\"Survived\":0,\"Pclass\":3,\"Name\":\"Allen, Mr. William Henry\",\"Sex\":\"male\",\"Age\":35.0,\"SibSp\":0,\"Parch\":0,\"Ticket\":\"373450\",\"Fare\":8.05,\"Cabin\":null,\"Embarked\":\"S\"}]>>>\ | |
| {format_instructions}\ | |
| from these inputs extract the following information\ | |
| steps: the steps to be performed for the execution of task\ | |
| label: suggest label to be displayed on the graph for the required ask in less than 3 words\ | |
| chart_type: extract what type of chart user want to see.. if not specified then suggest which one \ | |
| would be suitable for the task.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\ | |
| output text if graph cant be plotted or user required the answer in text format")), | |
| AIMessage(content=("{\"\n\t\"steps\": \"1. Filter the dataframe to include only rows where Sex is \'male\'\\n\ | |
| 2. Group the data by Pclass and count the number of occurrences\\n\ | |
| 3. Create a new dataframe with the Pclass and count columns\\n\ | |
| 4. Sort the dataframe by Pclass\\n\ | |
| 5. Output the resulting dataframe\",\n\t\ | |
| \"label\": \"Distribution of Pclass for males\",\n\t\ | |
| \"chart_type\": \"bar\"\n}")), | |
| HumanMessage(content=("The output looks perfect. This is exactly the response I was looking for.")), | |
| AIMessage(content=("Thanks! Now I am more confident about the requirement. ")), | |
| HumanMessagePromptTemplate.from_template( | |
| "user_input = ```{user_input}```\ | |
| use sample dataset below for reference delimited by <<<>>>\ | |
| <<<{reference_data}>>>\ | |
| {format_instructions}\ | |
| from these inputs extract the following information\ | |
| steps: the steps to be performed for the execution of task. stritcly remember to not include any graph or plotting related instructions..\ | |
| label: suggest label to be displayed on the graph for the required ask in less than 3 words\ | |
| chart_type: extract what type of chart user want to see.. if not specified then suggest which one \ | |
| would be suitable for the task.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\ | |
| output text if graph cant be plotted or user required the answer in text format" | |
| ), | |
| ] | |
| ) | |
| step_schema = ResponseSchema(name="steps", | |
| description="steps to be performed for the execution of task. stritcly remember to not include any graph or plotting related instructions..") | |
| label_schema = ResponseSchema(name="label", | |
| description="suggest label to be displayed on the graph for the required ask") | |
| chart_type_schema = ResponseSchema(name="chart_type", | |
| description="Suggest the type of chart to be created.. if user specified\ | |
| any chart type, select that.. otherwise suggest what would be suitable for the user ask\ | |
| From the user input.. should be one of these ['bar', 'line', 'doughnut', 'scatter', 'text']\ | |
| output text if graph cant be plotted or user required the answer in text format") | |
| response_schemas = [step_schema, label_schema, chart_type_schema] | |
| output_parser = StructuredOutputParser.from_response_schemas(response_schemas) | |
| format_instructions = output_parser.get_format_instructions() | |
| customer_messages = prompt_template.format_messages(user_input=query, | |
| reference_data=jsondf, | |
| format_instructions=format_instructions) | |
| llm = ChatOpenAI(model = current_model_id, temperature = 0) | |
| response = llm(customer_messages) | |
| json_pandassteps_charttype = output_parser.parse(response.content) | |
| print("pandas steps and charttype" , json_pandassteps_charttype) | |
| agent = create_pandas_dataframe_agent( | |
| ChatOpenAI(temperature=0, model=current_model_id), | |
| df, | |
| agent_type=AgentType.OPENAI_FUNCTIONS, | |
| verbose=True, | |
| return_intermediate_steps=True | |
| ) | |
| try: | |
| if json_pandassteps_charttype['chart_type'] != "text": | |
| agent_response = agent({ | |
| 'input': f"Remove any steps related to charts or plotting from the steps mentioned in triple backticks\ | |
| and Execute the Python script combining all the steps after remoing the chart related steps\ | |
| Please don't use any print statements. Output should be a pandas dataframe only \ | |
| steps to be performed : {json_pandassteps_charttype['steps']}" | |
| }) | |
| else: | |
| chart_type, chart_label, chart_json_data = "text", None, None | |
| print("The text block has been activated") | |
| text_to_display = chat_with_df(query, table_name) | |
| success = True | |
| except langchain.schema.output_parser.OutputParserException as e: | |
| success = False | |
| chart_type, chart_label, chart_json_data = "text", None, None | |
| text_to_display = "OutputParserException : " + str(e)[:25] + "...." | |
| print(f"OutputParserException Occurred --> {str(e)}") | |
| except Exception as e: | |
| success = False | |
| chart_type, chart_label, chart_json_data = "text", None, None | |
| text_to_display = "AgentException : " + str(e)[:25] + "...." | |
| print(f"Exception Occurred --> {str(e)}") | |
| else: | |
| graph_data_df = agent_response['intermediate_steps'][-1][-1] | |
| print("intermediate steps " , agent_response['intermediate_steps']) | |
| if isinstance(graph_data_df, pd.DataFrame): | |
| final_data = dataframe_sanitizer(graph_data_df) | |
| chart_label = json_pandassteps_charttype['label'] | |
| chart_type = json_pandassteps_charttype['chart_type'] | |
| chart_json_data = gen_chartdata(final_data, chart_label, chart_type) | |
| success = True | |
| text_to_display = None | |
| else: | |
| chart_type, chart_label, chart_json_data = "text", None, None | |
| text_to_display = graph_data_df | |
| success = True | |
| finally: | |
| final_response = dict(success=success, | |
| chart_type=chart_type, | |
| chart_label=chart_label, | |
| chart_json_data=chart_json_data, | |
| text_to_display=text_to_display) | |
| return final_response | |
| def dataframe_sanitizer(raw_dataframe): | |
| # Check if reset index is required or not | |
| new_df = raw_dataframe.copy() | |
| if new_df.index.name: | |
| new_df = new_df.reset_index() | |
| print("new_df --> ", new_df) | |
| # Rearrange the columns as object and int | |
| object_columns = [col for col, dtype in new_df.dtypes.items() if dtype not in ['int8','int16','int32','int64', 'float64', 'float32']] | |
| int_columns = [col for col, dtype in new_df.dtypes.items() if dtype in ['int8','int16','int32','int64', 'float64', 'float32']] | |
| reordered_columns = object_columns + int_columns | |
| print(f"object Columns , reordered columns : {object_columns}, {reordered_columns}") | |
| new_df_reordered = new_df[reordered_columns] | |
| print(f"New ordered columns : {new_df_reordered}") | |
| #Check if first column is object and second column is int | |
| try: | |
| if (new_df_reordered.shape[1] != 2) & (new_df_reordered.dtypes[1] not in ('int8','int16','int32','int64', 'float64', 'float32')): | |
| print("========Exception in shape and type==========") | |
| return_string = f"The output dataframe has columns count as {new_df_reordered.shape[1]}..\ | |
| and the datatypes of columns present as {new_df_reordered.dtypes}..\ | |
| so cant proceed further with plotting the same.." | |
| print(return_string) | |
| raise Exception(return_string) | |
| else: | |
| new_df_reordered[new_df_reordered.columns[0]] = new_df_reordered[new_df_reordered.columns[0]].astype(str) | |
| return new_df_reordered | |
| except Exception as e: | |
| print("Exception occurred" , str(e)) |