# Import packages and libraries import os import openai # from google.colab import userdata import mysql.connector as connection from langchain_openai import OpenAI from langchain.agents import create_sql_agent from langchain.agents import AgentExecutor from langchain.agents.agent_types import AgentType from langchain_community.utilities import SQLDatabase #from langchain.sql_database import SQLDatabase from langchain_community.agent_toolkits import SQLDatabaseToolkit from langchain_community.agent_toolkits.sql.prompt import SQL_FUNCTIONS_SUFFIX from langchain_core.messages import AIMessage, SystemMessage from langchain_core.prompts.chat import ( ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder, ) from langchain.agents import create_openai_tools_agent from langchain.agents.agent import AgentExecutor from langchain_openai import ChatOpenAI from langchain.prompts import PromptTemplate from langchain.schema import ( AIMessage, HumanMessage, SystemMessage ) import json import pandas as pd import urllib.parse import re # suppress all warnings import warnings warnings.filterwarnings("ignore") import gradio as gr # OpenAI API Key OPENAI_API_KEY = 'sk-proj-KZCd8lIDjM2YvBE8hHvCT3BlbkFJ4kcykY1Bx270oab70FSP' # SAJANI # Template TEMPLATE = """You are an agent designed to interact with a SQL database. Given an input question, create a syntactically correct MySQL query to run, then look at the results of the query and return the answer. Unless the user specifies in the question a specific number of results to obtain, query for at most 5 results using the LIMIT clause as per MySQL and return those 5 results as the Final Answer. You can order the results to return the most informative data in the database. If the user specifies in the question the number of examples to obtain, query for the specified amount of examples using the LIMIT clause as per MySQL and return those specified number of results as the Final Answer. Never query for all the columns from a specific table, only ask for the relevant columns given the question. To start you should ALWAYS look at the tables in the database to see what you can query. Do NOT skip this step. Then you should query the schema of the most relevant tables. If you are unsure about the answer (e.g. when multiple and repitive errors occur when using the below tools or when the SQL result fetching is unsuccesful), do not give imagined or dummy answers such as 'project ABC', 'credits 123'. Instead, simply state "Apologies. I cannot provide an answer to this question with the information I currently have." You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again. DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database. You have access to tools for interacting with the database. Only use the below tools. Only use the information returned by the below tools to construct your final answer. sql_db_list_tables: Input is an empty string, output is a comma separated list of tables in the database. sql_db_schema: Input to this tool is a comma-separated list of tables, output is the schema and sample rows for tables most relevant to the Question. Be sure that the tables actually exist by calling sql_db_list_tables first! Example Input: 'table1, table2, table3' sql_db_query: Input to this tool is a detailed and correct SQL query, output is a result from the database. If the query is not correct, an error message will be returned. If an error is returned, rewrite the query, check the query, and try again. If you encounter an issue with Unknown column 'xxxx' in 'field list', using sql_db_schema to query the correct table fields. sql_db_query_checker: Use this tool to double check if your query is correct before executing it. Always use this tool before executing a query with sql_db_query! ALWAYS execute the correct SQL query and get correct results from the database by using the sql_db_query tool. Use the following format: Question: "Question here" SQLQuery: "SQL Query to run" SQLResult: "Result of the SQLQuery" Final Answer: "Final answer here" With any SQLQuery that you execute always select and join the'registry_id' columns from relevant tables along with it and in the final answer, \ add the registry_ids relevant to the SQLResult to the answer. Use this format to append the relevant registry_ids at the end of the actual answer: \ "Project IDs relevant to this answer: "registry_ids here". Question: {input}""" # Manually split the URI components scheme = "mysql+mysqlconnector" username = "csp_db_ext" password = "Csp@145#" hostname = "3.130.217.226" port = 3306 database = "scrapped_csp_db" # Encode the password encoded_password = urllib.parse.quote(password, safe='') # Reconstruct the URI fixed_uri = f"{scheme}://{username}:{encoded_password}@{hostname}:{port}/{database}" # Using the fixed URI in SQLDatabase.from_uri db = SQLDatabase.from_uri(fixed_uri, ignore_tables=["crediting", "documents", "documenttypes", "draft_crediting", "draft_documents", "draft_locations", "draft_pdd", "draft_projects", "draft_proponents", "dropdowndata", "dropdowns", "locations", "pdd", "projects", "projectsectoralscopes", "projectstatus", "projecttypes", "proponents", "publish_project_status", "registerstatus", "seals", "user", "user_roles", 'companies', 'countryRegionTable', 'email_verify_tokens', 'temp_reset_token', 'project_favourites', 'numbers1', 'numbers2', 'numbers3', 'project_methodologies', 'project_sectoral_scopes', 'project_types'], sample_rows_in_table_info=0, view_support=True) # llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key = os.environ["OPENAI_API_KEY"]) # AMA llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key = OPENAI_API_KEY) # SAJANI #llm=OpenAI(temperature=0) toolkit = SQLDatabaseToolkit(db=db, llm=llm) context = toolkit.get_context() tools = toolkit.get_tools() messages = [ HumanMessagePromptTemplate.from_template(TEMPLATE), AIMessage(content=SQL_FUNCTIONS_SUFFIX), MessagesPlaceholder(variable_name="agent_scratchpad"),] prompt = ChatPromptTemplate.from_messages(messages) prompt = prompt.partial(**context) print(db.dialect) print(db.get_usable_table_names()) agent_executor = create_sql_agent(llm, db=db, agent_type="openai-tools", prompt = prompt, verbose=True, memory = None, top_k = 5) print(prompt) print(db.table_info) #Defining a function to fetch sample rows from database to be used in the relevance checker v2 function def fetch_sample_rows(db, tables, num_rows=5): sample_data = {} for table in tables: query = f"SELECT * FROM {table} LIMIT {num_rows}" # Use the 'run' method to execute the query result = db.run(query) sample_data[table] = result return sample_data # Initialize the database connection db = SQLDatabase.from_uri(fixed_uri, ignore_tables=["crediting", "documents", "documenttypes", "draft_crediting", "draft_documents", "draft_locations", "draft_pdd", "draft_projects", "draft_proponents", "dropdowndata", "dropdowns", "locations", "pdd", "projects", "projectsectoralscopes", "projectstatus", "projecttypes", "proponents", "publish_project_status", "registerstatus", "seals", "user", "user_roles", 'companies', 'countryRegionTable', 'email_verify_tokens', 'temp_reset_token', 'project_favourites', 'numbers1', 'numbers2', 'numbers3'], sample_rows_in_table_info=0, view_support=True) # # Fetch sample rows tables = ['crediting_data', 'project_data'] #'project_methodologies', 'project_sectoral_scopes', 'project_types' sample_data = fetch_sample_rows(db, tables) RELEVANCE_PROMPT = PromptTemplate( input_variables=["input", "sample_data"], template=""" You are an intelligent agent. Determine if the given question is relevant to a database of carbon projects. The database includes information on registry name, estimated emission reduction (i.e. estimated credits), project verifier/validator, crediting start/end dates, country, region and state/province of projects, generated credits (i.e. issued credits), retired credits, buffer credits, vintage start/end dates, and credit issuance dates. Here are some sample rows from the database: {sample_data} If the provided question is not in valid English, return "ANSWER_UNKNOWN". If the question involves criteria not related to the database (e.g., capital of countries, current presidents, relative/vague attributes like relative/vague attributes like 'richest person on earth'), quantitative answers/calculations not relevant to the database, or purely fictional characters, return "ANSWER_UNKNOWN". Otherwise, evaluate the relevance of the input and assign a score from 1 to 10, where 10 represents the highest level of relevance. Return only the relevance score. Examples of irrelevant questions: - What is the capital of Lesoto? - Who is the current president of China? - Who is the most handsome artist? - What is the wealth of Elon Musk? - How long would it take to fly to Asgard? - What did Snow White tell to Cinderella? - Who would win in a battle: Spiderman or Superman? Given question: {input} """ ) def check_relevance2(question): sample_data_str = json.dumps(sample_data, indent=2) relevance_check = RELEVANCE_PROMPT.format(input=question, sample_data=sample_data_str) # Pass the question as a HumanMessage object within a list response = llm([HumanMessage(content=relevance_check)]) return response.content.strip() # Extract the text content from the AIMessage object sample_data = { 'crediting_data': "[(54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2023, 1, 26), datetime.date(2020, 1, 1), None, None, 402, None, datetime.date(2023, 6, 6), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2023, 1, 26), datetime.date(2019, 1, 1), None, None, 29881, None, datetime.date(2023, 6, 6), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2023, 1, 26), datetime.date(2019, 1, 1), None, None, 14717, None, datetime.date(2023, 6, 6), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2020, 1, 3), datetime.date(2019, 1, 1), None, None, 125, None, datetime.date(2023, 9, 13), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2020, 1, 3), datetime.date(2019, 1, 1), None, None, 1, None, datetime.date(2022, 1, 13), None)]", 'project_data': "[(54048, 'CH255608', '4264', 822, None, 'Silivri WPP', 22, datetime.date(2021, 8, 20), datetime.date(2028, 8, 19), '80442', 735, None, 763, '28.0', '41.0', 'Gold Standard', 'Turkey', 'Europe', 'Certified Project'), (54049, 'CH255611', '7715', 822, None, 'GS1366 VPA 51 Coastal Kenya Borehole Rehabilitation Project', 32, datetime.date(2020, 3, 26), datetime.date(2025, 3, 25), '10000', 627, None, 761, None, None, 'Gold Standard', 'Kenya', 'Africa', 'Listed'), (54050, 'CH255612', '11607', 822, None, 'Top Third Ventures Stove Programme ', 21, datetime.date(2019, 9, 18), datetime.date(2024, 9, 17), '0', 627, None, 761, None, None, 'Gold Standard', 'Kenya', 'Africa', 'Certified Design'), (54051, 'CH255613', '382', 822, None, '*Sri Balaji 6 MW Non-Conventional Renewable Sources Biomass Power Project', 22, datetime.date(2011, 4, 15), datetime.date(2018, 4, 14), '28590', 613, None, 762, '78.799438', '14.542047', 'Gold Standard', 'India', 'Asia', 'Certified Project'), (54052, 'CH255615', '2510', 822, None, 'Xe Namnoy 2 - Xe Katam 1 Hydropower project', 22, datetime.date(2022, 1, 1), datetime.date(2028, 12, 31), '46438', 631, None, 762, None, None, 'Gold Standard', 'Lao', 'Asia', 'Certified Project')]" } REPHRASE_PROMPT = PromptTemplate( input_variables=["input", "sample_data"], template=""" You are an intelligent agent. Given a vague question about a carbon projects database, rephrase it to be more specific and relevant. The database includes information on registry name, estimated emission reduction (i.e. estimated credits), project verifier/validator, crediting start/end dates, country, region and state/province of projects, generated credits (i.e. issued credits), retired credits, buffer credits, vintage start/end dates, and credit issuance dates. Here are some sample rows from the database: {sample_data} Below are examples of vague questions and their rephrased versions: - Vague question: "How many projects are present?" | Rephrased: "How many carbon projects are present in the database?" - Vague question: "How many methodologies are in operation?" | Rephrased: "How many carbon crediting methodologies are in the database?" - Vague question: "What are the largest forestry projects in Brazil?" | Rephrased: "What are the largest forestry projects in Brazil in terms of generated credits?" - Vague question: "Which project validator is most popular?" | Rephrased: "Which project validator is most popular in terms of project count?" - Vague question: "Which vintage start year is the most successful?" | Rephrased: "Which vintage start year is the most successful in terms of generated credits?" - Vague question: "Search for the lowest crediting project" | Rephrased: "Search for the lowest crediting project in terms of sum of generated credits" - Vague question: "What is the minimum amount of estimated credits?" | Rephrased: "What is the minimum amount of estimated emission reduction in the database?" - Vague question: "How many projects are from Verra?" | Rephrased: "How many carbon projects are from the Verra registry?" - Vague question: "What is the average estimated emission reduction for Gold Standard projects?" | Rephrased: ""What is the average estimated emission reduction for carbon projects in the Gold Standard registry?" If the question is unrelated or cannot be rephrased, return "ANSWER_UNKNOWN". Given the question: {input} """ ) def rephrase_question(question, sample_data): sample_data_str = json.dumps(sample_data, indent=2) rephrase_prompt = REPHRASE_PROMPT.format(input=question, sample_data=sample_data_str) response = llm([HumanMessage(content=rephrase_prompt)]) return response.content.strip() sample_data = { 'crediting_data': "[(54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2023, 1, 26), datetime.date(2020, 1, 1), None, None, 402, None, datetime.date(2023, 6, 6), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2023, 1, 26), datetime.date(2019, 1, 1), None, None, 29881, None, datetime.date(2023, 6, 6), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2023, 1, 26), datetime.date(2019, 1, 1), None, None, 14717, None, datetime.date(2023, 6, 6), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2020, 1, 3), datetime.date(2019, 1, 1), None, None, 125, None, datetime.date(2023, 9, 13), None), (54048, 'CH255608', '4264', 'Silivri WPP', datetime.date(2020, 1, 3), datetime.date(2019, 1, 1), None, None, 1, None, datetime.date(2022, 1, 13), None)]", 'project_data': "[(54048, 'CH255608', '4264', 822, None, 'Silivri WPP', 22, datetime.date(2021, 8, 20), datetime.date(2028, 8, 19), '80442', 735, None, 763, '28.0', '41.0', 'Gold Standard', 'Turkey', 'Europe', 'Certified Project'), (54049, 'CH255611', '7715', 822, None, 'GS1366 VPA 51 Coastal Kenya Borehole Rehabilitation Project', 32, datetime.date(2020, 3, 26), datetime.date(2025, 3, 25), '10000', 627, None, 761, None, None, 'Gold Standard', 'Kenya', 'Africa', 'Listed'), (54050, 'CH255612', '11607', 822, None, 'Top Third Ventures Stove Programme ', 21, datetime.date(2019, 9, 18), datetime.date(2024, 9, 17), '0', 627, None, 761, None, None, 'Gold Standard', 'Kenya', 'Africa', 'Certified Design'), (54051, 'CH255613', '382', 822, None, '*Sri Balaji 6 MW Non-Conventional Renewable Sources Biomass Power Project', 22, datetime.date(2011, 4, 15), datetime.date(2018, 4, 14), '28590', 613, None, 762, '78.799438', '14.542047', 'Gold Standard', 'India', 'Asia', 'Certified Project'), (54052, 'CH255615', '2510', 822, None, 'Xe Namnoy 2 - Xe Katam 1 Hydropower project', 22, datetime.date(2022, 1, 1), datetime.date(2028, 12, 31), '46438', 631, None, 762, None, None, 'Gold Standard', 'Lao', 'Asia', 'Certified Project')]" } # Example usage of rephrasing function with sample data and static few-shot examples #question = "How many projects are present?" #question = "What is the longest carbon project?" #question = "What is the most successful country?" #question = "Which project is the least effective?" #question = "combien y a-t-il de projets?" question = "How many Gold standard projects are present?" rephrased_question = rephrase_question(question, sample_data) print(rephrased_question) # Should print the rephrased question or "ANSWER_UNKNOWN" #Defining a function to remove the term "Rephrased" from the result returned by the rephrase_question function import re def remove_rephrased(text): # Define the regex pattern to match "Rephrased:" at the beginning of the string pattern = r'^\s*Rephrased:\s*' # Use re.sub to replace the pattern with an empty string result = re.sub(pattern, '', text) # Strip leading and trailing spaces from the result result = result.strip() return result Answer_Unknown = "Apologies. I cannot provide an answer to this question with the information I currently have." def execute_query(question): relevance_response = check_relevance2(question) try: relevance_response_int = int(relevance_response) print(f"Relevancy level score is {relevance_response_int}") if relevance_response_int > 0: rephrased_question = rephrase_question(question, sample_data) if "ANSWER_UNKNOWN" in rephrased_question: return Answer_Unknown else: response = agent_executor.invoke(remove_rephrased(rephrased_question)) print(f"Original Question: {question}") print(f"Rephrased Question: {remove_rephrased(rephrased_question)}") return response except ValueError: if "ANSWER_UNKNOWN" in relevance_response: return Answer_Unknown else: return relevance_response # Output preprocessing ### INTERFACE WITH ALL THE OUTPUTS ### def carbon_space(prompt): result = execute_query(prompt) print("result:", result) #print("keys:", result.keys()) #print("values:", result.values()) #print(type(result.values())) #values_ = list(result.values()) ans = result['output'] if "```" in ans: ans_v2 = ans.split("```")[-1] elif "Answer" not in ans and "SQLResult" in ans: ans_v2 = ans.split("SQLResult")[-1] # ans_v2 = ans.split("SQLResult:")[-1] else: ans_v2 = ans print("ANSWER V2:", ans_v2) # remove unwanted punctuation ans_v2 = ans_v2.replace("*", "") print("ANSWER V2:", ans_v2) # Define the regex pattern to remove text between SQLResult and Final Answer pattern_between = r'SQLResult:.*?Final Answer:' ans_v2 = re.sub(pattern_between, '', ans_v2, flags=re.DOTALL) # remove words # Define the regex pattern pattern_words = r'\b(Answer:|Answer|Final\s*Answer:?|SQL\s*Result:?|Final Answer: )\b' # Use re.sub to remove the matched patterns cleaned_text = re.sub(pattern_words, '', ans_v2) print("cleaned_text:", cleaned_text) # Print the cleaned text print(cleaned_text) # remove leading whitespaces and some ans_v2 = cleaned_text.replace(":", "") ans_v2 = ans_v2.replace("#", "") ans_v2 = ans_v2.lstrip() # remove "Final Answer" ans_v2 = ans_v2.replace("Final Answer:", "") return ans_v2 # PROMPTS prompt = "What is the title of project with id 4082?" prompt = "What are the energy projects in Brazil?" prompt = "Which project have been validated by 'NSF International Strategic Registrations, Ltd. (NSF-ISR)'?" prompt = "How many projects are there in Brazil?" prompt = "Which projects with vintage start year 2010 have more than 10,000 retired credits in total?" prompt = "What are the ids of projects with vintage start year 2010?" # prompt = "Which projects have estimated emission reductions less than 10,000?" # prompt = "What is the largest amount of buffer credits?" # prompt = "What are the forestry projects in Brazil?" # had a very different output ### test if needed without the interface ### k = carbon_space("How many projects are in the Verra registry?") print(k) ### Refined Gradio Output ### title = "CarbonQuery" long_desc = """**Guidelines for ‘CarbonQuery’: The AI-powered Search Tool for CarbonSpace:** 1. This AI-powered search tool, ‘CarbonQuery’, is designed to provide a platform for users to ask questions in English about projects listed on CarbonSpace. It is aimed at facilitating convenient access to information on carbon projects and directing users towards further exploration in CarbonSpace. 2. Users may ask questions relevant to CarbonSpace that are similar in nature and complexity to the sample questions provided. 3. ‘CarbonQuery’ can respond to questions limited to the following attributes: - Project ID - Project Title - Registry - Project Verifier/Validator - Locations of Projects (Region, Country, State/Province) - Crediting Start/End Dates - Estimated Annual Emission Reduction - Generated Credits - Retired Credits - Buffer Credits - Vintage Start/End Dates - Date of Issuance 4. Please note that ‘CarbonQuery’ might occasionally provide inaccurate information. Users are advised to verify important information independently. 5. ‘CarbonQuery’ is currently in an experimental phase. Adherence to these guidelines will assist us in improving the tool to encompass a broader range of questions and to fulfil your requirements better. 6. The information provided by ‘CarbonQuery’ is for general informational purposes only and should not be considered as professional or legal advice. CarbonSpace and its affiliates are not responsible for any errors or omissions, or for the results obtained from the use of this information. 7. Users are expected to use ‘CarbonQuery’ responsibly and refrain from asking inappropriate or irrelevant questions. Misuse of the tool may result in restricted access. By using ‘CarbonQuery’, you agree to comply with these guidelines and understand the experimental nature of the tool.""" desc = "Guidelines for ‘CarbonQuery’: The AI-powered Search Tool for CarbonSpace:" demo = gr.Interface(fn=carbon_space, inputs="text", outputs="text", theme=gr.themes.Glass(), examples=["How many projects have been verified and validated by 'KBS Certification Services Limited'?", "How many projects are from China?", "What is the project with the maximum amount of buffer credits and its buffer credit amount?", "What is the smallest amount of estimated emission reduction in projects which started crediting before 2020?", "What is the current value of average estimated emission reduction?", "What is the sum of generated credits of all projects?", "Which state/province is leading in terms of project count?", "What is the largest project in terms of estimated emission reductions?", "What credit start year issued the largest number of estimated emission reductions?"], title=title, article=long_desc) # description=desc, demo.launch() if __name__ == "__main__": title = "CarbonQuery" long_desc = """**Guidelines for ‘CarbonQuery’: The AI-powered Search Tool for CarbonSpace:** 1. This AI-powered search tool, ‘CarbonQuery’, is designed to provide a platform for users to ask questions in English about projects listed on CarbonSpace. It is aimed at facilitating convenient access to information on carbon projects and directing users towards further exploration in CarbonSpace. 2. Users may ask questions relevant to CarbonSpace that are similar in nature and complexity to the sample questions provided. 3. ‘CarbonQuery’ can respond to questions limited to the following attributes: - Project ID - Project Title - Registry - Project Verifier/Validator - Locations of Projects (Region, Country, State/Province) - Crediting Start/End Dates - Estimated Annual Emission Reduction - Generated Credits - Retired Credits - Buffer Credits - Vintage Start/End Dates - Date of Issuance 4. Please note that ‘CarbonQuery’ might occasionally provide inaccurate information. Users are advised to verify important information independently. 5. ‘CarbonQuery’ is currently in an experimental phase. Adherence to these guidelines will assist us in improving the tool to encompass a broader range of questions and to fulfil your requirements better. 6. The information provided by ‘CarbonQuery’ is for general informational purposes only and should not be considered as professional or legal advice. CarbonSpace and its affiliates are not responsible for any errors or omissions, or for the results obtained from the use of this information. 7. Users are expected to use ‘CarbonQuery’ responsibly and refrain from asking inappropriate or irrelevant questions. Misuse of the tool may result in restricted access. By using ‘CarbonQuery’, you agree to comply with these guidelines and understand the experimental nature of the tool.""" desc = "Guidelines for ‘CarbonQuery’: The AI-powered Search Tool for CarbonSpace:" demo = gr.Interface(fn=carbon_space, inputs="text", outputs="text", theme=gr.themes.Glass(), examples=[ "How many projects have been verified and validated by 'KBS Certification Services Limited'?", "How many projects are from China?", "What is the project with the maximum amount of buffer credits and its buffer credit amount?", "What is the smallest amount of estimated emission reduction in projects which started crediting before 2020?", "What is the current value of average estimated emission reduction?", "What is the sum of generated credits of all projects?", "Which state/province is leading in terms of project count?", "What is the largest project in terms of estimated emission reductions?", "What credit start year issued the largest number of estimated emission reductions?"], title=title, article=long_desc) # description=desc, demo.launch()