"""Gradio chat UI for a Kellogg course-assistant RAG agent.

Pipeline: Pinecone vector index -> LangChain retriever tool ->
OpenAI-functions agent -> streaming Gradio ChatInterface.
"""

import os
from queue import Empty, Queue
from threading import Thread
from typing import Any

import gradio as gr
import openai
import pinecone
from langchain.agents import AgentExecutor
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import (
    AgentTokenBufferMemory,
)
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain.vectorstores import Pinecone

print("CHECK - Pinecone vector db setup")

# OpenAI credentials and the embedding model used to query the index.
openai.api_key = os.environ.get("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings()

# Connect to the existing Pinecone index holding the Kellogg site content.
index_name = "kellogg-course-assistant"
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)

# Load the existing index and expose it as a LangChain retriever.
vectorsearch = Pinecone.from_existing_index(index_name, embeddings)
retriever = vectorsearch.as_retriever()

print("CHECK - setting up conversational retrieval agent")


class QueueCallback(BaseCallbackHandler):
    """Callback handler for streaming LLM responses to a queue."""

    def __init__(self, q: Queue) -> None:
        self.q = q

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        # Hand each streamed token to the consumer via the queue.
        self.q.put(token)

    def on_llm_end(self, *args: Any, **kwargs: Any) -> None:
        # NOTE(review): returns a bool despite the None annotation; the
        # return value is ignored by LangChain, so this is harmless.
        return self.q.empty()


# Retrieval tool the agent can call to search the indexed Kellogg site.
tool = create_retriever_tool(
    retriever,
    "search_kellogg_site",
    "Searches and returns content from within the Kellogg website.",
)
tools = [tool]

system_message = SystemMessage(
    content=(
        "You are a helpful educational expert providing advice to students of the Northwestern business school Kellogg. "
        "Use both your knowledge and the Kellogg site search tool to generate helpful answers for questions about courses and create a list of suggested web course articles for more information. "
        "Do not include details of your intermediate steps in the final response. "
        "At the end of your response, provide links to relevant web course articles returned by the retriever."
    )
)

print("CHECK - setting up gradio chatbot UI")

# Model selector shown in the chat UI. type="index" means the callback
# receives the selected option's position: 0 = gpt-4, 1 = gpt-3.5-turbo.
model_type = gr.Dropdown(
    choices=["gpt-4 + rag", "gpt-3.5-turbo + rag"],
    value="gpt-4 + rag",
    type="index",
    label="LLM Models",
)


def predict(message, model_type):
    """Run the retrieval agent on ``message``, streaming tokens as they arrive.

    Args:
        message: The user's question.
        model_type: Dropdown index; 1 selects gpt-3.5-turbo-16k, anything
            else selects gpt-4-turbo-preview.

    Yields:
        (next_token, content) pairs, where ``content`` is the accumulated
        response text so far.
    """
    # Queue carries streamed tokens from the LLM callback to this generator;
    # job_done is a sentinel marking the end of the stream.
    q = Queue()
    job_done = object()

    # Conversational retrieval agent construction: LLM, memory, prompt,
    # agent, and executor are built fresh for every request.
    if model_type == 1:
        llm = ChatOpenAI(
            temperature=0.1,
            model_name="gpt-3.5-turbo-16k",
            streaming=True,
            callbacks=[QueueCallback(q)],
        )
    else:
        llm = ChatOpenAI(
            temperature=0.1,
            model_name="gpt-4-turbo-preview",
            streaming=True,
            callbacks=[QueueCallback(q)],
        )

    # The same key is needed for both the memory and the prompt placeholder.
    memory_key = "history"
    memory = AgentTokenBufferMemory(memory_key=memory_key, llm=llm)
    prompt = OpenAIFunctionsAgent.create_prompt(
        system_message=system_message,
        extra_prompt_messages=[MessagesPlaceholder(variable_name=memory_key)],
    )
    agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)
    agent_executor = AgentExecutor(
        agent=agent,
        tools=tools,
        memory=memory,
        verbose=False,
        return_intermediate_steps=True,
    )

    def task():
        # Run the agent in a worker thread. The finally guarantees the
        # sentinel is enqueued even if the agent raises, so the consumer
        # loop below can never spin forever (the original leaked the loop
        # on agent errors).
        try:
            agent_executor({"input": message})
        finally:
            q.put(job_done)

    Thread(target=task).start()

    content = ""
    # Drain tokens from the queue and yield them to the caller.
    while True:
        try:
            next_token = q.get(True, timeout=1)
        except Empty:
            # No token produced within the timeout; keep polling. Catching
            # only queue.Empty (not a bare except) keeps Ctrl-C working.
            continue
        if next_token is job_done:
            break
        content += next_token
        yield next_token, content


def ask_llm(message, history, model_type):
    """Gradio ChatInterface callback: stream the accumulated answer text."""
    for next_token, content in predict(message, model_type):
        yield content


# set up and run chat interface
kellogg_agent = gr.ChatInterface(
    fn=ask_llm,
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
    title="Kellogg Course AI Assistant",
    description="Please provide your questions about courses offered by Kellogg.",
    additional_inputs=[model_type],
    additional_inputs_accordion_name="AI Assistant Options:",
    examples=[
        ["Can you tell me about a marketing major? What would I want from my career if I went that way instead of say strategy?"],
        ["I'm interested in strategy. Can you give me a recommendation of courses I should consider over the next year?"],
        ["I'm wanting to know more about advertising. Can you recommend some courses on that subject?"],
        ["How many credits do I need to graduate?"],
        ["I loved the Competitive Strategy and industrial structure class. Can you tell me others like that one?"],
    ],
    # cache_examples=True,
    # retry_btn=None,
    # undo_btn="Delete Previous",
    clear_btn="Clear",
)


def main():
    # Enable queuing so generator responses stream to the browser.
    kellogg_agent.queue().launch()


# start UI
if __name__ == "__main__":
    main()