# eval-llm / app.py — Hugging Face Space entry point (commit 8aade0d, sudeepgeorge)
import pinecone
import os, sys
import gradio as gr
import logging
import openai
import matplotlib.pyplot as plt
from dotenv import load_dotenv
#from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
from llama_index import GPTVectorStoreIndex, GPTListIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper, EmptyIndex
from llama_index.callbacks import CallbackManager, LlamaDebugHandler, CBEventType
from llama_index import VectorStoreIndex, load_index_from_storage, StorageContext
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.logger import LlamaLogger
from llama_index.vector_stores import PineconeVectorStore
from llama_index.llms import Anthropic
from llama_index.llms import OpenAI
from nomic import atlas
import nomic
import numpy as np
from llama_index.llms import Replicate
# # Query function
# def query_index(query):
# # Profile index operations
# cProfile.run('query_tokens = llama_model.tokenize_docs([query])')
# cProfile.run('results = index.query(query_tokens[0].embeddings, top_k=5)')
# # Log results
# logging.debug(f'Query tokens: {query_tokens}')
# logging.debug(f'Top 5 results: {results.ids[:5]}')
# # Visualize projections
# #projected = umap.UMAP().fit_transform(doc_embeddings)
# #plt.scatter(projected[:,0], projected[:,1])
# #plt.savefig('projection.png')
# return results
# Example queries pre-populated in the Gradio UI.
query_examples=[
    ("What is iMerit's biggest strength?"),
    ("How did CrowdReason benefit by working with iMerit?"),
    ("What do you mean by human-in-the-loop workflows?")
]
# Model identifiers for the three engines being compared side by side.
model_name1="gpt-4-0613"#"gpt-3.5-turbo-0613" #gpt-4-0613
model_name2= "claude-2"#"claude-instant-1" "claude-instant-1.2"
model_name3="llama2"
# Name of the Pinecone index used by the (currently commented-out) indexing code.
pinecone_vector_index='doc-index'
# Module-level state, populated lazily on the first query (see chatbot()).
# 0 acts as the "not yet initialised" sentinel throughout.
env_set=0
index1=0
index2=0
llm_predictor1=0
llm_predictor2=0
prompt_helper=0
prompt_helper_anthropic=0
llama_debug=0
DEBUG_VIS=0  # set to 1 to enable the (commented-out) Pinecone visualisation path
# Replicate model slug (owner/name:version) for llama-2 13B chat.
LLAMA_13B_V2_CHAT = "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
#LLAMA_13B_V2_CHAT = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
# Logging setup.
# Fix: basicConfig(stream=sys.stdout) already installs a stdout StreamHandler on
# the root logger; the original additionally called
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)),
# so every record was printed twice. The duplicate handler is removed.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
def construct_index(directory_path):
    """Initialise the module-level prompt helpers and LLM predictors.

    The original document-indexing logic (SimpleDirectoryReader + Pinecone /
    VectorStoreIndex persistence) is retained below as commented-out code, so
    this function currently only prepares the globals and returns None.

    Args:
        directory_path: path to the directory of documents to index
            (unused while the indexing code is commented out).
    """
    global llm_predictor1, llm_predictor2, prompt_helper, prompt_helper_anthropic
    # OpenAI context-window / output sizing.
    openai_max_input_size = 4096
    openai_num_outputs = 512
    openai_max_chunk_overlap = 0.9
    openai_chunk_size_limit = 600
    # Anthropic (claude-2) supports a much larger context window.
    anthropic_max_input_size = 100000
    anthropic_num_output = 2048
    anthropic_max_chunk_overlap = 0.9
    load_dotenv()  # pull API keys from .env into the environment
    prompt_helper = PromptHelper(openai_max_input_size, openai_num_outputs, openai_max_chunk_overlap, chunk_size_limit=openai_chunk_size_limit)
    prompt_helper_anthropic = PromptHelper(anthropic_max_input_size, anthropic_num_output, anthropic_max_chunk_overlap)
    # Fix: the original passed the model under both `model=` and `model_name=`
    # keywords to OpenAI(); `model` is the supported keyword, so the duplicate
    # `model_name` argument is dropped. Two unused StorageContext locals (left
    # over from the commented-out indexing code below) are also removed.
    llm_predictor1 = LLMPredictor(llm=OpenAI(model=model_name1, temperature=0.5, max_tokens=openai_num_outputs)) #gpt-3.5-turbo
    llm_predictor2 = LLMPredictor(llm = Anthropic(model=model_name2))
    # service_context2 = ServiceContext.from_defaults(llm_predictor=llm_predictor2, prompt_helper=prompt_helper_anthropic)
    # # Pinecone setup
    # pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    # if ((pinecone.list_indexes()) != [pinecone_vector_index]):
    #     pinecone.create_index(pinecone_vector_index, dimension=1536, metric="euclidean", pod_type="p1")
    # pinecone_index = pinecone.Index(pinecone_vector_index)
    # documents = SimpleDirectoryReader(directory_path).load_data()
    # index1 = VectorStoreIndex(documents, llm_predictor=llm_predictor1, prompt_helper=prompt_helper)
    # index1.set_index_id = "index1"
    # index1.storage_context.persist(persist_dir="./storage1")
    # vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
    # storage_context_index2 = StorageContext.from_defaults(vector_store=vector_store)
    # index2 = VectorStoreIndex.from_documents(documents, llm_predictor=llm_predictor2, storage_context=storage_context_index2)
    # index2.storage_context.persist(persist_dir="./storage2")
    # return index1
# def get_ids_from_query(index,input_vector):
# print("searching pinecone...")
# results = index.query(vector=input_vector, top_k=10000,include_values=False)
# ids = set()
# for result in results['matches']:
# ids.add(result['id'])
# return ids
# def get_all_ids_from_index(index, num_dimensions, namespace=""):
# num_vectors = index.describe_index_stats()["namespaces"][namespace]['vector_count']
# all_ids = set()
# while len(all_ids) < num_vectors:
# print("Length of ids list is shorter than the number of total vectors...")
# input_vector = np.random.rand(num_dimensions).tolist()
# print("creating random vector...")
# ids = get_ids_from_query(index,input_vector)
# print("getting ids from a vector query...")
# all_ids.update(ids)
# print("updating ids set...")
# print(f"Collected {len(all_ids)} ids out of {num_vectors}.")
# return all_ids
# def visualize(index, num_embeddings):
# load_dotenv()
# nomic.login(os.environ.get('NOMIC_API_KEY'))
# print("Visualizing embedding count", num_embeddings)
# print(index.describe_index_stats())
# all_ids = get_all_ids_from_index(index, num_dimensions=1536, namespace="")
# id_list=list(all_ids)
# vectors = index.fetch(id_list)
# ids = []
# embeddings = []
# for id, vector in vectors['vectors'].items():
# ids.append(id)
# embeddings.append(vector['values'])
# embeddings = np.array(embeddings)
# atlas.map_embeddings(embeddings=embeddings, data=[{'id': id} for id in ids], id_field='id')
# Inject a custom system prompt into the llama-2 chat prompt format.
def custom_completion_to_prompt(completion: str) -> str:
    """Wrap *completion* in llama-2 chat formatting with a fixed Q&A system prompt."""
    return completion_to_prompt(
        completion,
        system_prompt=(
            # Fix: the original read "...as accurately as possible is the
            # instructions and context provided", a typo that garbled the
            # instruction sent to the model.
            "You are a Q&A assistant. Your goal is to answer questions as "
            "accurately as possible based on the instructions and context provided."
        ),
    )
def chatbot_init():
    """Build the two empty query indices (OpenAI and Anthropic).

    Also installs a LlamaDebugHandler into the module-level ``llama_debug``
    global so chatbot() can flush its event logs after every query.

    Returns:
        tuple: (OpenAI-backed EmptyIndex, Anthropic-backed EmptyIndex).
    """
    global llm_predictor1, llm_predictor2, llama_debug, prompt_helper_anthropic, prompt_helper
    load_dotenv()  # API keys come from the environment
    # Shared debug/callback plumbing for both service contexts.
    llama_debug = LlamaDebugHandler(print_trace_on_end=True)
    cb_manager = CallbackManager([llama_debug])
    # Separate loggers so each provider's trace stays distinct.
    anthropic_logger = LlamaLogger()
    openai_logger = LlamaLogger()
    anthropic_ctx = ServiceContext.from_defaults(
        llm=Anthropic(model=model_name2),
        callback_manager=cb_manager,
        prompt_helper=prompt_helper_anthropic,
        llama_logger=anthropic_logger,
    )
    openai_ctx = ServiceContext.from_defaults(
        llm=OpenAI(model=model_name1, temperature=0.5),
        callback_manager=cb_manager,
        prompt_helper=prompt_helper,
        llama_logger=openai_logger,
    )
    # NOTE(review): loading the persisted / Pinecone-backed document indices is
    # currently disabled, so both engines answer from empty indices (pure LLM
    # responses with no retrieved context).
    openai_index = EmptyIndex(service_context=openai_ctx)
    anthropic_index = EmptyIndex(service_context=anthropic_ctx)
    return openai_index, anthropic_index
def chatbot(input_text):
    """Send *input_text* to GPT-4, Claude-2 and Llama-2 and return all three answers.

    Args:
        input_text: the user's query string from the Gradio textbox.

    Returns:
        tuple[str, str, str]: (OpenAI answer, Anthropic answer, Replicate/llama-2 answer).
    """
    global env_set, index1, index2, llama_debug
    # Lazily build the OpenAI/Anthropic indices on the first call only.
    if env_set == 0:
        index1, index2 = chatbot_init()
        env_set = 1
    answer_openai = index1.as_query_engine().query(input_text)
    answer_anthropic = index2.as_query_engine().query(input_text)
    # Llama-2 runs through Replicate with its own service context and logger.
    replicate_logger = LlamaLogger()
    replicate_llm = Replicate(
        model=LLAMA_13B_V2_CHAT,
        temperature=0.1,
        # override max tokens since it's interpreted
        # as context window instead of max tokens
        context_window=4096,
        # override completion representation for llama 2
        completion_to_prompt=custom_completion_to_prompt,
        # if using llama 2 for data agents, also override the message representation
        messages_to_prompt=messages_to_prompt,
    )
    replicate_ctx = ServiceContext.from_defaults(llm=replicate_llm, llama_logger=replicate_logger)
    answer_llama = EmptyIndex(service_context=replicate_ctx).as_query_engine().query(input_text)
    # Dump the Replicate trace, then reset the shared debug handler for the next call.
    print(replicate_ctx.llama_logger.get_logs())
    llama_debug.flush_event_logs()
    return answer_openai.response, answer_anthropic.response, answer_llama.response
# Gradio UI: one query textbox in, three model answers out (one per engine).
iface = gr.Interface(fn=chatbot,
                     inputs=gr.components.Textbox(lines=7, label="Enter your query"),
                     outputs=[gr.components.Textbox(label=model_name1+ " Output"), gr.components.Textbox(label=model_name2+" Output"), gr.components.Textbox(label=model_name3+" Output")],
                     examples=query_examples,
                     title="iMerit LLM Comparison Engine",
                     # Fix: user-facing typo "reponses" -> "responses".
                     description="Evaluating various LLM's responses to the same query",
                     )
# Prepares the module-level predictors/prompt helpers; the actual document
# indexing inside construct_index() is commented out, so this returns None.
index = construct_index("docs")
iface.launch(share=False)