# eval-llm / app.py — Hugging Face Space entry point (commit 8aade0d, sudeepgeorge)
import pinecone
import os, sys
import gradio as gr
import logging
import openai
import matplotlib.pyplot as plt
from dotenv import load_dotenv
#from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
from llama_index import GPTVectorStoreIndex, GPTListIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper, EmptyIndex
from llama_index.callbacks import CallbackManager, LlamaDebugHandler, CBEventType
from llama_index import VectorStoreIndex, load_index_from_storage, StorageContext
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.logger import LlamaLogger
from llama_index.vector_stores import PineconeVectorStore
from llama_index.llms import Anthropic
from llama_index.llms import OpenAI
from nomic import atlas
import nomic
import numpy as np
from llama_index.llms import Replicate
# # Query function
# def query_index(query):
# # Profile index operations
# cProfile.run('query_tokens = llama_model.tokenize_docs([query])')
# cProfile.run('results = index.query(query_tokens[0].embeddings, top_k=5)')
# # Log results
# logging.debug(f'Query tokens: {query_tokens}')
# logging.debug(f'Top 5 results: {results.ids[:5]}')
# # Visualize projections
# #projected = umap.UMAP().fit_transform(doc_embeddings)
# #plt.scatter(projected[:,0], projected[:,1])
# #plt.savefig('projection.png')
# return results
# Example queries pre-populated in the Gradio UI.
query_examples=[
    ("What is iMerit's biggest strength?"),
    ("How did CrowdReason benefit by working with iMerit?"),
    ("What do you mean by human-in-the-loop workflows?")
]
# Model identifiers for the three engines being compared side by side.
model_name1="gpt-4-0613"#"gpt-3.5-turbo-0613" #gpt-4-0613
model_name2= "claude-2"#"claude-instant-1" "claude-instant-1.2"
model_name3="llama2"
# Name of the Pinecone index used by the (currently commented-out) indexing code.
pinecone_vector_index='doc-index'
# Module-level state, populated lazily on the first query (see chatbot()).
# 0 acts as the "not yet initialised" sentinel throughout.
env_set=0
index1=0
index2=0
llm_predictor1=0
llm_predictor2=0
prompt_helper=0
prompt_helper_anthropic=0
llama_debug=0
DEBUG_VIS=0  # set to 1 to enable the (commented-out) Pinecone visualisation path
# Replicate model slug (owner/name:version) for llama-2 13B chat.
LLAMA_13B_V2_CHAT = "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
#LLAMA_13B_V2_CHAT = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
# Logging setup.
# Fix: basicConfig(stream=sys.stdout) already installs a stdout StreamHandler on
# the root logger; the original additionally called
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)),
# so every record was printed twice. The duplicate handler is removed.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
def construct_index(directory_path):
    """Initialise the module-level prompt helpers and LLM predictors.

    The original document-indexing logic (SimpleDirectoryReader + Pinecone /
    VectorStoreIndex persistence) is retained below as commented-out code, so
    this function currently only prepares the globals and returns None.

    Args:
        directory_path: path to the directory of documents to index
            (unused while the indexing code is commented out).
    """
    global llm_predictor1, llm_predictor2, prompt_helper, prompt_helper_anthropic
    # OpenAI context-window / output sizing.
    openai_max_input_size = 4096
    openai_num_outputs = 512
    openai_max_chunk_overlap = 0.9
    openai_chunk_size_limit = 600
    # Anthropic (claude-2) supports a much larger context window.
    anthropic_max_input_size = 100000
    anthropic_num_output = 2048
    anthropic_max_chunk_overlap = 0.9
    load_dotenv()  # pull API keys from .env into the environment
    prompt_helper = PromptHelper(openai_max_input_size, openai_num_outputs, openai_max_chunk_overlap, chunk_size_limit=openai_chunk_size_limit)
    prompt_helper_anthropic = PromptHelper(anthropic_max_input_size, anthropic_num_output, anthropic_max_chunk_overlap)
    # Fix: the original passed the model under both `model=` and `model_name=`
    # keywords to OpenAI(); `model` is the supported keyword, so the duplicate
    # `model_name` argument is dropped. Two unused StorageContext locals (left
    # over from the commented-out indexing code below) are also removed.
    llm_predictor1 = LLMPredictor(llm=OpenAI(model=model_name1, temperature=0.5, max_tokens=openai_num_outputs)) #gpt-3.5-turbo
    llm_predictor2 = LLMPredictor(llm = Anthropic(model=model_name2))
    # service_context2 = ServiceContext.from_defaults(llm_predictor=llm_predictor2, prompt_helper=prompt_helper_anthropic)
    # # Pinecone setup
    # pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    # if ((pinecone.list_indexes()) != [pinecone_vector_index]):
    #     pinecone.create_index(pinecone_vector_index, dimension=1536, metric="euclidean", pod_type="p1")
    # pinecone_index = pinecone.Index(pinecone_vector_index)
    # documents = SimpleDirectoryReader(directory_path).load_data()
    # index1 = VectorStoreIndex(documents, llm_predictor=llm_predictor1, prompt_helper=prompt_helper)
    # index1.set_index_id = "index1"
    # index1.storage_context.persist(persist_dir="./storage1")
    # vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
    # storage_context_index2 = StorageContext.from_defaults(vector_store=vector_store)
    # index2 = VectorStoreIndex.from_documents(documents, llm_predictor=llm_predictor2, storage_context=storage_context_index2)
    # index2.storage_context.persist(persist_dir="./storage2")
    # return index1
# def get_ids_from_query(index,input_vector):
# print("searching pinecone...")
# results = index.query(vector=input_vector, top_k=10000,include_values=False)
# ids = set()
# for result in results['matches']:
# ids.add(result['id'])
# return ids
# def get_all_ids_from_index(index, num_dimensions, namespace=""):
# num_vectors = index.describe_index_stats()["namespaces"][namespace]['vector_count']
# all_ids = set()
# while len(all_ids) < num_vectors:
# print("Length of ids list is shorter than the number of total vectors...")
# input_vector = np.random.rand(num_dimensions).tolist()
# print("creating random vector...")
# ids = get_ids_from_query(index,input_vector)
# print("getting ids from a vector query...")
# all_ids.update(ids)
# print("updating ids set...")
# print(f"Collected {len(all_ids)} ids out of {num_vectors}.")
# return all_ids
# def visualize(index, num_embeddings):
# load_dotenv()
# nomic.login(os.environ.get('NOMIC_API_KEY'))
# print("Visualizing embedding count", num_embeddings)
# print(index.describe_index_stats())
# all_ids = get_all_ids_from_index(index, num_dimensions=1536, namespace="")
# id_list=list(all_ids)
# vectors = index.fetch(id_list)
# ids = []
# embeddings = []
# for id, vector in vectors['vectors'].items():
# ids.append(id)
# embeddings.append(vector['values'])
# embeddings = np.array(embeddings)
# atlas.map_embeddings(embeddings=embeddings, data=[{'id': id} for id in ids], id_field='id')
# Inject a custom system prompt into the llama-2 chat prompt format.
def custom_completion_to_prompt(completion: str) -> str:
    """Wrap *completion* in llama-2 chat formatting with a fixed Q&A system prompt."""
    return completion_to_prompt(
        completion,
        system_prompt=(
            # Fix: the original read "...as accurately as possible is the
            # instructions and context provided", a typo that garbled the
            # instruction sent to the model.
            "You are a Q&A assistant. Your goal is to answer questions as "
            "accurately as possible based on the instructions and context provided."
        ),
    )
def chatbot_init():
    """Build the two empty query indices (OpenAI and Anthropic).

    Also installs a LlamaDebugHandler into the module-level ``llama_debug``
    global so chatbot() can flush its event logs after every query.

    Returns:
        tuple: (OpenAI-backed EmptyIndex, Anthropic-backed EmptyIndex).
    """
    global llm_predictor1, llm_predictor2, llama_debug, prompt_helper_anthropic, prompt_helper
    load_dotenv()  # API keys come from the environment
    # Shared debug/callback plumbing for both service contexts.
    llama_debug = LlamaDebugHandler(print_trace_on_end=True)
    cb_manager = CallbackManager([llama_debug])
    # Separate loggers so each provider's trace stays distinct.
    anthropic_logger = LlamaLogger()
    openai_logger = LlamaLogger()
    anthropic_ctx = ServiceContext.from_defaults(
        llm=Anthropic(model=model_name2),
        callback_manager=cb_manager,
        prompt_helper=prompt_helper_anthropic,
        llama_logger=anthropic_logger,
    )
    openai_ctx = ServiceContext.from_defaults(
        llm=OpenAI(model=model_name1, temperature=0.5),
        callback_manager=cb_manager,
        prompt_helper=prompt_helper,
        llama_logger=openai_logger,
    )
    # NOTE(review): loading the persisted / Pinecone-backed document indices is
    # currently disabled, so both engines answer from empty indices (pure LLM
    # responses with no retrieved context).
    openai_index = EmptyIndex(service_context=openai_ctx)
    anthropic_index = EmptyIndex(service_context=anthropic_ctx)
    return openai_index, anthropic_index
def chatbot(input_text):
    """Send *input_text* to GPT-4, Claude-2 and Llama-2 and return all three answers.

    Args:
        input_text: the user's query string from the Gradio textbox.

    Returns:
        tuple[str, str, str]: (OpenAI answer, Anthropic answer, Replicate/llama-2 answer).
    """
    global env_set, index1, index2, llama_debug
    # Lazily build the OpenAI/Anthropic indices on the first call only.
    if env_set == 0:
        index1, index2 = chatbot_init()
        env_set = 1
    answer_openai = index1.as_query_engine().query(input_text)
    answer_anthropic = index2.as_query_engine().query(input_text)
    # Llama-2 runs through Replicate with its own service context and logger.
    replicate_logger = LlamaLogger()
    replicate_llm = Replicate(
        model=LLAMA_13B_V2_CHAT,
        temperature=0.1,
        # override max tokens since it's interpreted
        # as context window instead of max tokens
        context_window=4096,
        # override completion representation for llama 2
        completion_to_prompt=custom_completion_to_prompt,
        # if using llama 2 for data agents, also override the message representation
        messages_to_prompt=messages_to_prompt,
    )
    replicate_ctx = ServiceContext.from_defaults(llm=replicate_llm, llama_logger=replicate_logger)
    answer_llama = EmptyIndex(service_context=replicate_ctx).as_query_engine().query(input_text)
    # Dump the Replicate trace, then reset the shared debug handler for the next call.
    print(replicate_ctx.llama_logger.get_logs())
    llama_debug.flush_event_logs()
    return answer_openai.response, answer_anthropic.response, answer_llama.response
# Gradio UI: one query textbox in, three model answers out (one per engine).
iface = gr.Interface(fn=chatbot,
                     inputs=gr.components.Textbox(lines=7, label="Enter your query"),
                     outputs=[gr.components.Textbox(label=model_name1+ " Output"), gr.components.Textbox(label=model_name2+" Output"), gr.components.Textbox(label=model_name3+" Output")],
                     examples=query_examples,
                     title="iMerit LLM Comparison Engine",
                     # Fix: user-facing typo "reponses" -> "responses".
                     description="Evaluating various LLM's responses to the same query",
                     )
# Prepares the module-level predictors/prompt helpers; the actual document
# indexing inside construct_index() is commented out, so this returns None.
index = construct_index("docs")
iface.launch(share=False)