# import os # import chainlit as cl # import openai import torch # from chainlit.input_widget import Select, Slider # from llama_index import ( # ServiceContext, # StorageContext, # TrafilaturaWebReader, # VectorStoreIndex, # load_index_from_storage, # ) # from llama_index.callbacks.base import CallbackManager # from llama_index.embeddings import HuggingFaceEmbedding # from llama_index.llms import HuggingFaceLLM, LiteLLM, MessageRole, OpenAI # from llama_index.prompts import PromptTemplate # from transformers import BitsAndBytesConfig print(f"Is CUDA available: {torch.cuda.is_available()}") print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") # def get_api_key(): # api_key = os.getenv("OPENAI_API_KEY") # if api_key is None: # print("OPENAI_API_KEY missing from environment variables") # api_key = input("Please enter your OPENAI_API_KEY: ") # return api_key # openai.api_key = get_api_key() # def load_index(): # try: # storage_context = StorageContext.from_defaults(persist_dir="./storage") # index = load_index_from_storage(storage_context) # except FileNotFoundError: # print("Storage file not found. Loading from web.") # documents = TrafilaturaWebReader().load_data(["https://bit.ly/45BncJA"]) # index = VectorStoreIndex.from_documents(documents) # index.storage_context.persist() # return index # index = load_index() # welcome_msg = ( # "Hi there! I’m your China Life chatbot, specialising in answering " # "[frequently asked questions](https://bit.ly/45BncJA). " # "How may I assist you today? " # "Feel free to ask questions like, " # "“Is there any action required after receiving the policy?” or " # "“Can I settle using a demand draft?”" # ) # @cl.on_chat_start # async def start(): # chat_profile = cl.user_session.get("chat_profile") # msg = cl.Message(content="") # for token in list(welcome_msg): # await cl.sleep(0.01) # await msg.stream_token(token) # await msg.send() # settings = await cl.ChatSettings( # [ # Select( # id="Model", # label="Model", # values=[ # "gpt-3.5-turbo", # "gpt-4", # "zephyr", # "litellm-gpt-3.5-turbo", # "litellm-opt-125m", # ], # initial_index=1, # ), # Slider( # id="Temperature", # label="Temperature", # initial=0.0, # min=0.0, # max=2.0, # step=0.1, # ), # ] # ).send() # await setup_query_engine(settings) # @cl.on_settings_update # async def setup_query_engine(settings): # print("on_settings_update", settings) # # def messages_to_prompt(messages): # # prompt = "" # # for message in messages: # # if message.role == "system": # # prompt += f"<|system|>\n{message.content}\n" # # elif message.role == "user": # # prompt += f"<|user|>\n{message.content}\n" # # elif message.role == "assistant": # # prompt += f"<|assistant|>\n{message.content}\n" # # if not prompt.startswith("<|system|>\n"): # # prompt = "<|system|>\n\n" + prompt # # prompt = prompt + "<|assistant|>\n" # # return prompt # if settings["Model"] == "zephyr": # # model_name = "HuggingFaceH4/zephyr-7b-beta" # # query_wrapper_prompt = PromptTemplate( # # "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" # # ) # # quantization_config = BitsAndBytesConfig( # # load_in_4bit=True, # # bnb_4bit_compute_dtype=torch.bfloat16, # # bnb_4bit_quant_type="nf4", # # bnb_4bit_use_double_quant=True, # # ) # # llm = HuggingFaceLLM( # # model_name=model_name, # # tokenizer_name=model_name, # # query_wrapper_prompt=query_wrapper_prompt, # # context_window=3900, # # max_new_tokens=256, # # model_kwargs={"quantization_config": quantization_config}, # # generate_kwargs={ # # "do_sample": True, # # "temperature": settings["Temperature"], # # "top_k": 50, # # "top_p": 0.95, # # }, # # messages_to_prompt=messages_to_prompt, # # device_map="auto", # # ) # llm = LiteLLM("gpt-3.5-turbo") # elif settings["Model"] == "litellm-gpt-3.5-turbo": # llm = LiteLLM("gpt-3.5-turbo") # elif settings["Model"] == "litellm-opt-125m": # llm = LiteLLM("vllm/facebook/opt-125m") # else: # llm = OpenAI(model=settings["Model"], temperature=settings["Temperature"]) # # embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5") # service_context = ServiceContext.from_defaults( # llm=llm, # # embed_model=embed_model, # callback_manager=CallbackManager([cl.LlamaIndexCallbackHandler()]), # ) # query_engine = index.as_query_engine( # service_context=service_context, # streaming=True, # ) # cl.user_session.set("query_engine", query_engine) # @cl.on_message # async def main(message: cl.Message): # query_engine = cl.user_session.get("query_engine") # if query_engine is None: # await start() # query_engine = cl.user_session.get("query_engine") # if query_engine: # query_result = await cl.make_async(query_engine.query)(message.content) # response_message = cl.Message(content=query_result.response_txt or "") # for token in query_result.response_gen: # await response_message.stream_token(token=token) # await response_message.send()