| # import os | |
| # import chainlit as cl | |
| # import openai | |
| import torch | |
| # from chainlit.input_widget import Select, Slider | |
| # from llama_index import ( | |
| # ServiceContext, | |
| # StorageContext, | |
| # TrafilaturaWebReader, | |
| # VectorStoreIndex, | |
| # load_index_from_storage, | |
| # ) | |
| # from llama_index.callbacks.base import CallbackManager | |
| # from llama_index.embeddings import HuggingFaceEmbedding | |
| # from llama_index.llms import HuggingFaceLLM, LiteLLM, MessageRole, OpenAI | |
| # from llama_index.prompts import PromptTemplate | |
| # from transformers import BitsAndBytesConfig | |
# Startup diagnostic: report whether CUDA is usable and, if so, which GPU
# torch will use. Guarding on is_available() is required because
# torch.cuda.current_device() raises a RuntimeError on CPU-only machines
# (the original unconditional call crashed there).
cuda_available = torch.cuda.is_available()
print(f"Is CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("CUDA device: none (running on CPU)")
| # def get_api_key(): | |
| # api_key = os.getenv("OPENAI_API_KEY") | |
| # if api_key is None: | |
| # print("OPENAI_API_KEY missing from environment variables") | |
| # api_key = input("Please enter your OPENAI_API_KEY: ") | |
| # return api_key | |
| # openai.api_key = get_api_key() | |
| # def load_index(): | |
| # try: | |
| # storage_context = StorageContext.from_defaults(persist_dir="./storage") | |
| # index = load_index_from_storage(storage_context) | |
| # except FileNotFoundError: | |
| # print("Storage file not found. Loading from web.") | |
| # documents = TrafilaturaWebReader().load_data(["https://bit.ly/45BncJA"]) | |
| # index = VectorStoreIndex.from_documents(documents) | |
| # index.storage_context.persist() | |
| # return index | |
| # index = load_index() | |
| # welcome_msg = ( | |
| # "Hi there! I’m your China Life chatbot, specialising in answering " | |
| # "[frequently asked questions](https://bit.ly/45BncJA). " | |
| # "How may I assist you today? " | |
| # "Feel free to ask questions like, " | |
| # "“Is there any action required after receiving the policy?” or " | |
| # "“Can I settle using a demand draft?”" | |
| # ) | |
| # @cl.on_chat_start | |
| # async def start(): | |
| # chat_profile = cl.user_session.get("chat_profile") | |
| # msg = cl.Message(content="") | |
| # for token in list(welcome_msg): | |
| # await cl.sleep(0.01) | |
| # await msg.stream_token(token) | |
| # await msg.send() | |
| # settings = await cl.ChatSettings( | |
| # [ | |
| # Select( | |
| # id="Model", | |
| # label="Model", | |
| # values=[ | |
| # "gpt-3.5-turbo", | |
| # "gpt-4", | |
| # "zephyr", | |
| # "litellm-gpt-3.5-turbo", | |
| # "litellm-opt-125m", | |
| # ], | |
| # initial_index=1, | |
| # ), | |
| # Slider( | |
| # id="Temperature", | |
| # label="Temperature", | |
| # initial=0.0, | |
| # min=0.0, | |
| # max=2.0, | |
| # step=0.1, | |
| # ), | |
| # ] | |
| # ).send() | |
| # await setup_query_engine(settings) | |
| # @cl.on_settings_update | |
| # async def setup_query_engine(settings): | |
| # print("on_settings_update", settings) | |
| # # def messages_to_prompt(messages): | |
| # # prompt = "" | |
| # # for message in messages: | |
| # # if message.role == "system": | |
| # # prompt += f"<|system|>\n{message.content}</s>\n" | |
| # # elif message.role == "user": | |
| # # prompt += f"<|user|>\n{message.content}</s>\n" | |
| # # elif message.role == "assistant": | |
| # # prompt += f"<|assistant|>\n{message.content}</s>\n" | |
| # # if not prompt.startswith("<|system|>\n"): | |
| # # prompt = "<|system|>\n</s>\n" + prompt | |
| # # prompt = prompt + "<|assistant|>\n" | |
| # # return prompt | |
| # if settings["Model"] == "zephyr": | |
| # # model_name = "HuggingFaceH4/zephyr-7b-beta" | |
| # # query_wrapper_prompt = PromptTemplate( | |
| # # "<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n" | |
| # # ) | |
| # # quantization_config = BitsAndBytesConfig( | |
| # # load_in_4bit=True, | |
| # # bnb_4bit_compute_dtype=torch.bfloat16, | |
| # # bnb_4bit_quant_type="nf4", | |
| # # bnb_4bit_use_double_quant=True, | |
| # # ) | |
| # # llm = HuggingFaceLLM( | |
| # # model_name=model_name, | |
| # # tokenizer_name=model_name, | |
| # # query_wrapper_prompt=query_wrapper_prompt, | |
| # # context_window=3900, | |
| # # max_new_tokens=256, | |
| # # model_kwargs={"quantization_config": quantization_config}, | |
| # # generate_kwargs={ | |
| # # "do_sample": True, | |
| # # "temperature": settings["Temperature"], | |
| # # "top_k": 50, | |
| # # "top_p": 0.95, | |
| # # }, | |
| # # messages_to_prompt=messages_to_prompt, | |
| # # device_map="auto", | |
| # # ) | |
| # llm = LiteLLM("gpt-3.5-turbo") | |
| # elif settings["Model"] == "litellm-gpt-3.5-turbo": | |
| # llm = LiteLLM("gpt-3.5-turbo") | |
| # elif settings["Model"] == "litellm-opt-125m": | |
| # llm = LiteLLM("vllm/facebook/opt-125m") | |
| # else: | |
| # llm = OpenAI(model=settings["Model"], temperature=settings["Temperature"]) | |
| # # embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5") | |
| # service_context = ServiceContext.from_defaults( | |
| # llm=llm, | |
| # # embed_model=embed_model, | |
| # callback_manager=CallbackManager([cl.LlamaIndexCallbackHandler()]), | |
| # ) | |
| # query_engine = index.as_query_engine( | |
| # service_context=service_context, | |
| # streaming=True, | |
| # ) | |
| # cl.user_session.set("query_engine", query_engine) | |
| # @cl.on_message | |
| # async def main(message: cl.Message): | |
| # query_engine = cl.user_session.get("query_engine") | |
| # if query_engine is None: | |
| # await start() | |
| # query_engine = cl.user_session.get("query_engine") | |
| # if query_engine: | |
| # query_result = await cl.make_async(query_engine.query)(message.content) | |
| # response_message = cl.Message(content=query_result.response_txt or "") | |
| # for token in query_result.response_gen: | |
| # await response_message.stream_token(token=token) | |
| # await response_message.send() | |