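# Chainlit + LlamaIndex chatbot for the China Life FAQ page. Everything below
# the torch import is temporarily commented out; the only active code is a
# smoke test of the torch/CUDA installation.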
# import os
# import chainlit as cl
# import openai
import torch
# from chainlit.input_widget import Select, Slider
# from llama_index import (
#     ServiceContext,
#     StorageContext,
#     TrafilaturaWebReader,
#     VectorStoreIndex,
#     load_index_from_storage,
# )
# from llama_index.callbacks.base import CallbackManager
# from llama_index.embeddings import HuggingFaceEmbedding
# from llama_index.llms import HuggingFaceLLM, LiteLLM, MessageRole, OpenAI
# from llama_index.prompts import PromptTemplate
# from transformers import BitsAndBytesConfig
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# def get_api_key():
#     api_key = os.getenv("OPENAI_API_KEY")
#     if api_key is None:
#         print("OPENAI_API_KEY missing from environment variables")
#         api_key = input("Please enter your OPENAI_API_KEY: ")
#     return api_key

# openai.api_key = get_api_key()
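# Index loading: prefer the vector index persisted under ./storage; on a cold
# start, scrape the FAQ page, build a fresh index, and persist it for reuse.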
# def load_index():
#     try:
#         storage_context = StorageContext.from_defaults(persist_dir="./storage")
#         index = load_index_from_storage(storage_context)
#     except FileNotFoundError:
#         print("Storage file not found. Loading from web.")
#         documents = TrafilaturaWebReader().load_data(["https://bit.ly/45BncJA"])
#         index = VectorStoreIndex.from_documents(documents)
#         index.storage_context.persist()
#     return index

# index = load_index()
# welcome_msg = (
#     "Hi there! I’m your China Life chatbot, specialising in answering "
#     "[frequently asked questions](https://bit.ly/45BncJA). "
#     "How may I assist you today? "
#     "Feel free to ask questions like, "
#     "“Is there any action required after receiving the policy?” or "
#     "“Can I settle using a demand draft?”"
# )
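# Chat-start handler: streams the welcome message one character at a time
# (10 ms apart), then presents the model/temperature settings panel.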
# @cl.on_chat_start
# async def start():
#     chat_profile = cl.user_session.get("chat_profile")
#     msg = cl.Message(content="")
#     for token in list(welcome_msg):
#         await cl.sleep(0.01)
#         await msg.stream_token(token)
#     await msg.send()
#     settings = await cl.ChatSettings(
#         [
#             Select(
#                 id="Model",
#                 label="Model",
#                 values=[
#                     "gpt-3.5-turbo",
#                     "gpt-4",
#                     "zephyr",
#                     "litellm-gpt-3.5-turbo",
#                     "litellm-opt-125m",
#                 ],
#                 initial_index=1,
#             ),
#             Slider(
#                 id="Temperature",
#                 label="Temperature",
#                 initial=0.0,
#                 min=0.0,
#                 max=2.0,
#                 step=0.1,
#             ),
#         ]
#     ).send()
#     await setup_query_engine(settings)
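# Settings handler: rebuilds the LLM and the query engine whenever the user
# changes the model or temperature.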
# @cl.on_settings_update
# async def setup_query_engine(settings):
#     print("on_settings_update", settings)
#     # def messages_to_prompt(messages):
#     #     prompt = ""
#     #     for message in messages:
#     #         if message.role == "system":
#     #             prompt += f"<|system|>\n{message.content}</s>\n"
#     #         elif message.role == "user":
#     #             prompt += f"<|user|>\n{message.content}</s>\n"
#     #         elif message.role == "assistant":
#     #             prompt += f"<|assistant|>\n{message.content}</s>\n"
#     #     if not prompt.startswith("<|system|>\n"):
#     #         prompt = "<|system|>\n</s>\n" + prompt
#     #     prompt = prompt + "<|assistant|>\n"
#     #     return prompt
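# messages_to_prompt renders chat history in Zephyr's chat template; a single
# user turn "Hello" would become (derived from the code above):
#   <|system|>
#   </s>
#   <|user|>
#   Hello</s>
#   <|assistant|>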
#     if settings["Model"] == "zephyr":
#         # model_name = "HuggingFaceH4/zephyr-7b-beta"
#         # query_wrapper_prompt = PromptTemplate(
#         #     "<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"
#         # )
#         # quantization_config = BitsAndBytesConfig(
#         #     load_in_4bit=True,
#         #     bnb_4bit_compute_dtype=torch.bfloat16,
#         #     bnb_4bit_quant_type="nf4",
#         #     bnb_4bit_use_double_quant=True,
#         # )
#         # llm = HuggingFaceLLM(
#         #     model_name=model_name,
#         #     tokenizer_name=model_name,
#         #     query_wrapper_prompt=query_wrapper_prompt,
#         #     context_window=3900,
#         #     max_new_tokens=256,
#         #     model_kwargs={"quantization_config": quantization_config},
#         #     generate_kwargs={
#         #         "do_sample": True,
#         #         "temperature": settings["Temperature"],
#         #         "top_k": 50,
#         #         "top_p": 0.95,
#         #     },
#         #     messages_to_prompt=messages_to_prompt,
#         #     device_map="auto",
#         # )
#         llm = LiteLLM("gpt-3.5-turbo")
#     elif settings["Model"] == "litellm-gpt-3.5-turbo":
#         llm = LiteLLM("gpt-3.5-turbo")
#     elif settings["Model"] == "litellm-opt-125m":
#         llm = LiteLLM("vllm/facebook/opt-125m")
#     else:
#         llm = OpenAI(model=settings["Model"], temperature=settings["Temperature"])
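#     # Note: with the local HuggingFaceLLM path disabled above, selecting
#     # "zephyr" currently falls through to LiteLLM's gpt-3.5-turbo route.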
#     # embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
#     service_context = ServiceContext.from_defaults(
#         llm=llm,
#         # embed_model=embed_model,
#         callback_manager=CallbackManager([cl.LlamaIndexCallbackHandler()]),
#     )
#     query_engine = index.as_query_engine(
#         service_context=service_context,
#         streaming=True,
#     )
#     cl.user_session.set("query_engine", query_engine)
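# The streaming query engine is stored in the user session, so each incoming
# message reuses it rather than rebuilding it per query.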
# @cl.on_message
# async def main(message: cl.Message):
#     query_engine = cl.user_session.get("query_engine")
#     if query_engine is None:
#         await start()
#         query_engine = cl.user_session.get("query_engine")
#     if query_engine:
#         query_result = await cl.make_async(query_engine.query)(message.content)
#         response_message = cl.Message(content=query_result.response_txt or "")
#         for token in query_result.response_gen:
#             await response_message.stream_token(token=token)
#         await response_message.send()
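# To launch once the app code is re-enabled (assuming the standard Chainlit
# CLI): chainlit run app.py -w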