from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr

# Model repo and file on the Hugging Face Hub (GGML format, .bin file).
# llama-2-13b-chat.ggmlv3.q2_K.bin -> lighter | "llama-2-13b-chat.ggmlv3.q5_1.bin" -> heavier
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q2_K.bin"  # the model is in bin format

# Download (or reuse the cached copy of) the model weights.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

# GPU-enabled llama.cpp instance.
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2,      # CPU cores
    n_batch=512,      # Should be between 1 and n_ctx; consider the amount of VRAM in your GPU.
    n_gpu_layers=32,  # Change this value based on your model and your GPU VRAM pool.
)


def CustomChatGPT(Ask):
    """Answer an NBA question with the local Llama-2-13B chat model.

    Parameters
    ----------
    Ask : str
        The user's question, injected into the chat prompt template.

    Returns
    -------
    str
        The assistant's reply, limited to at most 100 generated tokens.
    """
    prompt_template = f'''SYSTEM: You are an NBA expert that helps answering questions about the NBA, its teams and its players summarizing the most important information limiting to no more that one hundred tokens.

USER: {Ask}

ASSISTANT:
'''
    # echo=False returns only the generated completion, so no fragile
    # split("ASSISTANT:")[1] post-processing of the echoed prompt is needed.
    response = lcpp_llm(
        prompt=prompt_template,
        max_tokens=100,
        temperature=0.3,
        top_p=0.95,
        repeat_penalty=1.2,
        top_k=150,
        echo=False,
    )
    return response["choices"][0]["text"].strip()


gui = gr.Interface(
    fn=CustomChatGPT,
    inputs="text",
    outputs="text",
    examples=[
        "Who is the greatest basketball player in NBA history",
        "What is the winning record in a season?",
    ],
    title="Ask the AI coach",
    description="Ask the AI coach all you want about NBA Teams and Players:",
)
gui.launch()