from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr

# Model repo and file on the Hugging Face Hub (GGML format, .bin file).
# llama-2-13b-chat.ggmlv3.q2_K.bin -> lighter | "llama-2-13b-chat.ggmlv3.q5_1.bin" -> heavier
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q2_K.bin"  # the model is in bin format

# Download (or reuse the cached copy of) the model weights.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

# GPU-enabled llama.cpp instance.
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2,      # CPU cores
    n_batch=512,      # Should be between 1 and n_ctx; consider the amount of VRAM in your GPU.
    n_gpu_layers=32,  # Change this value based on your model and your GPU VRAM pool.
)


def CustomChatGPT(Ask):
    """Answer an NBA question with the local Llama-2-13B chat model.

    Parameters
    ----------
    Ask : str
        The user's question, injected into the chat prompt template.

    Returns
    -------
    str
        The assistant's reply, limited to at most 100 generated tokens.
    """
    prompt_template = f'''SYSTEM: You are an NBA expert that helps answering questions about the NBA, its teams and its players summarizing the most important information limiting to no more that one hundred tokens.

USER: {Ask}

ASSISTANT:
'''
    # echo=False returns only the generated completion, so no fragile
    # split("ASSISTANT:")[1] post-processing of the echoed prompt is needed.
    response = lcpp_llm(
        prompt=prompt_template,
        max_tokens=100,
        temperature=0.3,
        top_p=0.95,
        repeat_penalty=1.2,
        top_k=150,
        echo=False,
    )
    return response["choices"][0]["text"].strip()


gui = gr.Interface(
    fn=CustomChatGPT,
    inputs="text",
    outputs="text",
    examples=[
        "Who is the greatest basketball player in NBA history",
        "What is the winning record in a season?",
    ],
    title="Ask the AI coach",
    description="Ask the AI coach all you want about NBA Teams and Players:",
)
gui.launch()