# Hugging Face Space: a Gradio app that answers NBA questions using a
# quantized Llama-2-13B-chat GGML model served via llama-cpp-python.
# (Header reconstructed; original lines were Spaces error-page residue.)
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr

# Model to fetch from the Hugging Face Hub (GGML, pre-quantized).
# q2_K is the lightest quantization; "llama-2-13b-chat.ggmlv3.q5_1.bin"
# is heavier but higher quality.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q2_K.bin"

# Download the weights (or reuse the local hub cache) and get a file path.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

# Load the model once at startup; the instance is shared by every request.
# (The original `lcpp_llm = None` pre-assignment was dead code and is removed.)
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2,      # CPU cores used for inference
    n_batch=512,      # Should be between 1 and n_ctx; consider GPU VRAM.
    n_gpu_layers=32,  # Layers offloaded to the GPU; tune to your VRAM pool.
)
def CustomChatGPT(Ask):
    """Answer an NBA question with the local Llama-2 chat model.

    Parameters
    ----------
    Ask : str
        The user's question, inserted into the chat prompt template.

    Returns
    -------
    str
        The assistant's reply (capped at 100 tokens), with the echoed
        prompt stripped off and surrounding whitespace removed.
    """
    prompt_template = f'''SYSTEM: You are an NBA expert that helps answering questions about the NBA, its teams and its players summarizing the most important information limiting to no more that one hundred tokens.
USER: {Ask}
ASSISTANT:
'''
    response = lcpp_llm(
        prompt=prompt_template,
        max_tokens=100,
        temperature=0.3,
        top_p=0.95,
        repeat_penalty=1.2,
        top_k=150,
        echo=True,  # output starts with the full prompt; split off below
    )
    text = response["choices"][0]["text"]
    # With echo=True the prompt is echoed back, so the reply is whatever
    # follows the "ASSISTANT:" marker. Guard against the marker being
    # missing — the original `.split(...)[1]` raised IndexError then.
    parts = text.split("ASSISTANT:")
    reply = parts[1] if len(parts) > 1 else text
    return reply.strip()
# Wire the chat function into a simple text-in / text-out Gradio UI
# and start the web server.
demo = gr.Interface(
    fn=CustomChatGPT,
    inputs="text",
    outputs="text",
    examples=[
        "Who is the greatest basketball player in NBA history",
        "What is the winning record in a season?",
    ],
    title="Ask the AI coach",
    description="Ask the AI coach all you want about NBA Teams and Players:",
)
demo.launch()