Spaces:
Running
Running
| import gradio as gr | |
| import copy | |
| import random | |
| import os | |
| import requests | |
| import time | |
| import sys | |
| from huggingface_hub import snapshot_download | |
| from llama_cpp import Llama | |
# Hugging Face repository that hosts the quantized GGUF weights.
repo_name = "kirp/TinyLlama-1.1B-Chat-v0.2-gguf"

# File fetched from that repository.
# Alternative file name kept for reference: "ggml-model-q4_k_m.gguf".
model_name = "ggml-model-q2_k.gguf"

# Fetch only the file matching model_name into the current directory;
# the Llama constructor below is given the same name as a relative path.
snapshot_download(
    repo_id=repo_name,
    allow_patterns=model_name,
    local_dir=".",
)

# Module-level model instance used by generate().
model = Llama(model_path=model_name, n_ctx=2048, n_parts=1)

# Single-turn prompt layout; "{}" is filled with the user's message.
template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
def generate(
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    max_new_tokens=512,
):
    """Run one chat turn through the module-level ``model`` and return its reply.

    The user message is inserted into the module-level ``template`` and the
    resulting prompt is passed to ``model``. Generation stops when the
    "<|im_end|>" marker is produced.

    Args:
        input: The user's message. ``None`` is treated as an empty message.
            (The name shadows the builtin but is kept so keyword callers
            continue to work.)
        temperature: Sampling temperature, forwarded to the model call.
        top_p: Nucleus-sampling threshold, forwarded to the model call.
        top_k: Top-k sampling cutoff, forwarded to the model call.
        max_new_tokens: Upper bound on the number of generated tokens,
            forwarded as ``max_tokens``.

    Returns:
        The generated completion text, without the prompt.
    """
    # The default of None previously crashed on len(None); treat it as "".
    user_text = "" if input is None else str(input)
    prompt = template.format(user_text)

    completion = model(
        prompt,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        # max_tokens bounds generated tokens, so the prompt's character
        # length must not be added to it.
        max_tokens=max_new_tokens,
        stop=["<|im_end|>"],
        # Ask for the completion only. Echoing the prompt and splitting on
        # "assistant\n" returned the wrong segment whenever the user's
        # message (or the reply itself) contained that string.
        echo=False,
    )
    return completion["choices"][0]["text"]
# Input widgets, listed in the same order as generate()'s parameters:
# input, temperature, top_p, top_k, max_new_tokens.
_demo_inputs = [
    gr.components.Textbox(
        lines=2, label="Prompt", placeholder="Tell me about huggingface."
    ),
    gr.components.Slider(minimum=0, maximum=1, value=0.7, label="Temperature"),
    gr.components.Slider(minimum=0, maximum=1, value=0.8, label="Top p"),
    gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
    gr.components.Slider(
        minimum=1, maximum=2048, step=1, value=512, label="Max tokens"
    ),
]

# Single text area that shows the string returned by generate().
_demo_outputs = [
    gr.Textbox(
        lines=10,
        label="Output",
    )
]

# Markdown links to the original and quantized model pages.
_demo_description = """
original model: [PY007/TinyLlama-1.1B-Chat-v0.2](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)
quantized_model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf](https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf)
"""

g = gr.Interface(
    fn=generate,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
    title="TinyLlama 1.1B Chat GGUF",
    description=_demo_description,
)

# Enable request queuing with the configured concurrency, then start the app.
g.queue(concurrency_count=2)
g.launch()