import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Disable CUDA so inference runs on the CPU only.
os.environ["LLAMA_CPP_USE_CUDA"] = "0"
| title = "SmolLM 2 - Bulgarian Joke Master - GGUF" | |
| description = """ | |
| 🔎 [SmolLM 2](https://huggingface.co/unsloth/SmolLM2-135M-Instruct-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.\n | |
| This model is fine-tuned for generating humorous content in Bulgarian, utilizing the [Llama.cpp library](https://github.com/ggerganov/llama.cpp).\n | |
| Running on CPU, it can still produce impressive results, although larger models may require more processing power. | |
| """ | |
| model_dir = "models" | |
| model_name = "unsloth.Q4_K_M.gguf" | |
| model_path = os.path.join(model_dir, model_name) | |
| hf_hub_download( | |
| repo_id="vislupus/bulgarian-joke-master-SmolLM2-135M-Instruct-bnb-4bit-gguf", | |
| filename=model_name, | |
| local_dir=model_dir | |
| ) | |
| if not os.path.exists(model_path): | |
| raise FileNotFoundError(f"Model file not found at {model_path}") | |
| llm = Llama(model_path=model_path) | |

def generate_response(message, history, temperature=0.7, top_p=1.0, max_tokens=1280):
    """Generate a reply for gr.ChatInterface; `history` is passed by Gradio but not used here."""
    try:
        response = llm(message, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {str(e)}"

if __name__ == "__main__":
    # Wrap the generator in a simple chat UI.
    gguf_demo = gr.ChatInterface(
        generate_response,
        title=title,
        description=description,
    )
    gguf_demo.launch(share=True)