# NOTE(review): the Hugging Face Spaces dashboard reported this app as
# "Runtime error" at startup — see the review notes in the code below.
# -*- coding: utf-8 -*-
"""Hugging Face Space App with CPU Quantization"""
import os
import gradio as gr
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Authenticate with the Hugging Face Hub using an access token taken from
# the environment; fail fast with a clear error when it is missing, since
# the gated Llama-2 weights below cannot be downloaded without it.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "未找到 Hugging Face 訪問令牌!請設置環境變數 'HF_TOKEN',或者直接提供有效的訪問令牌。"
    )
login(HF_TOKEN)
# Configure 4-bit quantization to shrink the 13B model's memory footprint.
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the quantized model; device_map="auto" lets accelerate decide where
# each weight shard lives.
# NOTE(review): bitsandbytes 4-bit loading normally requires a CUDA GPU,
# but the app title advertises CPU quantization — confirm this combination
# actually starts on a CPU-only Space (a likely cause of the reported
# "Runtime error").
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    token=HF_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
| # 定義推理函數 | |
| def generate_text(prompt): | |
| inputs = tokenizer(prompt, return_tensors="pt") | |
| outputs = model.generate( | |
| inputs.input_ids, | |
| max_length=200, | |
| num_beams=5, | |
| repetition_penalty=1.2, | |
| early_stopping=True | |
| ) | |
| return tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| # 使用 Gradio 構建界面 | |
| interface = gr.Interface( | |
| fn=generate_text, | |
| inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."), | |
| outputs="text", | |
| title="Llama 2 Text Generator (CPU Quantized)", | |
| description="Generate text using the Llama-2-13b-chat-hf model with CPU quantization hosted on Hugging Face Spaces." | |
| ) | |
| # 啟動應用 | |
| if __name__ == "__main__": | |
| interface.launch() | |