Spaces: Runtime error
import os

from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import gradio as gr

# Read the Hugging Face token from the environment and authenticate
# (required for the gated meta-llama/Llama-2-7b-chat-hf weights)
load_dotenv()
API_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
login(API_TOKEN)

model_id = "meta-llama/Llama-2-7b-chat-hf"

# Allow modules offloaded to the CPU to stay in fp32 when int8 quantization is used
quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

# Pin every module to the CPU (keys cover both BLOOM-style "transformer.*"
# and Llama-style "model.*" module names)
device_map = {
    "transformer.word_embeddings": "cpu",
    "transformer.word_embeddings_layernorm": "cpu",
    "lm_head": "cpu",
    "transformer.h": "cpu",
    "transformer.ln_f": "cpu",
    "model.embed_tokens": "cpu",
    "model.layers": "cpu",
    "model.norm": "cpu",
}

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

generate_text_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    temperature=0.1,
    max_new_tokens=512,
    repetition_penalty=1.1,  # without this the output begins repeating
)

def get_results(text):
    res = generate_text_pipeline(text)
    return res[0]["generated_text"]

iface = gr.Interface(fn=get_results, inputs="text", outputs="text")
iface.launch()
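
If it helps to exercise the app without the browser UI, a minimal sketch using gradio_client; it assumes the Interface above is running at Gradio's default local URL and exposes the default /predict endpoint (adjust both if your setup differs):

from gradio_client import Client

client = Client("http://127.0.0.1:7860")       # URL printed by iface.launch(); assumed default
result = client.predict(
    "Hello, who are you?",                     # same text input that get_results() receives
    api_name="/predict",                       # default endpoint name for a single-function gr.Interface
)
print(result)

This only confirms the endpoint responds end to end; if the Space itself shows "Runtime error", the container logs are still the place to look for the underlying failure.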