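# Gradio demo for meta-llama/Llama-2-7b-chat-hf running as a Hugging Face Space.
# Two commented-out variants of the app are kept below; the active implementation follows them.

# --- Commented-out variant 1: loads the model straight from the hub, no cache handling ---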
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from huggingface_hub import login
# import torch
# import os
# # Authenticate using environment variable
# login(token=os.getenv('HF_TOKEN'))
# # Load model (will use cached version if available)
# model_id = "meta-llama/Llama-2-7b-chat-hf"
# device = "cuda" if torch.cuda.is_available() else "cpu"
# def load_model():
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
#     return tokenizer, model
# tokenizer, model = load_model()
# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
# demo.launch(server_name="0.0.0.0", server_port=7860)
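
# --- Commented-out variant 2: tries a cache dataset repo first (tenacity retries plus a
# manual retry/backoff loop), then falls back to the original model repo ---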
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from huggingface_hub import login, hf_hub_download
# from tenacity import retry, stop_after_attempt, wait_exponential
# import torch
# import os
# import time  # needed for time.sleep in the manual backoff below
# # Authentication
# login(token=os.getenv('HF_TOKEN'))
# # Configuration
# CACHE_REPO = "Juna190825/cacheRepo" # Your dataset repo for cached models
# MODEL_ID = "meta-llama/Llama-2-7b-chat-hf" # Original model ID
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
# def load_model():
#     retries = 3
#     for attempt in range(retries):
#         try:
#             # First try loading from cache repo
#             model = AutoModelForCausalLM.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models",
#                 local_files_only=True
#             ).to(DEVICE)
#             tokenizer = AutoTokenizer.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models"
#             )
#             print("Loaded model from cache repo")
#             return model, tokenizer
#         except Exception as e:
#             if attempt == retries - 1:  # Final attempt
#                 print(f"Cache load failed: {str(e)}. Falling back to original repo")
#                 # Fallback to original repo
#                 model = AutoModelForCausalLM.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 ).to(DEVICE)
#                 tokenizer = AutoTokenizer.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 )
#                 return model, tokenizer
#             print(f"Attempt {attempt + 1} failed, retrying...")
#             time.sleep(2 ** attempt)  # Exponential backoff
# # Load model and tokenizer
# model, tokenizer = load_model()
# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
# demo.launch(server_name="0.0.0.0", server_port=7860)
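
# --- Active implementation: reuse the local model cache when populated, otherwise download ---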
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import torch
import os
# Authentication: HF_TOKEN should be set as a Space secret; a token with access to the
# gated meta-llama/Llama-2-7b-chat-hf repo is required to download the weights
login(token=os.getenv('HF_TOKEN'))
# Configuration
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/cache/models"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load model with automatic cache handling"""
    try:
        # First try with local files only (uses cache if available)
        print("Checking for cached model...")
        return AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True  # Will fail if not cached
        ).to(DEVICE), AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True
        )
    except OSError:
        # Fallback to download if not in cache
        print("Downloading model...")
        return AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        ).to(DEVICE), AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        )
# Load model
model, tokenizer = load_model()
def generate_text(prompt, max_length=200):
    """Generate a sampled completion for the prompt; max_length caps the new tokens."""
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        temperature=0.7,
        do_sample=True
    )
    # Note: outputs[0] contains the input ids as well, so the decoded string includes the prompt
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLaMA 2 7B Chat Demo")
    with gr.Row():
        input_text = gr.Textbox(label="Input Prompt", lines=3)
        output_text = gr.Textbox(label="Generated Response", lines=3)
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
demo.launch(server_name="0.0.0.0", server_port=7860)
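
# A minimal sketch of querying this demo programmatically with gradio_client, kept as
# comments because demo.launch() above blocks. The Space id "<user>/<space>" is a
# placeholder, and "/generate_text" assumes the default endpoint name that recent Gradio
# versions derive from the click handler's function name.
#
#     from gradio_client import Client
#     client = Client("<user>/<space>")
#     reply = client.predict("Write a haiku about llamas", api_name="/generate_text")
#     print(reply)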