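# Gradio demo for meta-llama/Llama-2-7b-chat-hf.
# The model is loaded from a local cache directory when available and
# downloaded otherwise. An HF_TOKEN environment variable with access to
# the gated Llama 2 repository is required.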
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import torch
import os

# Authenticate against the Hub (HF_TOKEN must grant access to the gated repo)
login(token=os.getenv('HF_TOKEN'))

# Configuration
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/cache/models"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_model():
    """Load the model and tokenizer, preferring the local cache over a download."""
    try:
        # First try with local files only (succeeds only if already cached)
        print("Checking for cached model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True  # raises OSError if not cached
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True
        )
    except OSError:
        # Fall back to downloading into the cache directory
        print("Downloading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        )
    return model, tokenizer


# Load model and tokenizer once at startup
model, tokenizer = load_model()


def generate_text(prompt, max_new_tokens=200):
    """Generate a sampled completion; the decoded output includes the prompt."""
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLaMA 2 7B Chat Demo")
    with gr.Row():
        input_text = gr.Textbox(label="Input Prompt", lines=3)
        output_text = gr.Textbox(label="Generated Response", lines=3)
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)

demo.launch(server_name="0.0.0.0", server_port=7860)
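# Usage sketch (assumes the script is saved as app.py; the filename is an assumption):
#   HF_TOKEN=<your token> python app.py
# The interface is then served on port 7860, e.g. http://localhost:7860.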