# mylocalmodels/app.py
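# --- Earlier revision 1 (kept commented out for reference): direct load without cache handling ---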
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from huggingface_hub import login
# import torch
# import os
# # Authenticate using environment variable
# login(token=os.getenv('HF_TOKEN'))
# # Load model (will use cached version if available)
# model_id = "meta-llama/Llama-2-7b-chat-hf"
# device = "cuda" if torch.cuda.is_available() else "cpu"
# def load_model():
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
#     return tokenizer, model
# tokenizer, model = load_model()
# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
# demo.launch(server_name="0.0.0.0", server_port=7860)
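# --- Earlier revision 2 (kept commented out for reference): dedicated cache repo with retries ---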
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from huggingface_hub import login, hf_hub_download
# from tenacity import retry, stop_after_attempt, wait_exponential
# import torch
# import os
# import time  # needed for the manual exponential backoff in load_model
# # Authentication
# login(token=os.getenv('HF_TOKEN'))
# # Configuration
# CACHE_REPO = "Juna190825/cacheRepo" # Your dataset repo for cached models
# MODEL_ID = "meta-llama/Llama-2-7b-chat-hf" # Original model ID
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
# def load_model():
#     # Note: tenacity retries the whole loader; the inner loop additionally
#     # retries just the cache load before falling back to the original repo.
#     retries = 3
#     for attempt in range(retries):
#         try:
#             # First try loading from cache repo
#             model = AutoModelForCausalLM.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models",
#                 local_files_only=True
#             ).to(DEVICE)
#             tokenizer = AutoTokenizer.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models"
#             )
#             print("Loaded model from cache repo")
#             return model, tokenizer
#         except Exception as e:
#             if attempt == retries - 1:  # Final attempt
#                 print(f"Cache load failed: {str(e)}. Falling back to original repo")
#                 # Fallback to original repo
#                 model = AutoModelForCausalLM.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 ).to(DEVICE)
#                 tokenizer = AutoTokenizer.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 )
#                 return model, tokenizer
#             print(f"Attempt {attempt + 1} failed, retrying...")
#             time.sleep(2 ** attempt)  # Exponential backoff
# # Load model and tokenizer
# model, tokenizer = load_model()
# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
# demo.launch(server_name="0.0.0.0", server_port=7860)
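# --- Active version: standard HF cache with a download fallback ---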
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import torch
import os
# Authentication (Llama 2 is a gated model, so HF_TOKEN must be set, e.g. as a Space secret)
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
# Configuration
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/cache/models"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
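# NOTE: CACHE_DIR assumes a writable /cache/models path prepared by the Dockerfile;
# adjust it if your container keeps the Hugging Face cache elsewhere.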
def load_model():
    """Load model with automatic cache handling"""
    try:
        # First try with local files only (uses the cache if available)
        print("Checking for cached model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True  # Raises OSError if the model is not cached
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True
        )
    except OSError:
        # Fall back to downloading if the files are not in the cache
        print("Downloading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        )
    return model, tokenizer
# Load model
model, tokenizer = load_model()
def generate_text(prompt, max_length=200):
    """Generate a response; max_length actually caps the number of new tokens."""
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        temperature=0.7,
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
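# Example usage (illustrative, assuming the model above loaded successfully):
#   generate_text("Explain what a tokenizer does.", max_length=100)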
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLaMA 2 7B Chat Demo")
    with gr.Row():
        input_text = gr.Textbox(label="Input Prompt", lines=3)
        output_text = gr.Textbox(label="Generated Response", lines=3)
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
demo.launch(server_name="0.0.0.0", server_port=7860)
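# 7860 is the default port Hugging Face Spaces expects; keep it in sync with the
# EXPOSE/app_port setting in the Dockerfile if you change it.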