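# Gradio demo for meta-llama/Llama-2-7b-chat-hf.
# The model is loaded from a local cache directory when available and
# downloaded otherwise. An HF_TOKEN environment variable with access to
# the gated Llama 2 repository is required.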
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import torch
import os

# Authenticate against the Hub (HF_TOKEN must grant access to the gated repo)
login(token=os.getenv('HF_TOKEN'))

# Configuration
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/cache/models"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_model():
    """Load the model and tokenizer, preferring the local cache over a download."""
    try:
        # First try with local files only (succeeds only if already cached)
        print("Checking for cached model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True  # raises OSError if not cached
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True
        )
    except OSError:
        # Fall back to downloading into the cache directory
        print("Downloading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        )
    return model, tokenizer


# Load model and tokenizer once at startup
model, tokenizer = load_model()


def generate_text(prompt, max_new_tokens=200):
    """Generate a sampled completion; the decoded output includes the prompt."""
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLaMA 2 7B Chat Demo")
    with gr.Row():
        input_text = gr.Textbox(label="Input Prompt", lines=3)
        output_text = gr.Textbox(label="Generated Response", lines=3)
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)

demo.launch(server_name="0.0.0.0", server_port=7860)
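# Usage sketch (assumes the script is saved as app.py; the filename is an assumption):
#   HF_TOKEN=<your token> python app.py
# The interface is then served on port 7860, e.g. http://localhost:7860.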