# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from huggingface_hub import login
# import torch
# import os

# # Authenticate using environment variable
# login(token=os.getenv('HF_TOKEN'))

# # Load model (will use cached version if available)
# model_id = "meta-llama/Llama-2-7b-chat-hf"
# device = "cuda" if torch.cuda.is_available() else "cpu"

# def load_model():
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
#     return tokenizer, model

# tokenizer, model = load_model()

# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)

# demo.launch(server_name="0.0.0.0", server_port=7860)


# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from huggingface_hub import login
# from tenacity import retry, stop_after_attempt, wait_exponential
# import torch
# import os
# import time

# # Authentication
# login(token=os.getenv('HF_TOKEN'))

# # Configuration
# CACHE_REPO = "Juna190825/cacheRepo"  # Your dataset repo for cached models
# MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"  # Original model ID
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
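# # NOTE: the tenacity decorator already retries load_model() up to 3 times;
# # the manual attempt loop below adds its own retries and backoff inside each
# # call, so the two retry layers stack.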
# def load_model():
#     retries = 3
#     for attempt in range(retries):
#         try:
#             # First try loading from cache repo
#             model = AutoModelForCausalLM.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models",
#                 local_files_only=True
#             ).to(DEVICE)
#             tokenizer = AutoTokenizer.from_pretrained(
#                 CACHE_REPO,
#                 cache_dir="/cache/models"
#             )
#             print("Loaded model from cache repo")
#             return model, tokenizer
#         except Exception as e:
#             if attempt == retries - 1:  # Final attempt
#                 print(f"Cache load failed: {str(e)}. Falling back to original repo")
#                 # Fallback to original repo
#                 model = AutoModelForCausalLM.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 ).to(DEVICE)
#                 tokenizer = AutoTokenizer.from_pretrained(
#                     MODEL_ID,
#                     cache_dir="/cache/models"
#                 )
#                 return model, tokenizer
#             print(f"Attempt {attempt + 1} failed, retrying...")
#             time.sleep(2 ** attempt)  # Exponential backoff

# # Load model and tokenizer
# model, tokenizer = load_model()

# def generate_text(prompt, max_length=200):
#     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_length,
#         temperature=0.7,
#         do_sample=True
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# # Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("# LLaMA 2 7B Chat Demo")
#     with gr.Row():
#         input_text = gr.Textbox(label="Input Prompt", lines=3)
#         output_text = gr.Textbox(label="Generated Response", lines=3)
#     generate_btn = gr.Button("Generate")
#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)

# demo.launch(server_name="0.0.0.0", server_port=7860)

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import torch
import os

# Authentication (the meta-llama repo is gated, so HF_TOKEN must be set,
# e.g. as a Space secret)
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(token=hf_token)

# Configuration
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/cache/models"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def load_model():
    """Load the model and tokenizer, preferring the local cache and
    falling back to a download on a cache miss."""
    try:
        # First try with local files only (uses the cache if available)
        print("Checking for cached model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True  # Raises OSError if not cached
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR,
            local_files_only=True
        )
    except OSError:
        # Fall back to downloading if the model is not in the cache
        print("Downloading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            cache_dir=CACHE_DIR
        )
    return model, tokenizer
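
# A 7B model in full fp32 needs roughly 28 GB of memory, more than most small
# Spaces provide. On a GPU, half precision is the usual fix. A minimal sketch,
# assuming enough VRAM (torch_dtype is a standard from_pretrained kwarg;
# behavior is otherwise unchanged):
#
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_ID,
#         cache_dir=CACHE_DIR,
#         torch_dtype=torch.float16  # halves weight memory vs. fp32
#     ).to(DEVICE)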

# Load model
model, tokenizer = load_model()

def generate_text(prompt, max_new_tokens=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
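
# Llama-2 chat checkpoints were fine-tuned on a specific prompt template, so
# raw prompts work but wrapped ones usually answer better. A minimal sketch of
# the single-turn template from the Llama 2 model card (hypothetical helper,
# not wired into the UI below; the tokenizer adds the leading <s> itself):
#
#     def format_chat_prompt(user_msg: str) -> str:
#         return f"[INST] {user_msg} [/INST]"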

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLaMA 2 7B Chat Demo")
    with gr.Row():
        input_text = gr.Textbox(label="Input Prompt", lines=3)
        output_text = gr.Textbox(label="Generated Response", lines=3)
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)

demo.launch(server_name="0.0.0.0", server_port=7860)
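
# With multiple simultaneous users, generate() calls would otherwise compete
# for the same model. Gradio's built-in request queue serializes them; a
# minimal sketch (drop-in replacement for the launch call above):
#
#     demo.queue().launch(server_name="0.0.0.0", server_port=7860)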