Juna190825 committed
Commit d39dd11 · verified · 1 Parent(s): 11dc29d

Update Dockerfile

Files changed (1): app.py +92 -32
app.py CHANGED
@@ -41,56 +41,116 @@
 # demo.launch(server_name="0.0.0.0", server_port=7860)


+# import gradio as gr
+# from transformers import AutoModelForCausalLM, AutoTokenizer
+# from huggingface_hub import login, hf_hub_download
+# from tenacity import retry, stop_after_attempt, wait_exponential
+# import torch
+# import os
+
+# # Authentication
+# login(token=os.getenv('HF_TOKEN'))
+
+# # Configuration
+# CACHE_REPO = "Juna190825/cacheRepo"  # Your dataset repo for cached models
+# MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"  # Original model ID
+# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+# def load_model():
+#     retries = 3
+#     for attempt in range(retries):
+#         try:
+#             # First try loading from cache repo
+#             model = AutoModelForCausalLM.from_pretrained(
+#                 CACHE_REPO,
+#                 cache_dir="/cache/models",
+#                 local_files_only=True
+#             ).to(DEVICE)
+#             tokenizer = AutoTokenizer.from_pretrained(
+#                 CACHE_REPO,
+#                 cache_dir="/cache/models"
+#             )
+#             print("Loaded model from cache repo")
+#             return model, tokenizer
+#         except Exception as e:
+#             if attempt == retries - 1:  # Final attempt
+#                 print(f"Cache load failed: {str(e)}. Falling back to original repo")
+#                 # Fallback to original repo
+#                 model = AutoModelForCausalLM.from_pretrained(
+#                     MODEL_ID,
+#                     cache_dir="/cache/models"
+#                 ).to(DEVICE)
+#                 tokenizer = AutoTokenizer.from_pretrained(
+#                     MODEL_ID,
+#                     cache_dir="/cache/models"
+#                 )
+#                 return model, tokenizer
+#             print(f"Attempt {attempt + 1} failed, retrying...")
+#             time.sleep(2 ** attempt)  # Exponential backoff
+
+# # Load model and tokenizer
+# model, tokenizer = load_model()
+
+# def generate_text(prompt, max_length=200):
+#     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+#     outputs = model.generate(
+#         **inputs,
+#         max_new_tokens=max_length,
+#         temperature=0.7,
+#         do_sample=True
+#     )
+#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+# # Gradio interface
+# with gr.Blocks() as demo:
+#     gr.Markdown("# LLaMA 2 7B Chat Demo")
+#     with gr.Row():
+#         input_text = gr.Textbox(label="Input Prompt", lines=3)
+#         output_text = gr.Textbox(label="Generated Response", lines=3)
+#     generate_btn = gr.Button("Generate")
+#     generate_btn.click(fn=generate_text, inputs=input_text, outputs=output_text)
+
+# demo.launch(server_name="0.0.0.0", server_port=7860)
+
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import login, hf_hub_download
-from tenacity import retry, stop_after_attempt, wait_exponential
+from huggingface_hub import login
 import torch
 import os
+import time  # For manual retries

 # Authentication
 login(token=os.getenv('HF_TOKEN'))

 # Configuration
-CACHE_REPO = "Juna190825/cacheRepo"  # Your dataset repo for cached models
-MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"  # Original model ID
+MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
+CACHE_DIR = "/cache/models"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

-@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
-def load_model():
-    retries = 3
-    for attempt in range(retries):
+def load_model_with_retry(max_retries=3):
+    for attempt in range(max_retries):
         try:
-            # First try loading from cache repo
+            # Try loading from cache first
             model = AutoModelForCausalLM.from_pretrained(
-                CACHE_REPO,
-                cache_dir="/cache/models",
-                local_files_only=True
+                MODEL_ID,
+                cache_dir=CACHE_DIR,
+                local_files_only=(attempt > 0)  # Only check cache after first fail
             ).to(DEVICE)
             tokenizer = AutoTokenizer.from_pretrained(
-                CACHE_REPO,
-                cache_dir="/cache/models"
+                MODEL_ID,
+                cache_dir=CACHE_DIR
             )
-            print("Loaded model from cache repo")
             return model, tokenizer
         except Exception as e:
-            if attempt == retries - 1:  # Final attempt
-                print(f"Cache load failed: {str(e)}. Falling back to original repo")
-                # Fallback to original repo
-                model = AutoModelForCausalLM.from_pretrained(
-                    MODEL_ID,
-                    cache_dir="/cache/models"
-                ).to(DEVICE)
-                tokenizer = AutoTokenizer.from_pretrained(
-                    MODEL_ID,
-                    cache_dir="/cache/models"
-                )
-                return model, tokenizer
-            print(f"Attempt {attempt + 1} failed, retrying...")
-            time.sleep(2 ** attempt)  # Exponential backoff
-
-# Load model and tokenizer
-model, tokenizer = load_model()
+            if attempt == max_retries - 1:
+                raise
+            wait_time = 2 ** (attempt + 1)  # Exponential backoff (2s, 4s, 8s)
+            print(f"Attempt {attempt + 1} failed, retrying in {wait_time}s...")
+            time.sleep(wait_time)
+
+# Load model
+model, tokenizer = load_model_with_retry()

 def generate_text(prompt, max_length=200):
     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
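
In short, the commit swaps the tenacity-decorated, cache-repo-first loader for a hand-rolled retry loop around from_pretrained: the first attempt may download from the Hub, later attempts read only the local cache (local_files_only=(attempt > 0)), and failed attempts back off exponentially (2 s, then 4 s) before the final error is re-raised. Below is a minimal, self-contained sketch of that retry pattern with a stand-in loader, so the backoff can be observed without downloading any weights; the helper names (load_with_retry, fake_loader) are illustrative and not part of the commit.

import time

def load_with_retry(load_fn, max_retries=3):
    # Same pattern as load_model_with_retry: try, back off 2 s then 4 s, re-raise the last failure.
    for attempt in range(max_retries):
        try:
            # Only the first attempt is allowed to hit the network; later ones are cache-only.
            return load_fn(local_files_only=(attempt > 0))
        except Exception:
            if attempt == max_retries - 1:
                raise
            wait_time = 2 ** (attempt + 1)
            print(f"Attempt {attempt + 1} failed, retrying in {wait_time}s...")
            time.sleep(wait_time)

# Stand-in loader: fails twice, then succeeds, so the retry/backoff behaviour is visible.
state = {"calls": 0}
def fake_loader(local_files_only=False):
    state["calls"] += 1
    if state["calls"] < 3:
        raise RuntimeError("simulated download error")
    return f"loaded (local_files_only={local_files_only})"

print(load_with_retry(fake_loader))  # waits 2 s, then 4 s, then prints: loaded (local_files_only=True)

Dropping tenacity in favour of this explicit loop removes a dependency and keeps the backoff schedule visible in the code; the trade-off is that the retry behaviour now lives in application code rather than a decorator.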