Update app.py
app.py CHANGED
```diff
@@ -18,22 +18,22 @@ This space demonstrates model [DeepSeek-Coder](https://huggingface.co/deepseek-a
 **You can also try our 33B model in [official homepage](https://coder.deepseek.com/chat).**
 """
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Check if CUDA is available
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo might be slow on CPU.</p>"
+    device = torch.device("cpu")
+else:
+    device = torch.device("cuda")
+    model_id = "deepseek-ai/deepseek-coder-33b-instruct"
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.use_default_system_prompt = False
+
+# Fallback to CPU for model loading if CUDA is unavailable
+if not torch.cuda.is_available():
+    model_id = "deepseek-ai/deepseek-coder-33b-instruct"
+    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 @spaces.GPU
 def generate(
```
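
The hunk above re-adds the model-loading block, which had been reduced to a lone `#` and blank lines, likely why the Space showed a runtime error. For reference, here is a minimal sketch of the same device-selection pattern, assuming only the standard `torch` and `transformers` APIs; it consolidates the two paths so the tokenizer setting matches on both, whereas in the diff the CPU fallback reloads the model but skips `tokenizer.use_default_system_prompt = False`:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "deepseek-ai/deepseek-coder-33b-instruct"

if torch.cuda.is_available():
    device = torch.device("cuda")
    # bfloat16 halves memory, and device_map="auto" spreads the 33B
    # weights across whatever GPUs are visible.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="auto"
    )
else:
    device = torch.device("cpu")
    model = AutoModelForCausalLM.from_pretrained(model_id)  # full precision on CPU

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.use_default_system_prompt = False  # the Space supplies its own system prompt
```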
```diff
@@ -48,11 +48,8 @@ def generate(
 ) -> Iterator[str]:
     global total_count
     total_count += 1
-    print(
-
-    if use_cuda:
-        os.system("nvidia-smi")
-
+    print(total_count)
+    os.system("nvidia-smi")
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
```
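
This hunk replaces a syntactically broken logging fragment (a dangling `print(` and an `if use_cuda:` test on an apparently undefined name) with two unconditional lines. Note they now shell out to `nvidia-smi` on every request, including on CPU hosts where that command is absent. A guarded variant, an assumption rather than the Space's code, might look like:

```python
import os
import torch

total_count = 0  # module-level request counter, as in the Space

def log_request() -> None:
    """Print the running request count and, on GPU hosts only, the GPU status."""
    global total_count
    total_count += 1
    print(total_count)
    if torch.cuda.is_available():
        os.system("nvidia-smi")  # GPU utilization goes to the Space logs
```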
```diff
@@ -61,16 +58,14 @@ def generate(
     conversation.append({"role": "user", "content": message})
 
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-
     input_ids = input_ids.to(device)
 
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        input_ids
+        {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=False,
```
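
Besides tidying blank lines, this hunk fixes the bare `input_ids` inside `dict(...)`, which was a syntax error, and shows the prompt path: the conversation is serialized through the tokenizer's chat template, then trimmed from the left so the newest turns survive. A sketch of that trimming step, reusing `tokenizer` and `device` from the loading sketch above, with an assumed value for `MAX_INPUT_TOKEN_LENGTH` and a hypothetical conversation:

```python
MAX_INPUT_TOKEN_LENGTH = 4096  # assumed; the real constant is defined elsewhere in app.py

conversation = [
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "Write an iterative binary search in Python."},
]
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")

# Keep only the most recent MAX_INPUT_TOKEN_LENGTH tokens so the
# prompt always fits the model's context window.
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
input_ids = input_ids.to(device)
```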
```diff
@@ -80,7 +75,6 @@ def generate(
         repetition_penalty=repetition_penalty,
         eos_token_id=32021
     )
-
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
```
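
Together with the streamer from the previous hunk, this is the standard `transformers` streaming pattern: `model.generate` blocks until generation finishes, so it runs on a worker thread while the caller drains the `TextIteratorStreamer` and yields partial text. A condensed, self-contained sketch of the pattern (the token budget is an assumed value; the Space takes it from a UI input):

```python
from threading import Thread

from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(
    tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
)
generate_kwargs = dict(
    {"input_ids": input_ids},  # prompt tensor merged into the kwargs dict
    streamer=streamer,
    max_new_tokens=512,        # assumed budget
    do_sample=False,
    eos_token_id=32021,        # the id the Space passes for DeepSeek-Coder's <|EOT|>
)
Thread(target=model.generate, kwargs=generate_kwargs).start()

outputs = []
for text in streamer:          # blocks until the worker emits new tokens
    outputs.append(text)
print("".join(outputs).replace("<|EOT|>", ""))
```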
```diff
@@ -89,7 +83,6 @@ def generate(
         outputs.append(text)
         yield "".join(outputs).replace("<|EOT|>", "")
 
-
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
```
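
The diff is truncated mid-list inside `additional_inputs`, which is where the Space declares the extra controls (system prompt, generation parameters) that Gradio passes to `generate` after the message and chat history. A minimal, hypothetical wiring under that assumption; the single `gr.Textbox` stands in for the truncated list:

```python
import gradio as gr

chat_interface = gr.ChatInterface(
    fn=generate,  # called as generate(message, history, system_prompt, ...)
    additional_inputs=[
        gr.Textbox(label="System prompt", lines=6),  # hypothetical stand-in control
    ],
)

chat_interface.queue().launch()
```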