iamthewalrus67 committed on
Commit
2dd438d
·
1 Parent(s): 6add81b
Files changed (1) hide show
  1. app.py +1 -9
app.py CHANGED
@@ -2,12 +2,8 @@ import os
2
  import subprocess
3
  import tempfile
4
 
5
- # subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
6
-
7
  import threading
8
 
9
- # subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
10
-
11
  import spaces
12
  import gradio as gr
13
  import torch
@@ -16,9 +12,6 @@ from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer, Tex
16
  from kernels import get_kernel
17
  from typing import Any, Optional, Dict
18
 
19
- #vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
20
-
21
- #torch._dynamo.config.disable = True
22
 
23
  # Login to HF to get access to the model weights
24
  HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
@@ -36,7 +29,6 @@ _model_cache: Dict[str, tuple[torch.nn.Module, AutoTokenizer]] = {}
36
 
37
 
38
  def load_model(model_id: str):
39
- """Load model + tokenizer, auto-detect whether it's embedding or regression."""
40
  if model_id in _model_cache:
41
  return _model_cache[model_id]
42
 
@@ -44,7 +36,7 @@ def load_model(model_id: str):
44
  tokenizer = AutoTokenizer.from_pretrained(model_id)
45
 
46
  model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)
47
- print(f"Detected embedding model: {model_id}")
48
 
49
  model.to(DEVICE).eval()
50
  _model_cache[model_id] = (model, tokenizer)
 
2
  import subprocess
3
  import tempfile
4
 
 
 
5
  import threading
6
 
 
 
7
  import spaces
8
  import gradio as gr
9
  import torch
 
12
  from kernels import get_kernel
13
  from typing import Any, Optional, Dict
14
 
 
 
 
15
 
16
  # Login to HF to get access to the model weights
17
  HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
 
29
 
30
 
31
  def load_model(model_id: str):
 
32
  if model_id in _model_cache:
33
  return _model_cache[model_id]
34
 
 
36
  tokenizer = AutoTokenizer.from_pretrained(model_id)
37
 
38
  model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)
39
+ print(f"Detected model: {model_id}")
40
 
41
  model.to(DEVICE).eval()
42
  _model_cache[model_id] = (model, tokenizer)