Spaces:

lapa-llm
/

quality-estimation

Running

iamthewalrus67 committed on Oct 16, 2025

Commit

2dd438d

1 Parent(s): 6add81b

Cleanup

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,12 +2,8 @@ import os
 import subprocess
 import tempfile
-# subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 import threading
-# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
 import spaces
 import gradio as gr
 import torch
@@ -16,9 +12,6 @@ from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer, Tex
 from kernels import get_kernel
 from typing import Any, Optional, Dict
-#vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
-#torch._dynamo.config.disable = True
 # Login to HF to get access to the model weights
 HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
@@ -36,7 +29,6 @@ _model_cache: Dict[str, tuple[torch.nn.Module, AutoTokenizer]] = {}
 def load_model(model_id: str):
-    """Load model + tokenizer, auto-detect whether it's embedding or regression."""
     if model_id in _model_cache:
         return _model_cache[model_id]
@@ -44,7 +36,7 @@ def load_model(model_id: str):
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-    print(f"Detected embedding model: {model_id}")
     model.to(DEVICE).eval()
     _model_cache[model_id] = (model, tokenizer)

 import subprocess
 import tempfile
 import threading
 import spaces
 import gradio as gr
 import torch
 from kernels import get_kernel
 from typing import Any, Optional, Dict
 # Login to HF to get access to the model weights
 HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
 def load_model(model_id: str):
     if model_id in _model_cache:
         return _model_cache[model_id]
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+    print(f"Detected model: {model_id}")
     model.to(DEVICE).eval()
     _model_cache[model_id] = (model, tokenizer)