MSGEncrypted committed on
Commit
f409660
·
1 Parent(s): 11c0ad1
apps/gradio-space/src/gradio_space/app.py CHANGED
@@ -57,19 +57,42 @@ def chat(message: str, history: list, model_key: str) -> str:
57
  return get_backend(model_key).chat(messages)
58
 
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def warmup(model_key: str | None = None) -> str:
61
  key = model_key or _app_config.active_model
62
  model = get_model_config(key)
63
 
64
  if _load_state.get(key):
65
- return f"Model ready: {model.label}"
 
 
 
 
 
 
66
 
67
  if key in _load_errors:
68
  return _load_errors[key]
69
 
 
70
  return (
71
- f"Preset `{key}` selected ({model.backend}). "
72
- "Weights load on the first chat message — this can take a few minutes on CPU."
73
  )
74
 
75
 
 
57
  return get_backend(model_key).chat(messages)
58
 
59
 
60
def _runtime_device_hint(model_key: str) -> str:
    """Describe where the preset ``model_key`` is expected to run.

    The hint is derived from configuration and a CUDA probe only.

    * ``transformers`` backend: reports the first CUDA device when
      ``torch.cuda`` says one is available; a missing ``torch`` install is
      reported as CPU instead of raising.
    * any other backend: decided by the configured ``n_gpu_layers``. Every
      non-zero value counts as GPU offload; a negative value is labelled
      "all layers" because llama.cpp uses ``-1`` to mean "offload everything".

    Returns:
        A short human-readable label such as ``"CPU"`` or ``"GPU (<name>)"``.
    """
    model = get_model_config(model_key)

    if model.backend == "transformers":
        try:
            import torch
        except ImportError:
            # Without torch there is nothing to probe; fall back to CPU.
            return "CPU"
        if torch.cuda.is_available():
            return f"GPU ({torch.cuda.get_device_name(0)})"
        return "CPU"

    layers = model.n_gpu_layers
    if layers < 0:
        # Previously `> 0` was the only GPU case, so -1 was misreported as CPU.
        return "llama.cpp GPU offload (all layers)"
    if layers > 0:
        return f"llama.cpp GPU offload ({layers} layers)"
    return "CPU"
74
+
75
+
76
def warmup(model_key: str | None = None) -> str:
    """Return a human-readable status line for a model preset.

    Args:
        model_key: Preset to report on. Falls back to
            ``_app_config.active_model`` when omitted or empty.

    Returns:
        * ``"Model ready: <label> on <device>"`` when ``_load_state`` marks the
          preset as loaded,
        * the stored message from ``_load_errors`` when one exists for the
          preset,
        * otherwise a note naming the backend and the expected device.
    """
    key = model_key or _app_config.active_model
    model = get_model_config(key)

    if _load_state.get(key):
        # Read the attribute exactly once: hasattr() followed by a second
        # access would evaluate a property-backed ``device_label`` twice.
        device = getattr(get_backend(key), "device_label", None)
        if device is None:
            # Backend exposes no label; fall back to the config-based hint.
            device = _runtime_device_hint(key)
        return f"Model ready: {model.label} on {device}"

    if key in _load_errors:
        return _load_errors[key]

    device_hint = _runtime_device_hint(key)
    return (
        f"Preset `{key}` selected ({model.backend}, {device_hint}). "
        "Weights load on the first chat message."
    )
97
 
98
 
libs/inference/pyproject.toml CHANGED
@@ -13,6 +13,7 @@ dependencies = [
13
  "llama-cpp-python>=0.3.0",
14
  "pyyaml>=6.0.2",
15
  "torch>=2.5.0",
 
16
  "transformers>=5.7.0",
17
  ]
18
 
 
13
  "llama-cpp-python>=0.3.0",
14
  "pyyaml>=6.0.2",
15
  "torch>=2.5.0",
16
+ "torchvision>=0.20.0",
17
  "transformers>=5.7.0",
18
  ]
19
 
libs/inference/src/inference/config.py CHANGED
@@ -24,6 +24,7 @@ class ModelConfig:
24
  model_path: str | None = None
25
  model_id: str | None = None
26
  trust_remote_code: bool = False
 
27
  n_ctx: int = 4096
28
  n_gpu_layers: int = 0
29
  max_tokens: int = 512
@@ -37,6 +38,7 @@ class ModelConfig:
37
  self.model_path,
38
  self.model_id,
39
  self.trust_remote_code,
 
40
  self.n_ctx,
41
  self.n_gpu_layers,
42
  )
@@ -148,6 +150,7 @@ def _parse_model_entry(key: str, raw: dict[str, Any]) -> ModelConfig:
148
  model_path=raw.get("model_path"),
149
  model_id=raw.get("model_id"),
150
  trust_remote_code=bool(raw.get("trust_remote_code", False)),
 
151
  n_ctx=int(raw.get("n_ctx", 4096)),
152
  n_gpu_layers=int(raw.get("n_gpu_layers", 0)),
153
  max_tokens=int(raw.get("max_tokens", 512)),
 
24
  model_path: str | None = None
25
  model_id: str | None = None
26
  trust_remote_code: bool = False
27
+ multimodal: bool = False
28
  n_ctx: int = 4096
29
  n_gpu_layers: int = 0
30
  max_tokens: int = 512
 
38
  self.model_path,
39
  self.model_id,
40
  self.trust_remote_code,
41
+ self.multimodal,
42
  self.n_ctx,
43
  self.n_gpu_layers,
44
  )
 
150
  model_path=raw.get("model_path"),
151
  model_id=raw.get("model_id"),
152
  trust_remote_code=bool(raw.get("trust_remote_code", False)),
153
+ multimodal=bool(raw.get("multimodal", False)),
154
  n_ctx=int(raw.get("n_ctx", 4096)),
155
  n_gpu_layers=int(raw.get("n_gpu_layers", 0)),
156
  max_tokens=int(raw.get("max_tokens", 512)),
libs/inference/src/inference/transformers.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from inference.config import ModelConfig
2
 
3
 
@@ -6,6 +8,15 @@ class TransformersBackend:
6
  self._config = config
7
  self._model = None
8
  self._tokenizer = None
 
 
 
 
 
 
 
 
 
9
 
10
  def load(self) -> None:
11
  if self._model is not None:
@@ -18,44 +29,123 @@ class TransformersBackend:
18
 
19
  try:
20
  import torch
21
- from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 
 
 
22
  except ImportError as exc:
23
  raise ImportError(
24
  "transformers backend requires torch and transformers. "
25
  "Install with: uv sync --all-packages"
26
  ) from exc
27
 
28
- device = "cuda" if torch.cuda.is_available() else "cpu"
29
-
30
- self._tokenizer = AutoTokenizer.from_pretrained(
31
- self._config.model_id,
32
- trust_remote_code=self._config.trust_remote_code,
33
- )
34
- self._model = AutoModelForCausalLM.from_pretrained(
35
- self._config.model_id,
36
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
37
- device_map="auto" if device == "cuda" else None,
38
- trust_remote_code=self._config.trust_remote_code,
39
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  if device == "cpu":
41
  self._model.to(device)
42
 
43
- def generate(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  self,
45
- prompt: str,
46
  *,
47
  max_tokens: int | None = None,
48
  temperature: float | None = None,
49
  ) -> str:
50
  self.load()
51
  assert self._model is not None
52
- assert self._tokenizer is not None
53
-
54
- import torch
55
 
56
  max_new_tokens = max_tokens or self._config.max_tokens
57
  temp = self._config.temperature if temperature is None else temperature
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  inputs = self._tokenizer(prompt, return_tensors="pt").to(self._model.device)
60
  output = self._model.generate(
61
  **inputs,
@@ -66,6 +156,19 @@ class TransformersBackend:
66
  generated = output[0][inputs["input_ids"].shape[-1] :]
67
  return self._tokenizer.decode(generated, skip_special_tokens=True).strip()
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def chat(
70
  self,
71
  messages: list[dict[str, str]],
@@ -73,23 +176,9 @@ class TransformersBackend:
73
  max_tokens: int | None = None,
74
  temperature: float | None = None,
75
  ) -> str:
76
- self.load()
77
- assert self._model is not None
78
- assert self._tokenizer is not None
79
-
80
- if hasattr(self._tokenizer, "apply_chat_template"):
81
- prompt = self._tokenizer.apply_chat_template(
82
- messages,
83
- tokenize=False,
84
- add_generation_prompt=True,
85
- )
86
- else:
87
- parts = []
88
- for message in messages:
89
- role = message["role"]
90
- content = message["content"]
91
- parts.append(f"{role}: {content}")
92
- parts.append("assistant:")
93
- prompt = "\n".join(parts)
94
-
95
- return self.generate(prompt, max_tokens=max_tokens, temperature=temperature)
 
1
+ from __future__ import annotations
2
+
3
  from inference.config import ModelConfig
4
 
5
 
 
8
  self._config = config
9
  self._model = None
10
  self._tokenizer = None
11
+ self._processor = None
12
+ self._device_label: str | None = None
13
+
14
+ def _resolve_device(self):
15
+ import torch
16
+
17
+ if torch.cuda.is_available():
18
+ return "cuda", torch.float16, "auto"
19
+ return "cpu", torch.float32, None
20
 
21
  def load(self) -> None:
22
  if self._model is not None:
 
29
 
30
  try:
31
  import torch
32
+ from transformers import (
33
+ AutoModelForCausalLM,
34
+ AutoModelForImageTextToText,
35
+ AutoProcessor,
36
+ AutoTokenizer,
37
+ )
38
  except ImportError as exc:
39
  raise ImportError(
40
  "transformers backend requires torch and transformers. "
41
  "Install with: uv sync --all-packages"
42
  ) from exc
43
 
44
+ device, torch_dtype, device_map = self._resolve_device()
45
+ self._device_label = (
46
+ f"cuda ({torch.cuda.get_device_name(0)})"
47
+ if device == "cuda"
48
+ else "cpu"
 
 
 
 
 
 
49
  )
50
+
51
+ common_kwargs = {
52
+ "trust_remote_code": self._config.trust_remote_code,
53
+ }
54
+ model_kwargs = {
55
+ **common_kwargs,
56
+ "torch_dtype": torch_dtype,
57
+ "device_map": device_map,
58
+ }
59
+
60
+ if self._config.multimodal:
61
+ self._processor = AutoProcessor.from_pretrained(
62
+ self._config.model_id,
63
+ **common_kwargs,
64
+ )
65
+ self._model = AutoModelForImageTextToText.from_pretrained(
66
+ self._config.model_id,
67
+ **model_kwargs,
68
+ )
69
+ else:
70
+ self._tokenizer = AutoTokenizer.from_pretrained(
71
+ self._config.model_id,
72
+ **common_kwargs,
73
+ )
74
+ self._model = AutoModelForCausalLM.from_pretrained(
75
+ self._config.model_id,
76
+ **model_kwargs,
77
+ )
78
+
79
  if device == "cpu":
80
  self._model.to(device)
81
 
82
+ @property
83
+ def device_label(self) -> str:
84
+ self.load()
85
+ return self._device_label or "unknown"
86
+
87
+ def _normalize_messages(
88
+ self, messages: list[dict[str, str]]
89
+ ) -> list[dict[str, object]]:
90
+ if not self._config.multimodal:
91
+ return messages
92
+
93
+ normalized: list[dict[str, object]] = []
94
+ for message in messages:
95
+ content = message["content"]
96
+ if isinstance(content, str):
97
+ content = [{"type": "text", "text": content}]
98
+ normalized.append({"role": message["role"], "content": content})
99
+ return normalized
100
+
101
+ def _generate_from_messages(
102
  self,
103
+ messages: list[dict[str, object]],
104
  *,
105
  max_tokens: int | None = None,
106
  temperature: float | None = None,
107
  ) -> str:
108
  self.load()
109
  assert self._model is not None
 
 
 
110
 
111
  max_new_tokens = max_tokens or self._config.max_tokens
112
  temp = self._config.temperature if temperature is None else temperature
113
 
114
+ if self._config.multimodal:
115
+ assert self._processor is not None
116
+ inputs = self._processor.apply_chat_template(
117
+ messages,
118
+ tokenize=True,
119
+ add_generation_prompt=True,
120
+ return_dict=True,
121
+ return_tensors="pt",
122
+ )
123
+ inputs = inputs.to(self._model.device)
124
+ output = self._model.generate(
125
+ **inputs,
126
+ max_new_tokens=max_new_tokens,
127
+ temperature=temp,
128
+ do_sample=temp > 0,
129
+ )
130
+ generated = output[0][inputs["input_ids"].shape[-1] :]
131
+ return self._processor.decode(generated, skip_special_tokens=True).strip()
132
+
133
+ assert self._tokenizer is not None
134
+ if hasattr(self._tokenizer, "apply_chat_template"):
135
+ prompt = self._tokenizer.apply_chat_template(
136
+ messages,
137
+ tokenize=False,
138
+ add_generation_prompt=True,
139
+ )
140
+ else:
141
+ parts = []
142
+ for message in messages:
143
+ role = str(message["role"])
144
+ content = str(message["content"])
145
+ parts.append(f"{role}: {content}")
146
+ parts.append("assistant:")
147
+ prompt = "\n".join(parts)
148
+
149
  inputs = self._tokenizer(prompt, return_tensors="pt").to(self._model.device)
150
  output = self._model.generate(
151
  **inputs,
 
156
  generated = output[0][inputs["input_ids"].shape[-1] :]
157
  return self._tokenizer.decode(generated, skip_special_tokens=True).strip()
158
 
159
+ def generate(
160
+ self,
161
+ prompt: str,
162
+ *,
163
+ max_tokens: int | None = None,
164
+ temperature: float | None = None,
165
+ ) -> str:
166
+ return self.chat(
167
+ [{"role": "user", "content": prompt}],
168
+ max_tokens=max_tokens,
169
+ temperature=temperature,
170
+ )
171
+
172
  def chat(
173
  self,
174
  messages: list[dict[str, str]],
 
176
  max_tokens: int | None = None,
177
  temperature: float | None = None,
178
  ) -> str:
179
+ normalized = self._normalize_messages(messages)
180
+ return self._generate_from_messages(
181
+ normalized,
182
+ max_tokens=max_tokens,
183
+ temperature=temperature,
184
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models.yaml CHANGED
@@ -13,6 +13,7 @@ models:
13
  backend: transformers
14
  model_id: openbmb/MiniCPM-V-4.6
15
  trust_remote_code: true
 
16
 
17
  qwen3b-gguf:
18
  label: Qwen 2.5 3B Instruct (GGUF)
 
13
  backend: transformers
14
  model_id: openbmb/MiniCPM-V-4.6
15
  trust_remote_code: true
16
+ multimodal: true
17
 
18
  qwen3b-gguf:
19
  label: Qwen 2.5 3B Instruct (GGUF)