Patryk Studzinski committed on
Commit
c14ac43
·
1 Parent(s): 329abd1

add GBNF grammar for car advertisement gap filling; update LlamaCppModel to support loading grammar from file

Browse files
app/logic/answers.gbnf ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GBNF Grammar for Car Advertisement Gap Filling
2
+ # Forces model to output valid JSON with gap fills
3
+ # Supports one or more gaps (no upper bound enforced) with Polish characters
4
+
5
+ root ::= "{" ws "\"gaps\":" ws "[" ws gap-list ws "]" ws "}"
6
+
7
+ gap-list ::= gap-item (ws "," ws gap-item)*
8
+
9
+ gap-item ::= "{" ws "\"index\":" ws number ws "," ws "\"choice\":" ws "\"" phrase "\"" ws "}"
10
+
11
+ # Allow words with Polish characters, numbers, spaces
12
+ phrase ::= word (space word){0,4}
13
+ word ::= [a-zA-ZżźćńółęąśŻŹĆŃÓŁĘĄŚ0-9.,%-]+
14
+ space ::= " "
15
+ number ::= [0-9]+
16
+ ws ::= [ \t\n]*
app/main.py CHANGED
@@ -430,13 +430,18 @@ async def process_infill_item(
430
 
431
  grammar_str = None
432
  if use_grammar and hasattr(llm, 'llm') and llm.llm is not None:
433
- # Only use grammar for GGUF models (llama.cpp)
434
- try:
435
- from app.logic.grammar_utils import get_infill_grammar
436
- grammar_str = get_infill_grammar(len(gaps))
437
- print(f"DEBUG: Using GBNF grammar for {len(gaps)} gaps", flush=True)
438
- except ImportError:
439
- pass
 
 
 
 
 
440
 
441
  raw_output = await llm.generate(
442
  chat_messages=chat_messages,
 
430
 
431
  grammar_str = None
432
  if use_grammar and hasattr(llm, 'llm') and llm.llm is not None:
433
+ # Use model's default grammar (loaded from answers.gbnf) if available
434
+ if hasattr(llm, 'default_grammar') and llm.default_grammar:
435
+ grammar_str = llm.default_grammar
436
+ print(f"DEBUG: Using model's default GBNF grammar", flush=True)
437
+ else:
438
+ # Fallback to dynamic grammar generation
439
+ try:
440
+ from app.logic.grammar_utils import get_infill_grammar
441
+ grammar_str = get_infill_grammar(len(gaps))
442
+ print(f"DEBUG: Using dynamic GBNF grammar for {len(gaps)} gaps", flush=True)
443
+ except ImportError:
444
+ pass
445
 
446
  raw_output = await llm.generate(
447
  chat_messages=chat_messages,
app/models/huggingface_inference_api.py DELETED
@@ -1,93 +0,0 @@
1
- """
2
- HuggingFace Inference API client for remote model access.
3
- """
4
-
5
- import os
6
- from typing import List, Dict, Any, Optional
7
- from huggingface_hub import InferenceClient
8
-
9
- from app.models.base_llm import BaseLLM
10
-
11
-
12
- class HuggingFaceInferenceAPI(BaseLLM):
13
- """
14
- Remote model access via HuggingFace Inference API.
15
- Best for larger models (7B+) that don't fit in local RAM.
16
- """
17
-
18
- def __init__(self, name: str, model_id: str, token: str = None):
19
- super().__init__(name, model_id)
20
- self.token = token or os.getenv("HF_TOKEN")
21
- self.client: Optional[InferenceClient] = None
22
-
23
- async def initialize(self) -> None:
24
- """Initialize the Inference API client."""
25
- if self._initialized:
26
- return
27
-
28
- try:
29
- print(f"[{self.name}] Initializing Inference API for: {self.model_id}")
30
-
31
- self.client = InferenceClient(
32
- model=self.model_id,
33
- token=self.token
34
- )
35
-
36
- self._initialized = True
37
- print(f"[{self.name}] Inference API ready")
38
-
39
- except Exception as e:
40
- print(f"[{self.name}] Failed to initialize: {e}")
41
- raise
42
-
43
- async def generate(
44
- self,
45
- prompt: str = None,
46
- chat_messages: List[Dict[str, str]] = None,
47
- max_new_tokens: int = 150,
48
- temperature: float = 0.7,
49
- top_p: float = 0.9,
50
- **kwargs
51
- ) -> str:
52
- """Generate text using HuggingFace Inference API."""
53
-
54
- if not self._initialized or not self.client:
55
- raise RuntimeError(f"[{self.name}] Client not initialized")
56
-
57
- try:
58
- # Use chat completion if chat_messages provided
59
- if chat_messages:
60
- response = self.client.chat_completion(
61
- messages=chat_messages,
62
- max_tokens=max_new_tokens,
63
- temperature=temperature,
64
- top_p=top_p,
65
- )
66
- return response.choices[0].message.content.strip()
67
-
68
- # Otherwise use text generation
69
- elif prompt:
70
- response = self.client.text_generation(
71
- prompt=prompt,
72
- max_new_tokens=max_new_tokens,
73
- temperature=temperature,
74
- top_p=top_p,
75
- do_sample=True,
76
- )
77
- return response.strip()
78
-
79
- else:
80
- raise ValueError("Either prompt or chat_messages required")
81
-
82
- except Exception as e:
83
- print(f"[{self.name}] Generation error: {e}")
84
- raise
85
-
86
- def get_info(self) -> Dict[str, Any]:
87
- """Return model info."""
88
- return {
89
- "name": self.name,
90
- "model_id": self.model_id,
91
- "type": "inference_api",
92
- "initialized": self._initialized,
93
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/models/llama_cpp_model.py CHANGED
@@ -23,10 +23,12 @@ class LlamaCppModel(BaseLLM):
23
  Provides significant speedups on CPU compared to Transformers.
24
  """
25
 
26
- def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096):
27
  super().__init__(name, model_id)
28
  self.model_path = model_path
29
  self.n_ctx = n_ctx
 
 
30
  self.llm = None
31
  self._response_cache = {}
32
  self._max_cache_size = 100
@@ -62,6 +64,16 @@ class LlamaCppModel(BaseLLM):
62
  self._initialized = True
63
  print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
64
 
 
 
 
 
 
 
 
 
 
 
65
  except Exception as e:
66
  error_msg = str(e) if str(e) else repr(e)
67
  print(f"[{self.name}] Failed to load GGUF model: {error_msg}")
@@ -152,7 +164,8 @@ class LlamaCppModel(BaseLLM):
152
  "backend": "llama.cpp",
153
  "context_length": self.n_ctx,
154
  "loaded": self._initialized,
155
- "model_path": self.model_path
 
156
  }
157
 
158
  async def cleanup(self) -> None:
 
23
  Provides significant speedups on CPU compared to Transformers.
24
  """
25
 
26
+ def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None):
27
  super().__init__(name, model_id)
28
  self.model_path = model_path
29
  self.n_ctx = n_ctx
30
+ self.grammar_path = grammar_path
31
+ self.default_grammar = None # Will be loaded from file if provided
32
  self.llm = None
33
  self._response_cache = {}
34
  self._max_cache_size = 100
 
64
  self._initialized = True
65
  print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
66
 
67
+ # Load grammar file if provided
68
+ if self.grammar_path:
69
+ grammar_full_path = os.path.join(os.path.dirname(__file__), "..", "logic", self.grammar_path)
70
+ if os.path.exists(grammar_full_path):
71
+ with open(grammar_full_path, 'r', encoding='utf-8') as f:
72
+ self.default_grammar = f.read()
73
+ print(f"[{self.name}] Loaded grammar from: {grammar_full_path}")
74
+ else:
75
+ print(f"[{self.name}] Grammar file not found: {grammar_full_path}")
76
+
77
  except Exception as e:
78
  error_msg = str(e) if str(e) else repr(e)
79
  print(f"[{self.name}] Failed to load GGUF model: {error_msg}")
 
164
  "backend": "llama.cpp",
165
  "context_length": self.n_ctx,
166
  "loaded": self._initialized,
167
+ "model_path": self.model_path,
168
+ "has_grammar": self.default_grammar is not None
169
  }
170
 
171
  async def cleanup(self) -> None:
app/models/registry.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
  Model Registry - Central configuration and factory for all LLM models.
3
- Supports lazy loading and on/off mechanism for memory management.
4
  """
5
 
6
  import os
@@ -12,22 +11,31 @@ from app.models.huggingface_local import HuggingFaceLocal
12
  from app.models.huggingface_inference_api import HuggingFaceInferenceAPI
13
  from app.models.llama_cpp_model import LlamaCppModel
14
 
15
-
16
- # Model configuration - 3 local + 1 API for Polish language comparison
17
  MODEL_CONFIG = {
18
- "bielik-1.5b": {
19
- "id": "speakleash/Bielik-1.5B-v3.0-Instruct",
20
- "local_path": "bielik-1.5b",
21
- "type": "local",
22
- "polish_support": "excellent",
23
- "size": "1.5B",
24
- },
25
  "bielik-1.5b-gguf": {
26
  "id": "speakleash/Bielik-1.5B-v3.0-Instruct-GGUF",
27
  "local_path": "bielik-1.5b-gguf",
28
  "filename": "Bielik-1.5B-v3.0-Instruct.Q8_0.gguf",
29
  "type": "gguf",
30
  "size": "1.7 GB",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  },
32
  "qwen2.5-3b": {
33
  "id": "Qwen/Qwen2.5-3B-Instruct",
@@ -42,34 +50,18 @@ MODEL_CONFIG = {
42
  "type": "local",
43
  "polish_support": "medium",
44
  "size": "2B",
45
- },
46
- "pllum-12b": {
47
- "id": "CYFRAGOVPL/PLLuM-12B-instruct",
48
- "type": "inference_api",
49
- "polish_support": "excellent",
50
- "size": "12B",
51
- },
52
  }
53
 
54
- # Base path for pre-downloaded models in container
55
  LOCAL_MODEL_BASE = os.getenv("MODEL_DIR", "/app/pretrain_model")
56
 
57
-
58
  class ModelRegistry:
59
- """
60
- Central registry for managing all LLM models.
61
- Supports lazy loading (load on first request) and unloading for memory management.
62
- Only one local model is loaded at a time to conserve memory.
63
- """
64
-
65
  def __init__(self):
66
  self._models: Dict[str, BaseLLM] = {}
67
  self._config = MODEL_CONFIG.copy()
68
  self._active_local_model: Optional[str] = None
69
 
70
  def _create_model(self, name: str) -> BaseLLM:
71
- """Factory method to create model instance."""
72
-
73
  if name not in self._config:
74
  raise ValueError(f"Unknown model: {name}")
75
 
@@ -77,35 +69,21 @@ class ModelRegistry:
77
  model_type = config["type"]
78
  model_id = config["id"]
79
 
80
- # For local models, check if pre-downloaded version exists
81
- if model_type == "local" and "local_path" in config:
82
  local_path = os.path.join(LOCAL_MODEL_BASE, config["local_path"])
83
- if os.path.exists(local_path):
84
- print(f"Using pre-downloaded model at: {local_path}")
85
- model_id = local_path
86
- else:
87
- print(f"Pre-downloaded model not found at {local_path}, will download from HuggingFace")
88
 
89
- if model_type == "local":
90
- return HuggingFaceLocal(
91
- name=name,
92
- model_id=model_id,
93
- device="cpu"
94
- )
95
  elif model_type == "inference_api":
96
- return HuggingFaceInferenceAPI(
97
- name=name,
98
- model_id=model_id
99
- )
100
  elif model_type == "gguf":
101
- # For GGUF, we expect the file to be present locally or we need to download it
102
  local_path_dir = os.path.join(LOCAL_MODEL_BASE, config.get("local_path", ""))
103
  filename = config.get("filename")
104
  full_path = os.path.join(local_path_dir, filename)
105
 
106
- # Auto-download if missing (simplified logic using huggingface_hub)
107
  if not os.path.exists(full_path):
108
- print(f"GGUF file not found at {full_path}, downloading...")
109
  from huggingface_hub import hf_hub_download
110
  os.makedirs(local_path_dir, exist_ok=True)
111
  full_path = hf_hub_download(
@@ -115,144 +93,54 @@ class ModelRegistry:
115
  local_dir_use_symlinks=False
116
  )
117
 
 
 
 
118
  return LlamaCppModel(
119
  name=name,
120
  model_id=model_id,
121
- model_path=full_path
 
122
  )
123
- else:
124
- raise ValueError(f"Unknown model type: {model_type}")
125
-
126
- async def _unload_model(self, name: str) -> None:
127
- """Unload a model from memory."""
128
- if name in self._models:
129
- model = self._models[name]
130
- # Call cleanup if available
131
- if hasattr(model, 'cleanup'):
132
- await model.cleanup()
133
- del self._models[name]
134
- gc.collect() # Force garbage collection
135
- print(f"Model '{name}' unloaded from memory.")
136
-
137
- async def _unload_all_local_models(self) -> None:
138
- """Unload all local models to free memory."""
139
- local_models = [
140
- name for name, config in self._config.items()
141
- if config["type"] == "local" and name in self._models
142
- ]
143
- for name in local_models:
144
- await self._unload_model(name)
145
- self._active_local_model = None
146
-
147
  async def get_model(self, name: str) -> BaseLLM:
148
- """
149
- Get a model (lazy loading).
150
- For local models: unloads any previously loaded local model first.
151
- For API models: always available without affecting local models.
152
- """
153
- print(f"DEBUG: get_model called for {name}", flush=True)
154
- if name not in self._config:
155
- raise ValueError(f"Unknown model: {name}")
156
-
157
  config = self._config[name]
158
 
159
- # If it's a local model, ensure only one is loaded at a time
160
- if config["type"] == "local":
161
- # Unload current local model if different
162
  if self._active_local_model and self._active_local_model != name:
163
- print(f"Switching from '{self._active_local_model}' to '{name}'...")
164
  await self._unload_model(self._active_local_model)
165
 
166
- # Load the requested model if not already loaded
167
  if name not in self._models:
168
- print(f"Loading model '{name}'...")
169
  model = self._create_model(name)
170
  await model.initialize()
171
  self._models[name] = model
172
  self._active_local_model = name
173
- print(f"Model '{name}' loaded successfully.")
174
 
175
- # For API models, just create/return (no memory concern)
176
  elif config["type"] == "inference_api":
177
  if name not in self._models:
178
- print(f"Initializing API model '{name}'...")
179
- model = self._create_model(name)
180
- await model.initialize()
181
- self._models[name] = model
182
-
183
- # For GGUF models, treat similar to local (single slot?) or API?
184
- # Typically GGUF uses RAM, so we should treat it like 'local' and manage memory.
185
- elif config["type"] == "gguf":
186
- # Unload current local model if different (GGUF also takes RAM)
187
- if self._active_local_model and self._active_local_model != name:
188
- print(f"Switching from '{self._active_local_model}' to '{name}'...")
189
- await self._unload_model(self._active_local_model)
190
-
191
- if name not in self._models:
192
- print(f"Loading GGUF model '{name}'...")
193
  model = self._create_model(name)
194
  await model.initialize()
195
  self._models[name] = model
196
- self._active_local_model = name # Track as active local model
197
 
198
  return self._models[name]
199
-
200
- async def load_model(self, name: str) -> Dict[str, Any]:
201
- """
202
- Explicitly load a model (unloads other local models first).
203
- Returns model info.
204
- """
205
- await self.get_model(name)
206
- return self.get_model_info(name)
207
-
208
- async def unload_model(self, name: str) -> Dict[str, str]:
209
- """
210
- Explicitly unload a model from memory.
211
- """
212
- if name not in self._config:
213
- raise ValueError(f"Unknown model: {name}")
214
-
215
- if name not in self._models:
216
- return {"status": "not_loaded", "model": name}
217
-
218
- await self._unload_model(name)
219
- if self._active_local_model == name:
220
- self._active_local_model = None
221
-
222
- return {"status": "unloaded", "model": name}
223
-
224
  def get_model_info(self, name: str) -> Dict[str, Any]:
225
- """Get info about a specific model."""
226
- if name not in self._config:
227
- raise ValueError(f"Unknown model: {name}")
228
-
229
  config = self._config[name]
230
  return {
231
  "name": name,
232
  "model_id": config["id"],
233
  "type": config["type"],
234
- "polish_support": config["polish_support"],
235
- "size": config["size"],
236
  "loaded": name in self._models,
237
- "active": name == self._active_local_model if config["type"] == "local" else None,
238
  }
239
-
240
- def list_models(self) -> List[Dict[str, Any]]:
241
- """List all available models with their info."""
242
- return [self.get_model_info(name) for name in self._config.keys()]
243
-
244
- def get_available_model_names(self) -> List[str]:
245
- """Get list of available model names."""
246
- return list(self._config.keys())
247
-
248
- def get_active_model(self) -> Optional[str]:
249
- """Get the currently active (loaded) local model name."""
250
- return self._active_local_model
251
-
252
- def get_loaded_models(self) -> List[str]:
253
- """Get list of currently loaded model names."""
254
- return list(self._models.keys())
255
-
256
 
257
- # Global registry instance
258
- registry = ModelRegistry()
 
1
  """
2
  Model Registry - Central configuration and factory for all LLM models.
 
3
  """
4
 
5
  import os
 
11
  from app.models.huggingface_inference_api import HuggingFaceInferenceAPI
12
  from app.models.llama_cpp_model import LlamaCppModel
13
 
14
+ # Model configuration
 
15
  MODEL_CONFIG = {
 
 
 
 
 
 
 
16
  "bielik-1.5b-gguf": {
17
  "id": "speakleash/Bielik-1.5B-v3.0-Instruct-GGUF",
18
  "local_path": "bielik-1.5b-gguf",
19
  "filename": "Bielik-1.5B-v3.0-Instruct.Q8_0.gguf",
20
  "type": "gguf",
21
  "size": "1.7 GB",
22
+ "polish_support": "excellent",
23
+ "grammar_file": "answers.gbnf"
24
+ },
25
+ "bielik-11b-gguf": {
26
+ "id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
27
+ "local_path": "bielik-11b-gguf",
28
+ "filename": "Bielik-11B-v2.3-Instruct.Q4_K_M.gguf",
29
+ "type": "gguf",
30
+ "size": "7.2 GB",
31
+ "polish_support": "excellent",
32
+ "grammar_file": "answers.gbnf"
33
+ },
34
+ "llama-3.1-8b": {
35
+ "id": "meta-llama/Llama-3.1-8B-Instruct",
36
+ "type": "inference_api",
37
+ "polish_support": "excellent",
38
+ "size": "8B",
39
  },
40
  "qwen2.5-3b": {
41
  "id": "Qwen/Qwen2.5-3B-Instruct",
 
50
  "type": "local",
51
  "polish_support": "medium",
52
  "size": "2B",
53
+ }
 
 
 
 
 
 
54
  }
55
 
 
56
  LOCAL_MODEL_BASE = os.getenv("MODEL_DIR", "/app/pretrain_model")
57
 
 
58
  class ModelRegistry:
 
 
 
 
 
 
59
  def __init__(self):
60
  self._models: Dict[str, BaseLLM] = {}
61
  self._config = MODEL_CONFIG.copy()
62
  self._active_local_model: Optional[str] = None
63
 
64
  def _create_model(self, name: str) -> BaseLLM:
 
 
65
  if name not in self._config:
66
  raise ValueError(f"Unknown model: {name}")
67
 
 
69
  model_type = config["type"]
70
  model_id = config["id"]
71
 
72
+ if model_type == "local":
 
73
  local_path = os.path.join(LOCAL_MODEL_BASE, config["local_path"])
74
+ model_id = local_path if os.path.exists(local_path) else model_id
75
+ return HuggingFaceLocal(name=name, model_id=model_id, device="cpu")
 
 
 
76
 
 
 
 
 
 
 
77
  elif model_type == "inference_api":
78
+ return HuggingFaceInferenceAPI(name=name, model_id=model_id)
79
+
 
 
80
  elif model_type == "gguf":
 
81
  local_path_dir = os.path.join(LOCAL_MODEL_BASE, config.get("local_path", ""))
82
  filename = config.get("filename")
83
  full_path = os.path.join(local_path_dir, filename)
84
 
85
+ # Download the GGUF file if it is missing
86
  if not os.path.exists(full_path):
 
87
  from huggingface_hub import hf_hub_download
88
  os.makedirs(local_path_dir, exist_ok=True)
89
  full_path = hf_hub_download(
 
93
  local_dir_use_symlinks=False
94
  )
95
 
96
+ # Pass the grammar file to the model
97
+ grammar_path = config.get("grammar_file")
98
+
99
  return LlamaCppModel(
100
  name=name,
101
  model_id=model_id,
102
+ model_path=full_path,
103
+ grammar_path=grammar_path # Upewnij się, że klasa LlamaCppModel to obsługuje
104
  )
105
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  async def get_model(self, name: str) -> BaseLLM:
 
 
 
 
 
 
 
 
 
107
  config = self._config[name]
108
 
109
+ # Manage RAM usage for local and GGUF models (one loaded at a time)
110
+ if config["type"] in ["local", "gguf"]:
 
111
  if self._active_local_model and self._active_local_model != name:
 
112
  await self._unload_model(self._active_local_model)
113
 
 
114
  if name not in self._models:
 
115
  model = self._create_model(name)
116
  await model.initialize()
117
  self._models[name] = model
118
  self._active_local_model = name
 
119
 
 
120
  elif config["type"] == "inference_api":
121
  if name not in self._models:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  model = self._create_model(name)
123
  await model.initialize()
124
  self._models[name] = model
 
125
 
126
  return self._models[name]
127
+
128
+ async def _unload_model(self, name: str) -> None:
129
+ if name in self._models:
130
+ model = self._models[name]
131
+ if hasattr(model, 'cleanup'): await model.cleanup()
132
+ del self._models[name]
133
+ gc.collect()
134
+ print(f"Model '{name}' unloaded.")
135
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def get_model_info(self, name: str) -> Dict[str, Any]:
 
 
 
 
137
  config = self._config[name]
138
  return {
139
  "name": name,
140
  "model_id": config["id"],
141
  "type": config["type"],
 
 
142
  "loaded": name in self._models,
143
+ "active": name == self._active_local_model
144
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
+ registry = ModelRegistry()