Patryk Studzinski committed on
Commit
c14ac43
·
1 Parent(s): 329abd1

add GBNF grammar for car advertisement gap filling; update LlamaCppModel to support loading grammar from file

Browse files
app/logic/answers.gbnf ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GBNF Grammar for Car Advertisement Gap Filling
2
+ # Forces model to output valid JSON with gap fills
3
+ # Supports one or more gaps (no upper bound enforced) with Polish characters
4
+
5
+ root ::= "{" ws "\"gaps\":" ws "[" ws gap-list ws "]" ws "}"
6
+
7
+ gap-list ::= gap-item (ws "," ws gap-item)*
8
+
9
+ gap-item ::= "{" ws "\"index\":" ws number ws "," ws "\"choice\":" ws "\"" phrase "\"" ws "}"
10
+
11
+ # Allow words with Polish characters, numbers, spaces
12
+ phrase ::= word (space word){0,4}
13
+ word ::= [a-zA-ZżźćńółęąśŻŹĆŃÓŁĘĄŚ0-9.,%-]+
14
+ space ::= " "
15
+ number ::= [0-9]+
16
+ ws ::= [ \t\n]*
app/main.py CHANGED
@@ -430,13 +430,18 @@ async def process_infill_item(
430
 
431
  grammar_str = None
432
  if use_grammar and hasattr(llm, 'llm') and llm.llm is not None:
433
- # Only use grammar for GGUF models (llama.cpp)
434
- try:
435
- from app.logic.grammar_utils import get_infill_grammar
436
- grammar_str = get_infill_grammar(len(gaps))
437
- print(f"DEBUG: Using GBNF grammar for {len(gaps)} gaps", flush=True)
438
- except ImportError:
439
- pass
 
 
 
 
 
440
 
441
  raw_output = await llm.generate(
442
  chat_messages=chat_messages,
 
430
 
431
  grammar_str = None
432
  if use_grammar and hasattr(llm, 'llm') and llm.llm is not None:
433
+ # Use model's default grammar (loaded from answers.gbnf) if available
434
+ if hasattr(llm, 'default_grammar') and llm.default_grammar:
435
+ grammar_str = llm.default_grammar
436
+ print(f"DEBUG: Using model's default GBNF grammar", flush=True)
437
+ else:
438
+ # Fallback to dynamic grammar generation
439
+ try:
440
+ from app.logic.grammar_utils import get_infill_grammar
441
+ grammar_str = get_infill_grammar(len(gaps))
442
+ print(f"DEBUG: Using dynamic GBNF grammar for {len(gaps)} gaps", flush=True)
443
+ except ImportError:
444
+ pass
445
 
446
  raw_output = await llm.generate(
447
  chat_messages=chat_messages,
app/models/huggingface_inference_api.py DELETED
@@ -1,93 +0,0 @@
1
- """
2
- HuggingFace Inference API client for remote model access.
3
- """
4
-
5
- import os
6
- from typing import List, Dict, Any, Optional
7
- from huggingface_hub import InferenceClient
8
-
9
- from app.models.base_llm import BaseLLM
10
-
11
-
12
- class HuggingFaceInferenceAPI(BaseLLM):
13
- """
14
- Remote model access via HuggingFace Inference API.
15
- Best for larger models (7B+) that don't fit in local RAM.
16
- """
17
-
18
- def __init__(self, name: str, model_id: str, token: str = None):
19
- super().__init__(name, model_id)
20
- self.token = token or os.getenv("HF_TOKEN")
21
- self.client: Optional[InferenceClient] = None
22
-
23
- async def initialize(self) -> None:
24
- """Initialize the Inference API client."""
25
- if self._initialized:
26
- return
27
-
28
- try:
29
- print(f"[{self.name}] Initializing Inference API for: {self.model_id}")
30
-
31
- self.client = InferenceClient(
32
- model=self.model_id,
33
- token=self.token
34
- )
35
-
36
- self._initialized = True
37
- print(f"[{self.name}] Inference API ready")
38
-
39
- except Exception as e:
40
- print(f"[{self.name}] Failed to initialize: {e}")
41
- raise
42
-
43
- async def generate(
44
- self,
45
- prompt: str = None,
46
- chat_messages: List[Dict[str, str]] = None,
47
- max_new_tokens: int = 150,
48
- temperature: float = 0.7,
49
- top_p: float = 0.9,
50
- **kwargs
51
- ) -> str:
52
- """Generate text using HuggingFace Inference API."""
53
-
54
- if not self._initialized or not self.client:
55
- raise RuntimeError(f"[{self.name}] Client not initialized")
56
-
57
- try:
58
- # Use chat completion if chat_messages provided
59
- if chat_messages:
60
- response = self.client.chat_completion(
61
- messages=chat_messages,
62
- max_tokens=max_new_tokens,
63
- temperature=temperature,
64
- top_p=top_p,
65
- )
66
- return response.choices[0].message.content.strip()
67
-
68
- # Otherwise use text generation
69
- elif prompt:
70
- response = self.client.text_generation(
71
- prompt=prompt,
72
- max_new_tokens=max_new_tokens,
73
- temperature=temperature,
74
- top_p=top_p,
75
- do_sample=True,
76
- )
77
- return response.strip()
78
-
79
- else:
80
- raise ValueError("Either prompt or chat_messages required")
81
-
82
- except Exception as e:
83
- print(f"[{self.name}] Generation error: {e}")
84
- raise
85
-
86
- def get_info(self) -> Dict[str, Any]:
87
- """Return model info."""
88
- return {
89
- "name": self.name,
90
- "model_id": self.model_id,
91
- "type": "inference_api",
92
- "initialized": self._initialized,
93
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/models/llama_cpp_model.py CHANGED
@@ -23,10 +23,12 @@ class LlamaCppModel(BaseLLM):
23
  Provides significant speedups on CPU compared to Transformers.
24
  """
25
 
26
- def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096):
27
  super().__init__(name, model_id)
28
  self.model_path = model_path
29
  self.n_ctx = n_ctx
 
 
30
  self.llm = None
31
  self._response_cache = {}
32
  self._max_cache_size = 100
@@ -62,6 +64,16 @@ class LlamaCppModel(BaseLLM):
62
  self._initialized = True
63
  print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
64
 
 
 
 
 
 
 
 
 
 
 
65
  except Exception as e:
66
  error_msg = str(e) if str(e) else repr(e)
67
  print(f"[{self.name}] Failed to load GGUF model: {error_msg}")
@@ -152,7 +164,8 @@ class LlamaCppModel(BaseLLM):
152
  "backend": "llama.cpp",
153
  "context_length": self.n_ctx,
154
  "loaded": self._initialized,
155
- "model_path": self.model_path
 
156
  }
157
 
158
  async def cleanup(self) -> None:
 
23
  Provides significant speedups on CPU compared to Transformers.
24
  """
25
 
26
+ def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 4096, grammar_path: str = None):
27
  super().__init__(name, model_id)
28
  self.model_path = model_path
29
  self.n_ctx = n_ctx
30
+ self.grammar_path = grammar_path
31
+ self.default_grammar = None # Will be loaded from file if provided
32
  self.llm = None
33
  self._response_cache = {}
34
  self._max_cache_size = 100
 
64
  self._initialized = True
65
  print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
66
 
67
+ # Load grammar file if provided
68
+ if self.grammar_path:
69
+ grammar_full_path = os.path.join(os.path.dirname(__file__), "..", "logic", self.grammar_path)
70
+ if os.path.exists(grammar_full_path):
71
+ with open(grammar_full_path, 'r', encoding='utf-8') as f:
72
+ self.default_grammar = f.read()
73
+ print(f"[{self.name}] Loaded grammar from: {grammar_full_path}")
74
+ else:
75
+ print(f"[{self.name}] Grammar file not found: {grammar_full_path}")
76
+
77
  except Exception as e:
78
  error_msg = str(e) if str(e) else repr(e)
79
  print(f"[{self.name}] Failed to load GGUF model: {error_msg}")
 
164
  "backend": "llama.cpp",
165
  "context_length": self.n_ctx,
166
  "loaded": self._initialized,
167
+ "model_path": self.model_path,
168
+ "has_grammar": self.default_grammar is not None
169
  }
170
 
171
  async def cleanup(self) -> None:
app/models/registry.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
  Model Registry - Central configuration and factory for all LLM models.
3
- Supports lazy loading and on/off mechanism for memory management.
4
  """
5
 
6
  import os
@@ -12,22 +11,31 @@ from app.models.huggingface_local import HuggingFaceLocal
12
  from app.models.huggingface_inference_api import HuggingFaceInferenceAPI
13
  from app.models.llama_cpp_model import LlamaCppModel
14
 
15
-
16
- # Model configuration - 3 local + 1 API for Polish language comparison
17
  MODEL_CONFIG = {
18
- "bielik-1.5b": {
19
- "id": "speakleash/Bielik-1.5B-v3.0-Instruct",
20
- "local_path": "bielik-1.5b",
21
- "type": "local",
22
- "polish_support": "excellent",
23
- "size": "1.5B",
24
- },
25
  "bielik-1.5b-gguf": {
26
  "id": "speakleash/Bielik-1.5B-v3.0-Instruct-GGUF",
27
  "local_path": "bielik-1.5b-gguf",
28
  "filename": "Bielik-1.5B-v3.0-Instruct.Q8_0.gguf",
29
  "type": "gguf",
30
  "size": "1.7 GB",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  },
32
  "qwen2.5-3b": {
33
  "id": "Qwen/Qwen2.5-3B-Instruct",
@@ -42,34 +50,18 @@ MODEL_CONFIG = {
42
  "type": "local",
43
  "polish_support": "medium",
44
  "size": "2B",
45
- },
46
- "pllum-12b": {
47
- "id": "CYFRAGOVPL/PLLuM-12B-instruct",
48
- "type": "inference_api",
49
- "polish_support": "excellent",
50
- "size": "12B",
51
- },
52
  }
53
 
54
- # Base path for pre-downloaded models in container
55
  LOCAL_MODEL_BASE = os.getenv("MODEL_DIR", "/app/pretrain_model")
56
 
57
-
58
  class ModelRegistry:
59
- """
60
- Central registry for managing all LLM models.
61
- Supports lazy loading (load on first request) and unloading for memory management.
62
- Only one local model is loaded at a time to conserve memory.
63
- """
64
-
65
  def __init__(self):
66
  self._models: Dict[str, BaseLLM] = {}
67
  self._config = MODEL_CONFIG.copy()
68
  self._active_local_model: Optional[str] = None
69
 
70
  def _create_model(self, name: str) -> BaseLLM:
71
- """Factory method to create model instance."""
72
-
73
  if name not in self._config:
74
  raise ValueError(f"Unknown model: {name}")
75
 
@@ -77,35 +69,21 @@ class ModelRegistry:
77
  model_type = config["type"]
78
  model_id = config["id"]
79
 
80
- # For local models, check if pre-downloaded version exists
81
- if model_type == "local" and "local_path" in config:
82
  local_path = os.path.join(LOCAL_MODEL_BASE, config["local_path"])
83
- if os.path.exists(local_path):
84
- print(f"Using pre-downloaded model at: {local_path}")
85
- model_id = local_path
86
- else:
87
- print(f"Pre-downloaded model not found at {local_path}, will download from HuggingFace")
88
 
89
- if model_type == "local":
90
- return HuggingFaceLocal(
91
- name=name,
92
- model_id=model_id,
93
- device="cpu"
94
- )
95
  elif model_type == "inference_api":
96
- return HuggingFaceInferenceAPI(
97
- name=name,
98
- model_id=model_id
99
- )
100
  elif model_type == "gguf":
101
- # For GGUF, we expect the file to be present locally or we need to download it
102
  local_path_dir = os.path.join(LOCAL_MODEL_BASE, config.get("local_path", ""))
103
  filename = config.get("filename")
104
  full_path = os.path.join(local_path_dir, filename)
105
 
106
- # Auto-download if missing (simplified logic using huggingface_hub)
107
  if not os.path.exists(full_path):
108
- print(f"GGUF file not found at {full_path}, downloading...")
109
  from huggingface_hub import hf_hub_download
110
  os.makedirs(local_path_dir, exist_ok=True)
111
  full_path = hf_hub_download(
@@ -115,144 +93,54 @@ class ModelRegistry:
115
  local_dir_use_symlinks=False
116
  )
117
 
 
 
 
118
  return LlamaCppModel(
119
  name=name,
120
  model_id=model_id,
121
- model_path=full_path
 
122
  )
123
- else:
124
- raise ValueError(f"Unknown model type: {model_type}")
125
-
126
- async def _unload_model(self, name: str) -> None:
127
- """Unload a model from memory."""
128
- if name in self._models:
129
- model = self._models[name]
130
- # Call cleanup if available
131
- if hasattr(model, 'cleanup'):
132
- await model.cleanup()
133
- del self._models[name]
134
- gc.collect() # Force garbage collection
135
- print(f"Model '{name}' unloaded from memory.")
136
-
137
- async def _unload_all_local_models(self) -> None:
138
- """Unload all local models to free memory."""
139
- local_models = [
140
- name for name, config in self._config.items()
141
- if config["type"] == "local" and name in self._models
142
- ]
143
- for name in local_models:
144
- await self._unload_model(name)
145
- self._active_local_model = None
146
-
147
  async def get_model(self, name: str) -> BaseLLM:
148
- """
149
- Get a model (lazy loading).
150
- For local models: unloads any previously loaded local model first.
151
- For API models: always available without affecting local models.
152
- """
153
- print(f"DEBUG: get_model called for {name}", flush=True)
154
- if name not in self._config:
155
- raise ValueError(f"Unknown model: {name}")
156
-
157
  config = self._config[name]
158
 
159
- # If it's a local model, ensure only one is loaded at a time
160
- if config["type"] == "local":
161
- # Unload current local model if different
162
  if self._active_local_model and self._active_local_model != name:
163
- print(f"Switching from '{self._active_local_model}' to '{name}'...")
164
  await self._unload_model(self._active_local_model)
165
 
166
- # Load the requested model if not already loaded
167
  if name not in self._models:
168
- print(f"Loading model '{name}'...")
169
  model = self._create_model(name)
170
  await model.initialize()
171
  self._models[name] = model
172
  self._active_local_model = name
173
- print(f"Model '{name}' loaded successfully.")
174
 
175
- # For API models, just create/return (no memory concern)
176
  elif config["type"] == "inference_api":
177
  if name not in self._models:
178
- print(f"Initializing API model '{name}'...")
179
- model = self._create_model(name)
180
- await model.initialize()
181
- self._models[name] = model
182
-
183
- # For GGUF models, treat similar to local (single slot?) or API?
184
- # Typically GGUF uses RAM, so we should treat it like 'local' and manage memory.
185
- elif config["type"] == "gguf":
186
- # Unload current local model if different (GGUF also takes RAM)
187
- if self._active_local_model and self._active_local_model != name:
188
- print(f"Switching from '{self._active_local_model}' to '{name}'...")
189
- await self._unload_model(self._active_local_model)
190
-
191
- if name not in self._models:
192
- print(f"Loading GGUF model '{name}'...")
193
  model = self._create_model(name)
194
  await model.initialize()
195
  self._models[name] = model
196
- self._active_local_model = name # Track as active local model
197
 
198
  return self._models[name]
199
-
200
- async def load_model(self, name: str) -> Dict[str, Any]:
201
- """
202
- Explicitly load a model (unloads other local models first).
203
- Returns model info.
204
- """
205
- await self.get_model(name)
206
- return self.get_model_info(name)
207
-
208
- async def unload_model(self, name: str) -> Dict[str, str]:
209
- """
210
- Explicitly unload a model from memory.
211
- """
212
- if name not in self._config:
213
- raise ValueError(f"Unknown model: {name}")
214
-
215
- if name not in self._models:
216
- return {"status": "not_loaded", "model": name}
217
-
218
- await self._unload_model(name)
219
- if self._active_local_model == name:
220
- self._active_local_model = None
221
-
222
- return {"status": "unloaded", "model": name}
223
-
224
  def get_model_info(self, name: str) -> Dict[str, Any]:
225
- """Get info about a specific model."""
226
- if name not in self._config:
227
- raise ValueError(f"Unknown model: {name}")
228
-
229
  config = self._config[name]
230
  return {
231
  "name": name,
232
  "model_id": config["id"],
233
  "type": config["type"],
234
- "polish_support": config["polish_support"],
235
- "size": config["size"],
236
  "loaded": name in self._models,
237
- "active": name == self._active_local_model if config["type"] == "local" else None,
238
  }
239
-
240
- def list_models(self) -> List[Dict[str, Any]]:
241
- """List all available models with their info."""
242
- return [self.get_model_info(name) for name in self._config.keys()]
243
-
244
- def get_available_model_names(self) -> List[str]:
245
- """Get list of available model names."""
246
- return list(self._config.keys())
247
-
248
- def get_active_model(self) -> Optional[str]:
249
- """Get the currently active (loaded) local model name."""
250
- return self._active_local_model
251
-
252
- def get_loaded_models(self) -> List[str]:
253
- """Get list of currently loaded model names."""
254
- return list(self._models.keys())
255
-
256
 
257
- # Global registry instance
258
- registry = ModelRegistry()
 
1
  """
2
  Model Registry - Central configuration and factory for all LLM models.
 
3
  """
4
 
5
  import os
 
11
  from app.models.huggingface_inference_api import HuggingFaceInferenceAPI
12
  from app.models.llama_cpp_model import LlamaCppModel
13
 
14
+ # Model configuration
 
15
  MODEL_CONFIG = {
 
 
 
 
 
 
 
16
  "bielik-1.5b-gguf": {
17
  "id": "speakleash/Bielik-1.5B-v3.0-Instruct-GGUF",
18
  "local_path": "bielik-1.5b-gguf",
19
  "filename": "Bielik-1.5B-v3.0-Instruct.Q8_0.gguf",
20
  "type": "gguf",
21
  "size": "1.7 GB",
22
+ "polish_support": "excellent",
23
+ "grammar_file": "answers.gbnf"
24
+ },
25
+ "bielik-11b-gguf": {
26
+ "id": "speakleash/Bielik-11B-v2.3-Instruct-GGUF",
27
+ "local_path": "bielik-11b-gguf",
28
+ "filename": "Bielik-11B-v2.3-Instruct.Q4_K_M.gguf",
29
+ "type": "gguf",
30
+ "size": "7.2 GB",
31
+ "polish_support": "excellent",
32
+ "grammar_file": "answers.gbnf"
33
+ },
34
+ "llama-3.1-8b": {
35
+ "id": "meta-llama/Llama-3.1-8B-Instruct",
36
+ "type": "inference_api",
37
+ "polish_support": "excellent",
38
+ "size": "8B",
39
  },
40
  "qwen2.5-3b": {
41
  "id": "Qwen/Qwen2.5-3B-Instruct",
 
50
  "type": "local",
51
  "polish_support": "medium",
52
  "size": "2B",
53
+ }
 
 
 
 
 
 
54
  }
55
 
 
56
  LOCAL_MODEL_BASE = os.getenv("MODEL_DIR", "/app/pretrain_model")
57
 
 
58
  class ModelRegistry:
 
 
 
 
 
 
59
  def __init__(self):
60
  self._models: Dict[str, BaseLLM] = {}
61
  self._config = MODEL_CONFIG.copy()
62
  self._active_local_model: Optional[str] = None
63
 
64
  def _create_model(self, name: str) -> BaseLLM:
 
 
65
  if name not in self._config:
66
  raise ValueError(f"Unknown model: {name}")
67
 
 
69
  model_type = config["type"]
70
  model_id = config["id"]
71
 
72
+ if model_type == "local":
 
73
  local_path = os.path.join(LOCAL_MODEL_BASE, config["local_path"])
74
+ model_id = local_path if os.path.exists(local_path) else model_id
75
+ return HuggingFaceLocal(name=name, model_id=model_id, device="cpu")
 
 
 
76
 
 
 
 
 
 
 
77
  elif model_type == "inference_api":
78
+ return HuggingFaceInferenceAPI(name=name, model_id=model_id)
79
+
 
 
80
  elif model_type == "gguf":
 
81
  local_path_dir = os.path.join(LOCAL_MODEL_BASE, config.get("local_path", ""))
82
  filename = config.get("filename")
83
  full_path = os.path.join(local_path_dir, filename)
84
 
85
+ # Download the GGUF file if it is missing
86
  if not os.path.exists(full_path):
 
87
  from huggingface_hub import hf_hub_download
88
  os.makedirs(local_path_dir, exist_ok=True)
89
  full_path = hf_hub_download(
 
93
  local_dir_use_symlinks=False
94
  )
95
 
96
+ # Pass the grammar file to the model
97
+ grammar_path = config.get("grammar_file")
98
+
99
  return LlamaCppModel(
100
  name=name,
101
  model_id=model_id,
102
+ model_path=full_path,
103
+ grammar_path=grammar_path # Upewnij się, że klasa LlamaCppModel to obsługuje
104
  )
105
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  async def get_model(self, name: str) -> BaseLLM:
 
 
 
 
 
 
 
 
 
107
  config = self._config[name]
108
 
109
+ # Manage RAM usage for local and GGUF models (one loaded at a time)
110
+ if config["type"] in ["local", "gguf"]:
 
111
  if self._active_local_model and self._active_local_model != name:
 
112
  await self._unload_model(self._active_local_model)
113
 
 
114
  if name not in self._models:
 
115
  model = self._create_model(name)
116
  await model.initialize()
117
  self._models[name] = model
118
  self._active_local_model = name
 
119
 
 
120
  elif config["type"] == "inference_api":
121
  if name not in self._models:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  model = self._create_model(name)
123
  await model.initialize()
124
  self._models[name] = model
 
125
 
126
  return self._models[name]
127
+
128
+ async def _unload_model(self, name: str) -> None:
129
+ if name in self._models:
130
+ model = self._models[name]
131
+ if hasattr(model, 'cleanup'): await model.cleanup()
132
+ del self._models[name]
133
+ gc.collect()
134
+ print(f"Model '{name}' unloaded.")
135
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def get_model_info(self, name: str) -> Dict[str, Any]:
 
 
 
 
137
  config = self._config[name]
138
  return {
139
  "name": name,
140
  "model_id": config["id"],
141
  "type": config["type"],
 
 
142
  "loaded": name in self._models,
143
+ "active": name == self._active_local_model
144
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
+ registry = ModelRegistry()