Patryk Studzinski committed on
Commit
db4996d
·
1 Parent(s): d9b1571

increase context size and improve message handling in LlamaCppModel

Browse files
Files changed (1) hide show
  1. app/models/llama_cpp_model.py +16 -28
app/models/llama_cpp_model.py CHANGED
@@ -21,7 +21,7 @@ class LlamaCppModel(BaseLLM):
21
  Provides significant speedups on CPU compared to Transformers.
22
  """
23
 
24
- def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 2048):
25
  super().__init__(name, model_id)
26
  self.model_path = model_path
27
  self.n_ctx = n_ctx
@@ -55,7 +55,7 @@ class LlamaCppModel(BaseLLM):
55
  )
56
 
57
  self._initialized = True
58
- print(f"[{self.name}] GGUF Model loaded successfully")
59
 
60
  except Exception as e:
61
  print(f"[{self.name}] Failed to load GGUF model: {e}")
@@ -75,43 +75,31 @@ class LlamaCppModel(BaseLLM):
75
  if not self._initialized or self.llm is None:
76
  raise RuntimeError(f"[{self.name}] Model not initialized")
77
 
78
- # Format prompt
79
- formatted_prompt = ""
80
- if chat_messages:
81
- # Simple chat formatting
82
- # Llama-cpp might have chat_handler but manual formatting is often safer for custom templates
83
- for msg in chat_messages:
84
- role = msg.get("role", "user")
85
- content = msg.get("content", "")
86
- if role == "system":
87
- formatted_prompt += f"{content}\n\n"
88
- elif role == "user":
89
- formatted_prompt += f"User: {content}\n"
90
- elif role == "assistant":
91
- formatted_prompt += f"Assistant: {content}\n"
92
- formatted_prompt += "Assistant:"
93
- elif prompt:
94
- formatted_prompt = prompt
95
- else:
96
  raise ValueError("Either prompt or chat_messages required")
97
 
98
- # Cache Check
99
- cache_key = f"{formatted_prompt}_{max_new_tokens}_{temperature}_{top_p}"
 
100
  if cache_key in self._response_cache:
101
  return self._response_cache[cache_key]
102
 
103
- # Generate
104
  output = await asyncio.to_thread(
105
- self.llm.create_completion,
106
- formatted_prompt,
107
  max_tokens=max_new_tokens,
108
  temperature=temperature,
109
  top_p=top_p,
110
- stop=["User:", "Assistant:"],
111
- echo=False
112
  )
113
 
114
- response_text = output['choices'][0]['text'].strip()
115
 
116
  # Cache Store
117
  if len(self._response_cache) >= self._max_cache_size:
 
21
  Provides significant speedups on CPU compared to Transformers.
22
  """
23
 
24
+ def __init__(self, name: str, model_id: str, model_path: str = None, n_ctx: int = 8192):
25
  super().__init__(name, model_id)
26
  self.model_path = model_path
27
  self.n_ctx = n_ctx
 
55
  )
56
 
57
  self._initialized = True
58
+ print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx})")
59
 
60
  except Exception as e:
61
  print(f"[{self.name}] Failed to load GGUF model: {e}")
 
75
  if not self._initialized or self.llm is None:
76
  raise RuntimeError(f"[{self.name}] Model not initialized")
77
 
78
+ # Ensure we have a list of messages
79
+ messages = chat_messages
80
+ if not messages and prompt:
81
+ messages = [{"role": "user", "content": prompt}]
82
+
83
+ if not messages:
 
 
 
 
 
 
 
 
 
 
 
 
84
  raise ValueError("Either prompt or chat_messages required")
85
 
86
+ # Cache Check - using stringified messages for the key
87
+ import json
88
+ cache_key = f"{json.dumps(messages)}_{max_new_tokens}_{temperature}_{top_p}"
89
  if cache_key in self._response_cache:
90
  return self._response_cache[cache_key]
91
 
92
+ # Generate using chat completion to leverage internal templates
93
  output = await asyncio.to_thread(
94
+ self.llm.create_chat_completion,
95
+ messages=messages,
96
  max_tokens=max_new_tokens,
97
  temperature=temperature,
98
  top_p=top_p,
99
+ # No manual stop tokens needed usually as template handles them
 
100
  )
101
 
102
+ response_text = output['choices'][0]['message']['content'].strip()
103
 
104
  # Cache Store
105
  if len(self._response_cache) >= self._max_cache_size: