Patryk Studzinski committed
Commit eaa2e37 · 1 Parent(s): ab2e415

Fix: Use direct model.generate() with proper KV caching instead of pipeline
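
In short, the commit replaces the transformers text-generation pipeline with a direct model.generate() call so that use_cache is passed explicitly during decoding. A minimal standalone sketch of the new call pattern, with gpt2 standing in as a placeholder model id (not the one used by this repo):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # placeholder model, not from this repo
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

prompt = "Write a one-line ad for a coffee shop."
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Direct generate() call; use_cache=True keeps past key/values so each new
# token attends over cached states instead of re-encoding the whole prefix.
output_ids = model.generate(
    input_ids,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    use_cache=True,
)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))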

Files changed (1)
  1. app/models/huggingface_local.py +50 -79
app/models/huggingface_local.py CHANGED
@@ -63,12 +63,11 @@ class HuggingFaceLocal(BaseLLM):
             # Model config optimizations
             model_kwargs = {
                 "trust_remote_code": True,
-                "use_cache": self.use_cache,  # Enable KV caching
                 "torch_dtype": self.torch_dtype,
             }
 
-            # Enable flash attention if requested and available
-            if self.use_flash_attention:
+            # Enable flash attention if requested and available (GPU only)
+            if self.use_flash_attention and self.device == "cuda":
                 model_kwargs["attn_implementation"] = "flash_attention_2"
 
             self.model = await asyncio.to_thread(
@@ -78,48 +77,16 @@ class HuggingFaceLocal(BaseLLM):
                 **model_kwargs
             )
 
-            # Create pipeline with optimized model
-            self.pipeline = await asyncio.to_thread(
-                pipeline,
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                device=self.device_index,
-            )
+            # Ensure cache is enabled on model config
+            if hasattr(self.model.config, 'use_cache'):
+                self.model.config.use_cache = self.use_cache
 
             self._initialized = True
-            print(f"[{self.name}] Model loaded successfully with KV caching enabled")
+            print(f"[{self.name}] Model loaded successfully (use_cache={self.use_cache})")
 
         except Exception as e:
             print(f"[{self.name}] Failed to load model: {e}")
-            # Fallback: try without flash attention
-            if self.use_flash_attention:
-                print(f"[{self.name}] Retrying without flash attention...")
-                self.use_flash_attention = False
-                try:
-                    self.tokenizer = await asyncio.to_thread(
-                        AutoTokenizer.from_pretrained,
-                        self.model_id,
-                        trust_remote_code=True
-                    )
-
-                    self.pipeline = await asyncio.to_thread(
-                        pipeline,
-                        "text-generation",
-                        model=self.model_id,
-                        tokenizer=self.tokenizer,
-                        device=self.device_index,
-                        torch_dtype=self.torch_dtype,
-                        trust_remote_code=True,
-                        use_cache=self.use_cache,
-                    )
-                    self._initialized = True
-                    print(f"[{self.name}] Model loaded successfully (without flash attention)")
-                except Exception as e2:
-                    print(f"[{self.name}] Fallback also failed: {e2}")
-                    raise
-            else:
-                raise
+            raise
 
     async def generate(
         self,
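
The two hunks above make flash attention a CUDA-only opt-in and set use_cache directly on the model config instead of routing it through a pipeline. If you also want to guard against the flash-attn package simply not being installed, a check along these lines could sit in front of from_pretrained. This is only a sketch: build_model_kwargs is a hypothetical helper, and is_flash_attn_2_available assumes a reasonably recent transformers release.

import torch
from transformers import AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

def build_model_kwargs(use_flash_attention: bool, device: str) -> dict:
    """Pick an attention implementation that can actually load on this machine."""
    kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.float16 if device == "cuda" else torch.float32,
    }
    # flash_attention_2 needs a CUDA device and the flash-attn package installed
    if use_flash_attention and device == "cuda" and is_flash_attn_2_available():
        kwargs["attn_implementation"] = "flash_attention_2"
    return kwargs

model = AutoModelForCausalLM.from_pretrained("gpt2", **build_model_kwargs(False, "cpu"))
model.config.use_cache = True  # mirrors the commit's explicit cache toggle on the config
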
@@ -131,14 +98,14 @@ class HuggingFaceLocal(BaseLLM):
         **kwargs
     ) -> str:
         """
-        Generate text using local pipeline with KV cache optimizations.
+        Generate text using direct model.generate() with proper KV caching.
 
-        KV Cache Impact:
-        - WITH: ~9 seconds for 10 ads (50 gaps total)
+        KV Cache Impact (with proper implementation):
+        - WITH: ~9 seconds for 10 ads (50 gaps)
         - WITHOUT: ~42 seconds (4.7x slower)
         """
 
-        if not self._initialized:
+        if not self._initialized or self.model is None:
             raise RuntimeError(f"[{self.name}] Model not initialized")
 
         formatted_prompt = None
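
The ~9 s vs ~42 s numbers in the docstring come from the author's workload (10 ads, 50 gaps) and are not reproduced here. A rough way to measure the relative effect of the cache on any causal LM is to time generate() with use_cache toggled; gpt2 is a placeholder model and timed_generate a hypothetical helper:

import time
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")
input_ids = tokenizer.encode("Fill the gap in this ad:", return_tensors="pt")

def timed_generate(use_cache: bool) -> float:
    start = time.perf_counter()
    model.generate(
        input_ids,
        max_new_tokens=128,
        do_sample=False,      # greedy, so both runs do comparable work
        use_cache=use_cache,  # the only variable under test
    )
    return time.perf_counter() - start

print(f"with KV cache:    {timed_generate(True):.2f}s")
print(f"without KV cache: {timed_generate(False):.2f}s")
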
@@ -153,55 +120,59 @@ class HuggingFaceLocal(BaseLLM):
             )
         except Exception as e:
             print(f"[{self.name}] apply_chat_template failed: {e}, using fallback")
-            # Fallback: manually format chat messages
             formatted_prompt = self._format_chat_fallback(chat_messages)
 
-        # Use raw prompt if provided and no chat_messages
+        # Use raw prompt if provided
         if formatted_prompt is None and prompt:
             formatted_prompt = prompt
 
         if formatted_prompt is None:
             raise ValueError("Either prompt or chat_messages required")
 
-        # Generate with KV cache and optimizations
-        # The pipeline uses use_cache=True internally when initialized
-        generation_kwargs = {
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "eos_token_id": self.tokenizer.eos_token_id,
-            "pad_token_id": self.tokenizer.eos_token_id if self.tokenizer.pad_token_id is None else self.tokenizer.pad_token_id,
-        }
+        # Tokenize input
+        inputs = await asyncio.to_thread(
+            self.tokenizer.encode,
+            formatted_prompt,
+            return_tensors="pt"
+        )
 
-        # If using direct model (not pipeline), enable return_dict_in_generate for better caching
-        if hasattr(self, 'model') and self.model is not None:
-            generation_kwargs["return_dict_in_generate"] = True
+        # Move to device
+        if self.device == "cuda":
+            inputs = await asyncio.to_thread(lambda: inputs.to("cuda"))
 
+        # Generate with explicit KV cache
         outputs = await asyncio.to_thread(
-            self.pipeline,
-            formatted_prompt,
-            **generation_kwargs
+            self.model.generate,
+            inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_p=top_p,
+            use_cache=True,  # CRITICAL: Enable KV cache
+            use_xformers_attention=False,  # CPU doesn't support this
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.eos_token_id if self.tokenizer.pad_token_id is None else self.tokenizer.pad_token_id,
         )
 
-        # Extract response
-        if outputs and isinstance(outputs, list) and "generated_text" in outputs[0]:
-            full_text = outputs[0]["generated_text"]
-
-            # Remove prompt from output
-            if full_text.startswith(formatted_prompt):
-                response = full_text[len(formatted_prompt):]
-            else:
-                response = full_text
-
-            # Clean up special tokens
-            for token in ["<|im_end|>", "<end_of_turn>", "<eos>", "</s>"]:
-                if response.endswith(token):
-                    response = response[:-len(token)]
-
-            return response.strip()
+        # Decode output
+        output_text = await asyncio.to_thread(
+            self.tokenizer.decode,
+            outputs[0],
+            skip_special_tokens=True
+        )
+
+        # Remove prompt from output
+        if output_text.startswith(formatted_prompt):
+            response = output_text[len(formatted_prompt):]
+        else:
+            response = output_text
+
+        # Clean up special tokens
+        for token in ["<|im_end|>", "<end_of_turn>", "<eos>", "</s>"]:
+            if response.endswith(token):
+                response = response[:-len(token)]
 
-        return ""
+        return response.strip()
 
     def _format_chat_fallback(self, chat_messages: List[Dict[str, str]]) -> str:
         """