quantization
service/llm_service.py  +42 -16  CHANGED
@@ -1,34 +1,60 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
+
 class LLMService:
     def __init__(self):
+        self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+        # Tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(
-            …
+            self.model_name,
+            use_fast=True
         )
-        …
-        …
-        …
-        …
+
+        # Load model in FP32 on CPU
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.float32
+        )
+
+        # 🔥 CPU INT8 dynamic quantization
+        self.model = torch.quantization.quantize_dynamic(
+            model,
+            {torch.nn.Linear},
+            dtype=torch.qint8
         )
 
+        self.model.eval()
+
+        # Optional sanity check
+        print("LLM loaded with dtype:", next(self.model.parameters()).dtype)
+
     def generate(self, prompt: str) -> str:
-        inputs = self.tokenizer(
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
+        inputs = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=1024
+        )
+
+        with torch.no_grad():
+            output = self.model.generate(
+                **inputs,
+                max_new_tokens=120,   # ⬅️ faster + enough
+                do_sample=False,      # ⬅️ HUGE speed win
+                eos_token_id=self.tokenizer.eos_token_id
+            )
+
+        text = self.tokenizer.decode(
+            output[0],
+            skip_special_tokens=False
         )
 
-        text = self.tokenizer.decode(output[0], skip_special_tokens=False)
         return self._clean(text)
 
     def _clean(self, text: str) -> str:
-        #
+        # Extract content AFTER <|assistant|>
         if "<|assistant|>" in text:
             text = text.split("<|assistant|>")[-1]
 
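For anyone picking this up: a minimal sketch of how the quantized service could be exercised and verified, assuming the file above is importable as `service.llm_service`. The prompt string follows TinyLlama-Chat's Zephyr-style template (the same `<|assistant|>` marker that `_clean()` splits on); everything outside the `LLMService` class itself is illustrative, not part of this commit.

    from service.llm_service import LLMService

    svc = LLMService()

    # Dynamically quantized Linear replacements are defined under
    # torch.(ao.)nn.quantized.dynamic, so their module path contains
    # "quantized" -- a nonzero count means the INT8 swap actually happened.
    n_int8 = sum("quantized" in type(m).__module__ for m in svc.model.modules())
    print(f"dynamically quantized Linear layers: {n_int8}")

    # Zephyr-style prompt, matching what _clean() expects to strip.
    prompt = "<|user|>\nWhat does INT8 quantization trade away?</s>\n<|assistant|>\n"
    print(svc.generate(prompt))

One caveat on the sanity check in `__init__`: `quantize_dynamic` stores the INT8 Linear weights as packed attributes that do not appear in `.parameters()`, so `next(self.model.parameters()).dtype` will still print `torch.float32` even after a successful swap (it reads an embedding or layer-norm weight, which genuinely stays FP32). Counting quantized modules, as above, is the more telling check.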