Sandei committed
Commit e0d9dd8 · 1 Parent(s): 0a70e53

speed improvement
service/knowledge_base.py ADDED
@@ -0,0 +1,25 @@
+
+
+from service.data_loader_service import CSVDataLoader
+from service.embedded_service import EmbeddingService
+from service.vector_store_service import VectorStoreService
+
+
+class KnowledgeBase:
+    def __init__(self, csv_path: str):
+        self.embedder = EmbeddingService()
+
+        loader = CSVDataLoader(csv_path)
+        qa_pairs = loader.load_qa_pairs()
+
+        self.documents = [
+            f"Question: {p['question']}\nAnswer: {p['answer']}"
+            for p in qa_pairs
+        ]
+
+        self.embeddings = self.embedder.embed(self.documents)
+
+        self.vector_store = VectorStoreService(
+            self.embeddings,
+            self.documents
+        )
service/llm_service.py CHANGED
@@ -1,24 +1,25 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-
 class LLMService:
     def __init__(self):
         self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
-        # Tokenizer
+        # 🔥 Limit CPU threads (CRITICAL)
+        torch.set_num_threads(4)          # try 2–6 depending on CPU
+        torch.set_num_interop_threads(1)
+
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_name,
             use_fast=True
         )
 
-        # Load model in FP32 on CPU
         model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             torch_dtype=torch.float32
         )
 
-        # 🔥 CPU INT8 dynamic quantization
+        # 🔥 INT8 dynamic quantization
         self.model = torch.quantization.quantize_dynamic(
             model,
             {torch.nn.Linear},
@@ -27,22 +28,22 @@ class LLMService:
 
         self.model.eval()
 
-        # Optional sanity check
-        print("LLM loaded with dtype:", next(self.model.parameters()).dtype)
+        print("LLM loaded | dtype:", next(self.model.parameters()).dtype)
 
     def generate(self, prompt: str) -> str:
         inputs = self.tokenizer(
             prompt,
             return_tensors="pt",
             truncation=True,
-            max_length=1024
+            max_length=768            # ⬅️ smaller context = faster attention
        )
 
         with torch.no_grad():
             output = self.model.generate(
                 **inputs,
-                max_new_tokens=120,   # ⬅️ faster + enough
-                do_sample=False,      # ⬅️ HUGE speed win
+                max_new_tokens=120,
+                do_sample=False,
+                use_cache=True,       # ⬅️ IMPORTANT: reuse the KV cache
                 eos_token_id=self.tokenizer.eos_token_id
             )
@@ -54,12 +55,10 @@ class LLMService:
         return self._clean(text)
 
     def _clean(self, text: str) -> str:
-        # Extract content AFTER <|assistant|>
         if "<|assistant|>" in text:
             text = text.split("<|assistant|>")[-1]
 
-        # Stop if model continues roles
-        for stop in ["<|system|>", "<|user|>"]:
+        for stop in ("<|system|>", "<|user|>"):
             if stop in text:
                 text = text.split(stop)[0]
 
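One caveat on the dtype sanity print: torch.quantization.quantize_dynamic replaces the nn.Linear modules with dynamically quantized counterparts that hold packed INT8 weights, while the remaining parameters (embeddings, layer norms) stay float32, so next(self.model.parameters()).dtype reports float32 whether or not quantization happened. A more direct check, sketched under the assumption that the quantized Linear class is importable from its usual torch path:

    import torch
    from torch.nn.quantized.dynamic import Linear as DynamicQuantLinear

    def count_quantized_linears(model: torch.nn.Module) -> int:
        # Dynamic quantization swaps nn.Linear for packed INT8 modules;
        # a nonzero count confirms the swap actually took place.
        return sum(isinstance(m, DynamicQuantLinear) for m in model.modules())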
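To confirm the speed levers in this diff (thread cap, INT8 Linears, greedy decoding with the KV cache) actually pay off on a given machine, a rough timing harness; the prompt is illustrative and absolute numbers will vary by CPU:

    import time
    from service.llm_service import LLMService

    svc = LLMService()
    prompt = "<|user|>\nSummarize dynamic quantization in one sentence.\n<|assistant|>\n"

    svc.generate(prompt)  # warm-up so one-time setup is not timed

    start = time.perf_counter()
    answer = svc.generate(prompt)
    print(f"{time.perf_counter() - start:.2f}s: {answer[:80]!r}")
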
service/vector_store_service.py CHANGED
@@ -2,11 +2,13 @@ import numpy as np
 
 class VectorStoreService:
     def __init__(self, embeddings, documents):
-        self.embeddings = np.array(embeddings)
+        self.embeddings = np.array(embeddings, dtype="float32")
         self.documents = documents
 
-    def search(self, query_embedding, top_k: int = 3):
-        query = np.array(query_embedding)
+    def search(self, query_embedding, top_k=3):
+        query = np.array(query_embedding, dtype="float32")
+
         scores = np.dot(self.embeddings, query)
-        top_idx = scores.argsort()[-top_k:][::-1]
-        return [self.documents[i] for i in top_idx]
+        top_indices = scores.argsort()[-top_k:][::-1]
+
+        return [self.documents[i] for i in top_indices]
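
One caveat on the scoring: a raw dot product ranks the same as cosine similarity only when the embedding vectors are unit-normalized. If EmbeddingService does not already normalize (many sentence-embedding models do), a pre-normalization step like the sketch below keeps the ranking cosine-based; the sample vectors are illustrative:

    import numpy as np

    def normalize_rows(matrix: np.ndarray) -> np.ndarray:
        # Scale each row to unit L2 norm so dot product == cosine similarity;
        # the clip guards against division by zero for all-zero rows.
        norms = np.linalg.norm(matrix, axis=-1, keepdims=True)
        return matrix / np.clip(norms, 1e-12, None)

    docs = normalize_rows(np.array([[3.0, 4.0], [1.0, 0.0]], dtype="float32"))
    query = normalize_rows(np.array([[0.6, 0.8]], dtype="float32"))[0]
    print(docs @ query)  # -> [1.0, 0.6], cosine scores in [-1, 1]

For larger corpora, np.argpartition(scores, -top_k) would also be cheaper than the full argsort, since it avoids sorting every score.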