Sandei committed on
Commit 28cd2e3 · 1 Parent(s): a5d886c

history error temp fix

__pycache__/app.cpython-314.pyc CHANGED
Binary files a/__pycache__/app.cpython-314.pyc and b/__pycache__/app.cpython-314.pyc differ
 
app.py CHANGED
@@ -152,7 +152,7 @@ def query_endpoint(req: QueryRequest):
     categories, urgency = classify_text(req.query)
 
     # RAG
-    answer = generate_answer(req.query, history)
+    answer = generate_answer(req.query, "1")
 
     # Update conversation memory
     add_message(req.user_id, "user", req.query)
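
Note on the hunk above: this is the "temp fix" of the commit message. The second argument of generate_answer previously received a history value that was raising the error, so the call is pinned to the literal "1" for now, which drops conversation context from the RAG answer. A minimal sketch of how real history might be restored once the underlying error is fixed, assuming a hypothetical get_history(user_id) helper alongside the existing add_message (neither the helper nor the expected shape of the history argument is confirmed by this commit):

# Hypothetical follow-up, not part of this commit: feed stored history
# back into the RAG call instead of the "1" placeholder.
def query_endpoint(req: QueryRequest):
    categories, urgency = classify_text(req.query)

    # RAG
    history = get_history(req.user_id)  # assumed helper, mirrors add_message
    answer = generate_answer(req.query, history)

    # Update conversation memory
    add_message(req.user_id, "user", req.query)
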
service/__pycache__/llm_service.cpython-314.pyc CHANGED
Binary files a/service/__pycache__/llm_service.cpython-314.pyc and b/service/__pycache__/llm_service.cpython-314.pyc differ
 
service/llm_service.py CHANGED
@@ -6,21 +6,30 @@ class LLMService:
         model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+
         self.model = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.float16,
-            device_map="auto"
+            torch_dtype=torch.float32,  # IMPORTANT
         )
 
+        self.model.to("cpu")
+        self.model.eval()
+
     def generate(self, prompt: str) -> str:
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
-
-        output = self.model.generate(
-            **inputs,
-            max_new_tokens=200,
-            temperature=0.3,
-            top_p=0.9,
-            do_sample=True
-        )
+        inputs = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=2048
+        )
+
+        with torch.no_grad():
+            output = self.model.generate(
+                **inputs,
+                max_new_tokens=200,
+                temperature=0.3,
+                top_p=0.9,
+                do_sample=True
+            )
 
         return self.tokenizer.decode(output[0], skip_special_tokens=True)
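
Note on the rewrite above: float16 matrix ops are only partially implemented for CPU in PyTorch, and device_map="auto" assumes an accelerator is available, so on a CPU-only host loading in torch.float32 with an explicit .to("cpu") and eval() is the safe combination. truncation=True with max_length=2048 keeps prompts inside TinyLlama's 2048-token context window, and torch.no_grad() skips autograd bookkeeping during generation. A minimal smoke test of the updated service, assuming LLMService() takes no constructor arguments (the diff shows only part of __init__):

# Quick check that the CPU/float32 service loads and generates.
from service.llm_service import LLMService

svc = LLMService()
print(svc.generate("Summarize what a RAG pipeline does in one sentence."))
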