credent007 committed on
Commit
da8bf0e
·
verified ·
1 Parent(s): ed121f1

Update llm.py

Browse files
Files changed (1) hide show
  1. llm.py +46 -40
llm.py CHANGED
@@ -1,50 +1,56 @@
1
  import torch
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
3
- from functools import partial
4
  import asyncio
 
 
5
  model_name = "microsoft/phi-3-mini-128k-instruct"
6
 
7
- # 8-bit quantization config
8
- quant_config = BitsAndBytesConfig(
9
-     load_in_8bit=True
10
- )
11
 
12
- # tokenizer
13
- tokenizer = AutoTokenizer.from_pretrained(model_name,local_files_only=True)
 
 
 
14
 
15
- # model
16
  model = AutoModelForCausalLM.from_pretrained(
17
-     model_name,
18
-     device_map="auto",
19
-     quantization_config=quant_config,
20
-     local_files_only=True,
21
-     trust_remote_code=True
22
  )
23
- # Mock LLM function (replace with your actual LLM API call)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  async def call_llm(prompt: str):
25
-     # Simulate LLM call in executor
26
-     llm_function_with_args=partial(execute_llm(model,tokenizer,prompt))
27
-     loop = asyncio.get_event_loop()
28
-     result = await loop.run_in_executor(None, llm_function_with_args)
29
-     return result
30
-
31
- async def execute_llm(model,tokenizer,prompt:str):
32
-     prompt="what is json give an example "
33
-     data=""
34
-     full_prompt=prompt+" "+data
35
-     messages = [
36
-         {"role": "user", "content":full_prompt }
37
-     ]
38
-
39
-     inputs = tokenizer.apply_chat_template(
40
-         messages,
41
-         add_generation_prompt=True,
42
-         return_tensors="pt"
43
-     ).to(model.device)
44
-
45
-     outputs = model.generate(
46
-         **inputs, 
47
-         max_new_tokens=500
48
-     )
49
-
50
-     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
1
  import torch
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
3
  import asyncio
4
+ from functools import partial
5
+
6
  model_name = "microsoft/phi-3-mini-128k-instruct"
7
 
8
# Quantization settings: ask bitsandbytes to load the weights in 8-bit.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

# Tokenizer for `model_name`; local_files_only=True restricts the lookup to
# files already present on this machine (no download is attempted).
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)

# Causal-LM weights for `model_name`, loaded once at import time.
#   device_map="auto"        -> let the library decide device placement
#   quantization_config      -> the 8-bit config defined above
#   local_files_only=True    -> use only files already on disk
#   trust_remote_code=True   -> allow custom modelling code shipped with the
#                               checkpoint to run; only safe for trusted repos
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quant_config,
    local_files_only=True,
    trust_remote_code=True,
)
25
+
26
+ # ✅ SYNC function (important)
27
def execute_llm(
    model,
    tokenizer,
    prompt: str,
    *,
    max_new_tokens: int = 300,
    temperature: float = 0.7,
    strip_prompt: bool = False,
) -> str:
    """Run one blocking, single-turn chat generation and return the text.

    This function is deliberately synchronous so that it can be handed to a
    thread-pool executor; it must not be declared ``async``.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        prompt: User message, sent as the only message of the conversation.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature (sampling is always enabled).
        strip_prompt: When true, drop the prompt tokens from the decoded
            result so that only the newly generated text is returned. The
            default (false) decodes the whole output sequence.

    Returns:
        The decoded output sequence with special tokens removed.
    """
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True makes apply_chat_template return a mapping (input_ids
    # and, typically, attention_mask) instead of a bare tensor. Without it the
    # ``**inputs`` unpacking below fails with a TypeError.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
        )

    generated = outputs[0]
    if strip_prompt:
        # The output sequence starts with the prompt tokens; skip past them.
        prompt_length = inputs["input_ids"].shape[-1]
        generated = generated[prompt_length:]

    return tokenizer.decode(generated, skip_special_tokens=True)
47
+
48
+
49
# Async wrapper: keeps the event loop responsive while the blocking
# generation call runs on a worker thread.
async def call_llm(prompt: str) -> str:
    """Generate a reply for ``prompt`` without blocking the event loop.

    The synchronous ``execute_llm`` is submitted to the loop's default
    executor and awaited.

    Args:
        prompt: User message forwarded to ``execute_llm``.

    Returns:
        Whatever ``execute_llm`` returns for this prompt.
    """
    # get_running_loop() is the recommended call inside a coroutine;
    # get_event_loop() is discouraged there.
    loop = asyncio.get_running_loop()

    # partial binds the arguments now; the executor invokes it later with none.
    blocking_call = partial(execute_llm, model, tokenizer, prompt)
    return await loop.run_in_executor(None, blocking_call)