credent007 committed on
Commit
ed121f1
·
verified ·
1 Parent(s): 342762f

Update llm.py

Browse files
Files changed (1) hide show
  1. llm.py +34 -40
llm.py CHANGED
@@ -2,55 +2,49 @@ import torch
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
3
  from functools import partial
4
  import asyncio
5
-
6
  model_name = "microsoft/phi-3-mini-128k-instruct"
7
 
8
  # 8-bit quantization config
9
  quant_config = BitsAndBytesConfig(
10
- load_in_8bit=True,
11
- llm_int8_enable_fp32_cpu_offload=True # Helpful if GPU memory gets tight
12
  )
13
 
14
  # tokenizer
15
- tokenizer = AutoTokenizer.from_pretrained(model_name)
16
 
17
  # model
18
  model = AutoModelForCausalLM.from_pretrained(
19
- model_name,
20
- device_map="auto",
21
- quantization_config=quant_config,
22
- trust_remote_code=True
 
23
  )
24
-
25
- def execute_llm_sync(model, tokenizer, prompt: str):
26
- """Synchronous function to run the actual inference."""
27
- messages = [
28
- {"role": "user", "content": prompt}
29
- ]
30
-
31
- # Prep inputs and move to the same device as the model (usually GPU)
32
- inputs = tokenizer.apply_chat_template(
33
- messages,
34
- add_generation_prompt=True,
35
- return_tensors="pt"
36
- ).to(model.device)
37
-
38
- # Generate
39
- with torch.no_grad():
40
- outputs = model.generate(
41
- **inputs,
42
- max_new_tokens=2048 # 10,000 is very high; Phi-3 works better with smaller chunks
43
- )
44
-
45
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
46
-
47
  async def call_llm(prompt: str):
48
- """Asynchronous wrapper to prevent blocking the main thread."""
49
- loop = asyncio.get_running_loop()
50
-
51
- # We use partial to pass arguments to the synchronous function
52
- func = partial(execute_llm_sync, model, tokenizer, prompt)
53
-
54
- # Run the heavy computation in a background thread
55
- result = await loop.run_in_executor(None, func)
56
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
3
  from functools import partial
4
  import asyncio
 
5
  model_name = "microsoft/phi-3-mini-128k-instruct"
6
 
7
  # 8-bit quantization config
8
  quant_config = BitsAndBytesConfig(
9
+     load_in_8bit=True
 
10
  )
11
 
12
  # tokenizer
13
+ tokenizer = AutoTokenizer.from_pretrained(model_name,local_files_only=True)
14
 
15
  # model
16
  model = AutoModelForCausalLM.from_pretrained(
17
+     model_name,
18
+     device_map="auto",
19
+     quantization_config=quant_config,
20
+     local_files_only=True,
21
+     trust_remote_code=True
22
  )
23
+ # Async wrapper around local LLM inference (not a mock: it uses the model and tokenizer loaded above)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
async def call_llm(prompt: str) -> str:
    """Generate a reply to *prompt* with the module-level model and tokenizer.

    Thin convenience wrapper over :func:`execute_llm` that supplies the
    globally loaded ``model`` and ``tokenizer``.

    Args:
        prompt: The user message to send to the model.

    Returns:
        The decoded text produced by the model.
    """
    return await execute_llm(model, tokenizer, prompt)


async def execute_llm(model, tokenizer, prompt: str) -> str:
    """Run one generation for *prompt* without blocking the event loop.

    The blocking tokenize/generate/decode work is handed to the default
    executor so other coroutines keep running while the model is busy.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template`` and ``decode``.
        prompt: The user message to send to the model.

    Returns:
        The decoded text produced by the model.
    """
    loop = asyncio.get_running_loop()
    # partial() must receive the callable and its arguments separately;
    # calling the function inside partial() would run it on the event loop.
    blocking_call = partial(_generate_sync, model, tokenizer, prompt)
    return await loop.run_in_executor(None, blocking_call)


def _generate_sync(model, tokenizer, prompt: str) -> str:
    """Tokenize *prompt* as a single user turn, generate, and decode (blocking)."""
    messages = [
        {"role": "user", "content": prompt},
    ]

    # return_dict=True guarantees a mapping that can be unpacked into
    # generate(); on releases where it defaults to False a bare tensor is
    # returned and ``**inputs`` would fail.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
    )

    # NOTE(review): for causal LMs the first sequence normally holds the prompt
    # tokens followed by the completion, so the decoded text likely includes
    # the prompt - confirm whether callers expect that.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)