credent007 committed on
Commit
d146204
·
verified ·
1 Parent(s): 3378f60

Update llm.py

Browse files
Files changed (1) hide show
  1. llm.py +7 -7
llm.py CHANGED
@@ -1,6 +1,7 @@
1
  import torch
2
  import asyncio
3
  from functools import partial
 
4
  from transformers import AutoProcessor, AutoModelForImageTextToText #, BitsAndBytesConfig
5
 
6
  # Quantization config
@@ -13,7 +14,7 @@ processor = AutoProcessor.from_pretrained("datalab-to/chandra-ocr-2")
13
  model = AutoModelForImageTextToText.from_pretrained(
14
  "datalab-to/chandra-ocr-2",
15
  # quantization_config=quant_config,
16
- device_map="cuda"
17
  )
18
 
19
  print("CUDA available:", torch.cuda.is_available())
@@ -22,11 +23,12 @@ print("Model device:", model.device)
22
  if torch.cuda.is_available():
23
  print("GPU name:", torch.cuda.get_device_name(0))
24
  print("Memory allocated:", torch.cuda.memory_allocated() / 1e9, "GB")
 
25
 
26
- # ✅ SYNC function (runs in thread)
27
  def execute_llm(model, processor, image, prompt: str):
28
  print("execute llm")
29
-
 
30
  # ✅ Use passed prompt (FIXED)
31
  if not prompt:
32
  prompt = """
@@ -43,7 +45,7 @@ def execute_llm(model, processor, image, prompt: str):
43
  }
44
  ]
45
 
46
- # ✅ Inference mode (faster + less memory)
47
  with torch.inference_mode():
48
 
49
  inputs = processor.apply_chat_template(
@@ -68,16 +70,14 @@ def execute_llm(model, processor, image, prompt: str):
68
  outputs[0][inputs["input_ids"].shape[-1]:],
69
  skip_special_tokens=True
70
  )
71
-
72
  print(result)
73
  return result
74
 
75
 
76
- # ✅ ASYNC wrapper (non-blocking FastAPI)
77
  async def call_llm(image, prompt: str = ""):
78
  print("call llm")
79
 
80
- loop = asyncio.get_event_loop()
81
 
82
  result=execute_llm(model,processor,image,prompt)
83
 
 
1
  import torch
2
  import asyncio
3
  from functools import partial
4
+ import time
5
  from transformers import AutoProcessor, AutoModelForImageTextToText #, BitsAndBytesConfig
6
 
7
  # Quantization config
 
14
  model = AutoModelForImageTextToText.from_pretrained(
15
  "datalab-to/chandra-ocr-2",
16
  # quantization_config=quant_config,
17
+ device_map="auto"
18
  )
19
 
20
  print("CUDA available:", torch.cuda.is_available())
 
23
  if torch.cuda.is_available():
24
  print("GPU name:", torch.cuda.get_device_name(0))
25
  print("Memory allocated:", torch.cuda.memory_allocated() / 1e9, "GB")
26
+
27
 
 
28
  def execute_llm(model, processor, image, prompt: str):
29
  print("execute llm")
30
+ print(prompt)
31
+ a=time.time()
32
  # ✅ Use passed prompt (FIXED)
33
  if not prompt:
34
  prompt = """
 
45
  }
46
  ]
47
 
48
+
49
  with torch.inference_mode():
50
 
51
  inputs = processor.apply_chat_template(
 
70
  outputs[0][inputs["input_ids"].shape[-1]:],
71
  skip_special_tokens=True
72
  )
73
+ print('total time taken',time.time()-a)
74
  print(result)
75
  return result
76
 
77
 
 
78
  async def call_llm(image, prompt: str = ""):
79
  print("call llm")
80
 
 
81
 
82
  result=execute_llm(model,processor,image,prompt)
83