credent007 committed on
Commit
1b1512c
·
verified ·
1 Parent(s): 4ff8d3a

Update llm.py

Browse files
Files changed (1) hide show
  1. llm.py +24 -34
llm.py CHANGED
@@ -1,5 +1,5 @@
1
- import os
2
- hf_token=os.getenv("HUGGINGFACE_HUB_TOKEN")
3
  import torch
4
  import asyncio
5
  from functools import partial
@@ -8,16 +8,17 @@ from transformers import AutoProcessor, AutoModelForImageTextToText #, BitsAndBy
8
 
9
  # Quantization config
10
  # quant_config = BitsAndBytesConfig(load_in_8bit=True)
11
-
12
  # Load processor
13
- processor = AutoProcessor.from_pretrained("datalab-to/chandra-ocr-2")
14
 
15
  # Load model (auto device mapping)
16
  model = AutoModelForImageTextToText.from_pretrained(
17
- "datalab-to/chandra-ocr-2",
18
  # quantization_config=quant_config,
19
  device_map="auto",
20
- token=hf_token
 
21
  )
22
 
23
  print("CUDA available:", torch.cuda.is_available())
@@ -29,26 +30,21 @@ if torch.cuda.is_available():
29
 
30
 
31
  def execute_llm(model, processor, image, prompt: str):
32
- print("execute llm")
33
- print(prompt)
34
- a=time.time()
35
- # ✅ Use passed prompt (FIXED)
36
  if not prompt:
37
  prompt = """
38
- Extract all text from the given image and return key value pair like json.
 
39
  """
40
 
41
- messages = [
42
- {
43
- "role": "user",
44
- "content": [
45
- {"type": "image", "image": image},
46
- {"type": "text", "text": prompt}
47
- ]
48
- }
49
- ]
50
 
51
-
52
  with torch.inference_mode():
53
 
54
  inputs = processor.apply_chat_template(
@@ -57,31 +53,25 @@ def execute_llm(model, processor, image, prompt: str):
57
  tokenize=True,
58
  return_dict=True,
59
  return_tensors="pt"
60
- ).to(model.device)
61
 
62
- print("inputs ready")
63
 
64
  outputs = model.generate(
65
  **inputs,
66
- max_new_tokens=1000, # 🔥 REDUCED (important)
67
- do_sample=False # deterministic output
68
  )
69
 
70
- print("generated")
71
-
72
- result = processor.decode(
73
  outputs[0][inputs["input_ids"].shape[-1]:],
74
  skip_special_tokens=True
75
  )
76
- print('total time taken',time.time()-a)
77
- print(result)
78
- return result
79
-
80
 
81
  async def call_llm(image, prompt: str = ""):
82
  print("call llm")
83
 
84
-
85
- result=execute_llm(model,processor,image,prompt)
86
 
87
  return result
 
1
+ from huggingface_hub import login
2
+ login(token=os.getenv("HUGGINGFACE_HUB_TOKEN"))
3
  import torch
4
  import asyncio
5
  from functools import partial
 
8
 
9
  # Quantization config
10
  # quant_config = BitsAndBytesConfig(load_in_8bit=True)
11
+ model_name="Qwen/Qwen3.5-9B-Base"
12
  # Load processor
13
+ processor = AutoProcessor.from_pretrained(model_name)
14
 
15
  # Load model (auto device mapping)
16
  model = AutoModelForImageTextToText.from_pretrained(
17
+ model_name,
18
  # quantization_config=quant_config,
19
  device_map="auto",
20
+ attn_implementation='flash_attention_2'
21
+
22
  )
23
 
24
  print("CUDA available:", torch.cuda.is_available())
 
30
 
31
 
32
  def execute_llm(model, processor, image, prompt: str):
33
+
 
 
 
34
  if not prompt:
35
  prompt = """
36
+ Extract all text from image.
37
+ Return ONLY valid JSON.
38
  """
39
 
40
+ messages = [{
41
+ "role": "user",
42
+ "content": [
43
+ {"type": "image", "image": image},
44
+ {"type": "text", "text": prompt}
45
+ ]
46
+ }]
 
 
47
 
 
48
  with torch.inference_mode():
49
 
50
  inputs = processor.apply_chat_template(
 
53
  tokenize=True,
54
  return_dict=True,
55
  return_tensors="pt"
56
+ )
57
 
58
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
59
 
60
  outputs = model.generate(
61
  **inputs,
62
+ max_new_tokens=200,
63
+ do_sample=False
64
  )
65
 
66
+ return processor.decode(
 
 
67
  outputs[0][inputs["input_ids"].shape[-1]:],
68
  skip_special_tokens=True
69
  )
 
 
 
 
70
 
async def call_llm(image, prompt: str = ""):
    """Run ``execute_llm`` on an image without blocking the event loop.

    ``execute_llm`` is synchronous (it runs ``model.generate``), so it is
    handed to the event loop's default executor and awaited from there.

    Args:
        image: Image forwarded unchanged to ``execute_llm``.
        prompt: Instruction text. An empty string makes ``execute_llm`` fall
            back to its built-in default prompt.

    Returns:
        Whatever ``execute_llm`` returns (the decoded generation).
    """
    print("call llm")

    # Inside a coroutine, get_running_loop() is the supported accessor; it
    # never implicitly creates a loop the way get_event_loop() historically could.
    loop = asyncio.get_running_loop()

    # executor=None selects the loop's default executor.
    result = await loop.run_in_executor(
        None, execute_llm, model, processor, image, prompt
    )

    return result