Abdulmateen committed
Commit 89a2202 · verified · 1 parent: 8914801

Update handler.py

Files changed (1): handler.py +27 -42
handler.py CHANGED
@@ -1,61 +1,46 @@
 import torch
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 from PIL import Image
-import requests
-from io import BytesIO
 import base64
+from io import BytesIO
 
 class EndpointHandler:
     def __init__(self, path=""):
-        # This loading logic is robust and correct...
-        base_model_id = "llava-hf/llava-1.5-7b-hf"
-
-        print("Loading processor...")
-        self.processor = AutoProcessor.from_pretrained(base_model_id)
+        # The 'path' is now a self-contained directory with the complete, merged model.
+        # No internet access is needed here.
 
-        print("Loading base model...")
+        print("Loading model and processor from local path...")
+        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
         self.model = LlavaForConditionalGeneration.from_pretrained(
-            base_model_id,
+            path,
             load_in_4bit=True,
             torch_dtype=torch.float16,
-            device_map="auto"
+            device_map="auto",
+            trust_remote_code=True
         )
-
-        print(f"Loading LoRA adapters from repository path: {path}...")
-        # This assumes your repo at `path` contains the LoRA adapter files
-        self.model.load_adapter(path)
-        print("✅ Model and adapters loaded successfully.")
+        print("✅ Model loaded successfully.")
 
     def __call__(self, data: dict) -> dict:
-        # --- Simplified and Corrected Inference Logic ---
-        payload = data.pop("inputs", data)
-
-        prompt_text = payload.pop("prompt", "What can you do?")
-        image_b64 = payload.pop("image_b64", None)
-        max_new_tokens = payload.pop("max_new_tokens", 200)
-
-        image = None
-        if image_b64:
-            try:
-                image_bytes = base64.b64decode(image_b64)
-                image = Image.open(BytesIO(image_bytes)).convert("RGB")
-            except Exception as e:
-                return {"error": f"Failed to decode base64 image: {e}"}
-
-        # This is the key change: a simple, clear separation of logic
-        if image is not None:
-            # --- Case 1: Multimodal (Image + Text) Request ---
-            prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
-            inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.model.device)
-            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
-        else:
-            # --- Case 2: Text-Only Request ---
-            prompt = f"USER: {prompt_text} ASSISTANT:"
-            inputs = self.processor(text=prompt, return_tensors="pt").to(self.model.device)
+        prompt_text = data.pop("prompt", "Describe the image in detail.")
+        image_b64 = data.pop("image_b64", None)
+        max_new_tokens = data.pop("max_new_tokens", 200)
+
+        if not image_b64:
+            return {"error": "No image provided. Please use the 'image_b64' key."}
+
+        try:
+            image_bytes = base64.b64decode(image_b64)
+            image = Image.open(BytesIO(image_bytes))
+        except Exception as e:
+            return {"error": f"Failed to decode or open base64 image: {e}"}
+
+        prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
+        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to("cuda")
+
+        with torch.no_grad():
             output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
 
-        # Decode the output and extract the assistant's response
         full_response = self.processor.decode(output[0], skip_special_tokens=True)
         assistant_response = full_response.split("ASSISTANT:")[-1].strip()
-
+
         return {"generated_text": assistant_response}