Abdulmateen committed on
Commit 7f13c53 · verified · 1 Parent(s): 6dcee73

Upload 2 files


Uploading Inference spices

Files changed (2)
  1. handler.py +66 -0
  2. requirements.txt +7 -0
handler.py ADDED
@@ -0,0 +1,66 @@
+import torch
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+from peft import PeftModel
+from PIL import Image
+import requests
+from io import BytesIO
+import base64
+
+class EndpointHandler:
+    def __init__(self, path=""):
+        # 'path' is the path to your LoRA repo on the Hub, e.g. "Abdulmateen/llava-finetuned"
+
+        # Define the base model that your LoRA was trained on
+        base_model_id = "llava-hf/llava-1.5-7b-hf"
+
+        print("Loading processor...")
+        # Pin to a specific revision for stability
+        self.processor = AutoProcessor.from_pretrained(base_model_id, revision="a272c74")
+
+        print("Loading base model...")
+        # Load the base model in 4-bit for memory efficiency
+        self.model = LlavaForConditionalGeneration.from_pretrained(
+            base_model_id,
+            load_in_4bit=True,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+
+        print(f"Loading LoRA adapters from repository path: {path}...")
+        # Attach your LoRA adapters to the base model
+        self.model = PeftModel.from_pretrained(self.model, path)
+        print("✅ Model and adapters loaded successfully.")
+
+    def __call__(self, data: dict) -> dict:
+        # Get the prompt and image from the request payload
+        prompt_text = data.pop("prompt", "Describe the image in detail.")
+        image_url = data.pop("image_url", None)
+        image_b64 = data.pop("image_b64", None)
+        max_new_tokens = data.pop("max_new_tokens", 200)
+
+        # Load the image from either a URL or a base64-encoded string
+        if image_url:
+            response = requests.get(image_url, timeout=30)
+            image = Image.open(BytesIO(response.content)).convert("RGB")
+        elif image_b64:
+            image_bytes = base64.b64decode(image_b64)
+            image = Image.open(BytesIO(image_bytes)).convert("RGB")
+        else:
+            return {"error": "No image provided. Please use 'image_url' or 'image_b64'."}
+
+        # Format the prompt with the LLaVA chat template
+        prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
+
+        # Process inputs and move them to the GPU
+        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to("cuda")
+
+        # Generate a response
+        with torch.no_grad():
+            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+
+        # Decode the full sequence
+        full_response = self.processor.decode(output[0], skip_special_tokens=True)
+        # Keep only the assistant's part of the response
+        assistant_response = full_response.split("ASSISTANT:")[-1].strip()
+
+        return {"generated_text": assistant_response}
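For reference, a request to the deployed endpoint might look like the sketch below. This is a minimal illustration, not part of the commit: the endpoint URL and token are placeholders, and the flat payload keys mirror the ones handler.py pops from the request (prompt, image_url or image_b64, max_new_tokens).

import base64
import requests

# Placeholders: substitute your own Inference Endpoint URL and HF token.
ENDPOINT_URL = "https://your-endpoint.endpoints.huggingface.cloud"
HF_TOKEN = "hf_..."

headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}

# Option 1: pass the image by URL.
payload = {
    "prompt": "What is shown in this image?",
    "image_url": "http://images.cocodataset.org/val2017/000000039769.jpg",
    "max_new_tokens": 200,
}

# Option 2: pass the image as a base64-encoded string instead.
# with open("image.jpg", "rb") as f:
#     payload = {"prompt": "What is shown in this image?",
#                "image_b64": base64.b64encode(f.read()).decode("utf-8")}

response = requests.post(ENDPOINT_URL, headers=headers, json=payload, timeout=120)
print(response.json())  # e.g. {"generated_text": "..."}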
requirements.txt ADDED
@@ -0,0 +1,7 @@
+torch==2.3.0
+transformers==4.37.2
+accelerate==0.28.0
+bitsandbytes
+peft
+Pillow
+requests
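Before deploying, the handler can be smoke-tested locally on a CUDA machine with the requirements above installed. A minimal sketch; the adapter repo id is the example from the handler's comment and is an assumption here:

from handler import EndpointHandler

# Repo id below is illustrative; point this at the actual LoRA adapter repo.
handler = EndpointHandler(path="Abdulmateen/llava-finetuned")
result = handler({
    "prompt": "Describe the image in detail.",
    "image_url": "http://images.cocodataset.org/val2017/000000039769.jpg",
})
print(result["generated_text"])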