Spaces:
Paused
Paused
π¨ Mimic GenAIOrchestration
Browse files
- app.py +3 -1
- internvl_utils.py +17 -5
- models/InternVL3/intervl3.py +35 -9
- models/misc_utils.py +22 -1
- payload_model.py +5 -2
app.py
CHANGED
|
@@ -19,7 +19,9 @@ def healthcheck():
|
|
| 19 |
async def inference(payload: PayloadModel, token: str = Depends(authenticate_token)):
|
| 20 |
try:
|
| 21 |
model_response = await internvl_inference(model, payload)
|
| 22 |
-
|
|
|
|
|
|
|
| 23 |
except Exception as e:
|
| 24 |
print(f"Error: {e}")
|
| 25 |
return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})
|
|
|
|
| 19 |
async def inference(payload: PayloadModel, token: str = Depends(authenticate_token)):
|
| 20 |
try:
|
| 21 |
model_response = await internvl_inference(model, payload)
|
| 22 |
+
model_response = "True" if model_response else "False"
|
| 23 |
+
final_response = {"1":{"query_status": model_response}}
|
| 24 |
+
return JSONResponse(status_code=200, content={"final_response": final_response})
|
| 25 |
except Exception as e:
|
| 26 |
print(f"Error: {e}")
|
| 27 |
return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})
|
internvl_utils.py
CHANGED
|
@@ -6,7 +6,7 @@ from torchvision.transforms.functional import InterpolationMode
|
|
| 6 |
from transformers import AutoConfig
|
| 7 |
from models import InternVL3
|
| 8 |
from payload_model import PayloadModel
|
| 9 |
-
from models.misc_utils import
|
| 10 |
|
| 11 |
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
| 12 |
IMAGENET_STD = (0.229, 0.224, 0.225)
|
|
@@ -75,9 +75,10 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbna
|
|
| 75 |
processed_images.append(thumbnail_img)
|
| 76 |
return processed_images
|
| 77 |
|
| 78 |
-
def load_image(
|
| 79 |
-
|
| 80 |
-
|
|
|
|
| 81 |
transform = build_transform(input_size=input_size)
|
| 82 |
images = dynamic_preprocess(pil_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
|
| 83 |
pixel_values = [transform(image) for image in images]
|
|
@@ -114,4 +115,15 @@ def split_model(model_name):
|
|
| 114 |
return device_map
|
| 115 |
|
| 116 |
async def internvl_inference(model: InternVL3, payload: PayloadModel):
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from transformers import AutoConfig
|
| 7 |
from models import InternVL3
|
| 8 |
from payload_model import PayloadModel
|
| 9 |
+
from models.misc_utils import get_images_using_bbox, get_whole_image
|
| 10 |
|
| 11 |
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
| 12 |
IMAGENET_STD = (0.229, 0.224, 0.225)
|
|
|
|
| 75 |
processed_images.append(thumbnail_img)
|
| 76 |
return processed_images
|
| 77 |
|
| 78 |
+
def load_image(pil_image, input_size=448, max_num=12):
|
| 79 |
+
pil_image = pil_image.convert('RGB')
|
| 80 |
+
|
| 81 |
+
# pil_image = convert_base64_to_pil(image)
|
| 82 |
transform = build_transform(input_size=input_size)
|
| 83 |
images = dynamic_preprocess(pil_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
|
| 84 |
pixel_values = [transform(image) for image in images]
|
|
|
|
| 115 |
return device_map
|
| 116 |
|
| 117 |
async def internvl_inference(model: InternVL3, payload: PayloadModel):
    """Prepare the input images from the payload and run the model on them.

    When ``payload.input_utilization_mode`` is ``"bbox"`` the images come from
    ``get_images_using_bbox``; any other value falls back to
    ``get_whole_image``.

    Args:
        model: Awaitable callable invoked as
            ``model(images, prompt, prompt_eval_mode)``.
        payload: Request payload supplying the image data, the prompt keyword
            and the evaluation mode.

    Returns:
        Whatever awaiting ``model(...)`` returns.

    Raises:
        Exception: If image preparation fails. The original error is attached
            as ``__cause__``.
    """
    try:
        if payload.input_utilization_mode == "bbox":
            images = get_images_using_bbox(payload)
        else:
            images = get_whole_image(payload)
    except Exception as e:
        # Chain explicitly so the root failure is reported as the direct cause
        # instead of an incidental "during handling" context.
        raise Exception(f"Error: {e}") from e

    return await model(images, payload.prompt, payload.prompt_eval_mode)
|
models/InternVL3/intervl3.py
CHANGED
|
@@ -4,7 +4,7 @@ from payload_model import PayloadModel
|
|
| 4 |
from internvl_utils import load_image
|
| 5 |
from pydantic import BaseModel, Field
|
| 6 |
from typing import Optional
|
| 7 |
-
|
| 8 |
class InternVL3(BaseModel):
|
| 9 |
model_name: str
|
| 10 |
model: Optional[AutoModel] = None
|
|
@@ -43,11 +43,11 @@ respond with "Yes" else respond with "No". Limit your response to either "Yes" o
|
|
| 43 |
query_prompt = None
|
| 44 |
return query_prompt
|
| 45 |
|
| 46 |
-
def predict(self,
|
| 47 |
-
pixel_values = load_image(
|
| 48 |
-
query_prompt = self.get_query_prompt(
|
| 49 |
if query_prompt is None:
|
| 50 |
-
model_response = f"Invalid prompt keyword: {
|
| 51 |
else:
|
| 52 |
model_response = self.model.chat(
|
| 53 |
self.tokenizer,
|
|
@@ -58,11 +58,37 @@ respond with "Yes" else respond with "No". Limit your response to either "Yes" o
|
|
| 58 |
|
| 59 |
return model_response
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def extract_model_response(self, model_response: str):
|
| 62 |
return "Yes" in model_response
|
| 63 |
|
| 64 |
-
async def __call__(self,
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
|
|
|
| 4 |
from internvl_utils import load_image
|
| 5 |
from pydantic import BaseModel, Field
|
| 6 |
from typing import Optional
|
| 7 |
+
import PIL
|
| 8 |
class InternVL3(BaseModel):
|
| 9 |
model_name: str
|
| 10 |
model: Optional[AutoModel] = None
|
|
|
|
| 43 |
query_prompt = None
|
| 44 |
return query_prompt
|
| 45 |
|
| 46 |
+
def predict(self, pil_image: PIL.Image.Image, prompt_keyword: str):
|
| 47 |
+
pixel_values = load_image(pil_image)
|
| 48 |
+
query_prompt = self.get_query_prompt(prompt_keyword)
|
| 49 |
if query_prompt is None:
|
| 50 |
+
model_response = f"Invalid prompt keyword: {prompt_keyword}"
|
| 51 |
else:
|
| 52 |
model_response = self.model.chat(
|
| 53 |
self.tokenizer,
|
|
|
|
| 58 |
|
| 59 |
return model_response
|
| 60 |
|
| 61 |
+
def eval_or(self, images: list[PIL.Image.Image], prompt_keyword: str):
    """Evaluate the prompt on each image until one answer is affirmative.

    Images are processed in order with ``self.predict``; evaluation stops at
    the first response that ``self.extract_model_response`` accepts.

    Returns:
        A ``(matched, responses)`` tuple. ``matched`` is True if some image
        produced an affirmative answer. ``responses`` holds the raw model
        replies gathered up to and including the one that ended the loop.
    """
    collected = []
    any_affirmative = False
    for img in images:
        answer = self.predict(img, prompt_keyword)
        collected.append(answer)
        if self.extract_model_response(answer):
            # Short-circuit: one "Yes" settles an OR evaluation.
            any_affirmative = True
            break
    return any_affirmative, collected
|
| 69 |
+
|
| 70 |
+
def eval_and(self, images: list[PIL.Image.Image], prompt_keyword: str):
    """Evaluate the prompt on each image until one answer is not affirmative.

    Images are processed in order with ``self.predict``; evaluation stops at
    the first response that ``self.extract_model_response`` rejects.

    Returns:
        A ``(matched, responses)`` tuple. ``matched`` is True only if every
        evaluated image produced an affirmative answer (so it is also True
        for an empty ``images`` list). ``responses`` holds the raw model
        replies gathered up to and including the one that ended the loop.
    """
    collected = []
    all_affirmative = True
    for img in images:
        answer = self.predict(img, prompt_keyword)
        collected.append(answer)
        if not self.extract_model_response(answer):
            # Short-circuit: one non-"Yes" settles an AND evaluation.
            all_affirmative = False
            break
    return all_affirmative, collected
|
| 78 |
+
|
| 79 |
def extract_model_response(self, model_response: str):
    """Return True when the model's reply contains an affirmative "Yes".

    The query prompt asks the model to answer "Yes" or "No", but replies can
    vary in case or carry punctuation ("yes.", "YES"). The match is therefore
    case-insensitive and on a whole word, so unrelated words that merely
    contain the letters (e.g. "Yesterday") are not counted.

    Args:
        model_response: Raw text returned by the model.

    Returns:
        True if the reply contains the standalone word "yes", else False.
    """
    import re  # local import: keeps this block self-contained

    return re.search(r"\byes\b", model_response, re.IGNORECASE) is not None
|
| 81 |
|
| 82 |
+
async def __call__(self, images: list[PIL.Image.Image], prompt_keyword: str, prompt_eval_mode: str):
    """Evaluate the prompt over the images and combine the answers.

    Args:
        images: Images to query, in evaluation order.
        prompt_keyword: Keyword passed through to the per-image prediction.
        prompt_eval_mode: ``"or"`` to succeed on the first affirmative answer
            (``eval_or``) or ``"and"`` to require every answer to be
            affirmative (``eval_and``).

    Returns:
        The boolean verdict produced by the selected evaluator.

    Raises:
        ValueError: If ``prompt_eval_mode`` is neither ``"or"`` nor ``"and"``.
    """
    if prompt_eval_mode not in ("or", "and"):
        raise ValueError(f"Invalid prompt eval mode: {prompt_eval_mode}")

    evaluate = self.eval_or if prompt_eval_mode == "or" else self.eval_and
    verdict, model_responses = evaluate(images, prompt_keyword)

    print(f"Model responses: {model_responses}")

    return verdict
|
| 94 |
|
models/misc_utils.py
CHANGED
|
@@ -2,7 +2,7 @@ import cv2
|
|
| 2 |
import numpy as np
|
| 3 |
import base64
|
| 4 |
from PIL import Image
|
| 5 |
-
|
| 6 |
|
| 7 |
def convert_base64_to_cv2(base64_string: str):
|
| 8 |
return cv2.imdecode(np.frombuffer(base64.b64decode(base64_string), np.uint8), cv2.IMREAD_COLOR)
|
|
@@ -13,3 +13,24 @@ def convert_cv2_to_pil(image: np.ndarray):
|
|
| 13 |
def convert_base64_to_pil(base64_string: str):
|
| 14 |
return convert_cv2_to_pil(convert_base64_to_cv2(base64_string))
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import base64
|
| 4 |
from PIL import Image
|
| 5 |
+
from payload_model import PayloadModel
|
| 6 |
|
| 7 |
def convert_base64_to_cv2(base64_string: str):
    """Decode a base64-encoded image into an OpenCV colour array.

    Args:
        base64_string: Base64 text of an encoded image file (PNG, JPEG, ...).

    Returns:
        The image decoded by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.

    Raises:
        ValueError: If the decoded bytes are not a readable image.
            ``cv2.imdecode`` reports that case by returning ``None``, which
            would otherwise surface later as an unrelated error.
    """
    encoded_bytes = np.frombuffer(base64.b64decode(base64_string), np.uint8)
    image = cv2.imdecode(encoded_bytes, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError("Input data could not be decoded as an image")
    return image
|
|
|
|
| 13 |
def convert_base64_to_pil(base64_string: str):
    """Decode a base64-encoded image and convert it for PIL-based processing.

    The string is first decoded with ``convert_base64_to_cv2`` and the result
    is handed to ``convert_cv2_to_pil``; the return value is whatever that
    helper produces (presumably a PIL image -- confirm against its
    definition).
    """
    cv2_image = convert_base64_to_cv2(base64_string)
    return convert_cv2_to_pil(cv2_image)
|
| 15 |
|
| 16 |
+
def get_images_using_bbox(payload: PayloadModel):
    """Crop the first input image to each requested bounding box.

    Only the first entry of ``payload.input_data`` and the first entry of
    ``payload.bbox`` (the boxes belonging to that image) are used.

    Args:
        payload: Request payload. ``input_data[0]`` is decoded with
            ``convert_base64_to_cv2`` and each box in ``bbox[0]`` is unpacked
            as ``x1, y1, x2, y2``.

    Returns:
        A list with one converted crop per bounding box, in request order.

    Raises:
        ValueError: If no image or no bounding boxes were supplied, the image
            cannot be decoded, or a box is empty once clamped to the image.
    """
    if not payload.input_data:
        raise ValueError("No input image provided")
    if not payload.bbox:
        raise ValueError("No bounding boxes provided")

    # Forcing that only a single image is received
    cv2_image = convert_base64_to_cv2(payload.input_data[0])
    if cv2_image is None:
        raise ValueError("Input image could not be decoded")
    height, width = cv2_image.shape[:2]

    print(f"Bbox: {payload.bbox}")

    images = []
    for idx, bbox in enumerate(payload.bbox[0]):
        x1, y1, x2, y2 = bbox
        # Clamp to the image: negative coordinates would otherwise be read as
        # "from the end" by numpy slicing and silently crop the wrong region.
        x1, x2 = max(0, x1), min(width, x2)
        y1, y2 = max(0, y1), min(height, y2)
        if x2 <= x1 or y2 <= y1:
            raise ValueError(f"Bounding box {idx} is empty or outside the image: {bbox}")
        # Crops are no longer dumped to image_<idx>.png: that debug write hit
        # the working directory on every request and concurrent requests
        # overwrote each other's files.
        images.append(convert_cv2_to_pil(cv2_image[y1:y2, x1:x2]))
    return images
|
| 30 |
+
|
| 31 |
+
def get_whole_image(payload: PayloadModel):
    """Return the first input image, uncropped, as a single-element list.

    Args:
        payload: Request payload; only ``input_data[0]`` is read and decoded
            with ``convert_base64_to_pil``.

    Returns:
        A one-item list containing the decoded image.
    """
    # Forcing that only a single image is received
    first_encoded = payload.input_data[0]
    return [convert_base64_to_pil(first_encoded)]
|
payload_model.py
CHANGED
|
@@ -2,5 +2,8 @@ from pydantic import BaseModel
|
|
| 2 |
|
| 3 |
class PayloadModel(BaseModel):
|
| 4 |
"""Type check for payload parameters"""
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
class PayloadModel(BaseModel):
    """Type check for payload parameters"""

    # Base64-encoded input images; the image helpers read only the first entry.
    input_data: list[str]
    # Prompt keyword forwarded to the model to select the query prompt.
    prompt: str
    # How per-image answers are combined: "or" or "and".
    prompt_eval_mode: str
    # "bbox" crops the image per bounding box; any other value uses the whole image.
    input_utilization_mode: str
    # Per-image lists of [x1, y1, x2, y2] boxes; only bbox[0] is read.
    bbox: list[list[list[int]]]
|