| |
| |
| |
| |
| |
|
|
| """ |
| A model worker executes the model. |
| """ |
| import argparse |
| import asyncio |
| import base64 |
| import json |
| import os |
| import threading |
| import time |
| import uuid |
| from functools import partial |
| from io import BytesIO |
| from threading import Thread |
|
|
| import requests |
| import torch |
| import torchvision.transforms as T |
| import uvicorn |
| from constants import IMAGENET_MEAN, IMAGENET_STD, WORKER_HEART_BEAT_INTERVAL |
| from fastapi import BackgroundTasks, FastAPI, Request |
| from fastapi.responses import StreamingResponse |
| from PIL import Image |
| from torchvision.transforms.functional import InterpolationMode |
| from transformers import (AutoModelForCausalLM, AutoTokenizer, |
| TextIteratorStreamer) |
| from utils import build_logger, pretty_print_semaphore, server_error_msg |
|
|
# Module-level state shared by the worker class and the HTTP endpoints.
# Request counter, incremented once per /worker_generate_stream call.
global_counter = 0
# Concurrency limiter; stays None until the first generation request creates it.
model_semaphore = None
# Short random identifier (first six hex digits of a UUID4) used to name the log file.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger('model_worker', f'model_worker_{worker_id}.log')
|
|
|
|
def load_image_from_base64(image):
    """Decode a base64-encoded image payload and open it as a PIL image.

    Args:
        image: Base64 text (or bytes) holding an encoded image file.

    Returns:
        The object returned by ``Image.open`` on the decoded bytes.
    """
    raw_bytes = base64.b64decode(image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)
|
|
|
|
def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The pipeline converts non-RGB images to RGB, resizes to a square of
    ``input_size`` pixels with bicubic interpolation, converts to a tensor and
    normalises with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Edge length, in pixels, of the square output.

    Returns:
        A ``torchvision.transforms.Compose`` instance.
    """
    def _ensure_rgb(img):
        # Leave RGB images untouched; convert every other mode.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
|
|
|
|
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid whose aspect ratio is nearest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of ``(columns, rows)`` candidates, scanned in order.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Edge length of one square tile in pixels.

    Returns:
        The winning ``(columns, rows)`` candidate, or ``(1, 1)`` when
        ``target_ratios`` is empty.
    """
    source_area = width * height
    winner = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap, winner = gap, candidate
        elif gap == smallest_gap and source_area > 0.5 * image_size * image_size * cols * rows:
            # Exact tie: a later candidate replaces the current winner only when
            # the source image covers more than half of that candidate's area.
            winner = candidate
    return winner
|
|
|
|
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    """Resize an image onto a tile grid and cut it into square tiles.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image in
            this file's callers).
        min_num: Minimum number of tiles a candidate grid may contain.
        max_num: Maximum number of tiles a candidate grid may contain.
        image_size: Edge length of each square tile in pixels.
        use_thumbnail: When true and more than one tile was produced, append a
            whole-image thumbnail of ``image_size`` x ``image_size``.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Enumerate every (columns, rows) grid whose tile count lies in
    # [min_num, max_num], then order the candidates by tile count.
    candidates = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num)
    candidates = sorted(candidates, key=lambda grid: grid[0] * grid[1])

    # Choose the grid closest to the source aspect ratio.
    grid = find_closest_aspect_ratio(src_ratio, candidates, src_w, src_h, image_size)

    # Canvas size and tile count implied by the chosen grid.
    canvas_w = image_size * grid[0]
    canvas_h = image_size * grid[1]
    tile_count = grid[0] * grid[1]

    # Resize onto the canvas, then crop tile by tile.
    canvas = image.resize((canvas_w, canvas_h))
    tiles = []
    for index in range(tile_count):
        row, col = divmod(index, canvas_w // image_size)
        left, top = col * image_size, row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
|
|
|
|
def heart_beat_worker(controller):
    """Send heart beats forever, one every ``WORKER_HEART_BEAT_INTERVAL`` seconds.

    Args:
        controller: Object exposing ``send_heart_beat()``; in this file it is the
            ``ModelWorker`` instance itself.

    This function never returns; it is meant to be the target of a background
    thread.
    """
    while True:
        # Sleep first, so the first beat goes out one interval after start-up.
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
|
|
|
|
class ModelWorker:
    """Load one chat model, keep it registered with a controller and stream replies.

    On construction the worker loads the tokenizer and model, registers itself
    with the controller and starts a background heart-beat thread.
    """

    def __init__(self, controller_addr, worker_addr, worker_id, model_path, model_name,
                 load_8bit, device, context_len=8192):
        """Load the model and announce this worker to the controller.

        Args:
            controller_addr: Base URL of the controller.
            worker_addr: URL under which this worker is reachable; sent to the
                controller as ``worker_name``.
            worker_id: Identifier used in log messages.
            model_path: Path or hub id passed to ``from_pretrained``. One trailing
                ``/`` is stripped.
            model_name: Name reported to the controller; derived from
                ``model_path`` when ``None``.
            load_8bit: Forwarded as ``load_in_8bit`` to ``from_pretrained``.
            device: ``'auto'`` loads with ``device_map='auto'``; any other value
                loads normally and (unless ``load_8bit``) moves the model to CUDA.
            context_len: Passed to generation as ``max_length``.
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id
        if model_path.endswith('/'):
            model_path = model_path[:-1]
        if model_name is None:
            self.model_name = self._derive_model_name(model_path)
        else:
            self.model_name = model_name

        logger.info(f'Loading the model {self.model_name} on worker {worker_id} ...')

        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        if device == 'auto':
            # NOTE(review): presumably required for multi-GPU sharded loading — confirm.
            os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                load_in_8bit=load_8bit,
                torch_dtype=torch.float16,
                device_map='auto',
                trust_remote_code=True).eval()
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                load_in_8bit=load_8bit,
                torch_dtype=torch.float16,
                trust_remote_code=True).eval()
        if not load_8bit and device != 'auto':
            self.model = self.model.cuda()
        self.image_size = self.model.config.force_image_size
        self.context_len = context_len
        self.register_to_controller()
        # Daemon thread: the heart-beat loop never returns, so a non-daemon
        # thread would keep the process alive after the HTTP server stops.
        self.heart_beat_thread = threading.Thread(
            target=heart_beat_worker, args=(self,), daemon=True)
        self.heart_beat_thread.start()

    @staticmethod
    def _derive_model_name(model_path):
        """Derive a display name from a model path.

        A final component starting with ``checkpoint-`` is prefixed with its
        parent directory name (``parent_checkpoint-N``). When the path has no
        parent component the checkpoint name is returned alone instead of
        raising ``IndexError``.
        """
        parts = model_path.split('/')
        if len(parts) > 1 and parts[-1].startswith('checkpoint-'):
            return parts[-2] + '_' + parts[-1]
        return parts[-1]

    @staticmethod
    def _flatten_tile_limits(per_message_limits):
        """Flatten per-message tile limits into one limit per image.

        Images of every message except the last one in the list are limited to
        a single tile; the last message keeps its own limits.
        """
        flattened = []
        for sublist in per_message_limits[:-1]:
            flattened.extend([1] * len(sublist))
        if per_message_limits:
            flattened.extend(per_message_limits[-1])
        return flattened

    def register_to_controller(self):
        """POST this worker's address and status to the controller.

        Raises:
            AssertionError: If the controller does not answer with HTTP 200.
        """
        logger.info('Register to controller')

        url = self.controller_addr + '/register_worker'
        data = {
            'worker_name': self.worker_addr,
            'check_heart_beat': True,
            'worker_status': self.get_status()
        }
        r = requests.post(url, json=data)
        assert r.status_code == 200

    def send_heart_beat(self):
        """Report liveness and queue length; re-register if the controller forgot us.

        Retries every 5 seconds until the controller returns a usable answer.
        """
        logger.info(f'Send heart beat. Models: {[self.model_name]}. '
                    f'Semaphore: {pretty_print_semaphore(model_semaphore)}. '
                    f'global_counter: {global_counter}')

        url = self.controller_addr + '/receive_heart_beat'

        while True:
            try:
                ret = requests.post(url, json={
                    'worker_name': self.worker_addr,
                    'queue_length': self.get_queue_length()}, timeout=5)
                exist = ret.json()['exist']
                break
            except (requests.exceptions.RequestException, KeyError, ValueError) as e:
                # KeyError / ValueError cover a reply that is not the expected
                # JSON object; retrying keeps the heart-beat thread alive.
                logger.error(f'heart beat error: {e}')
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of requests running or waiting for a model slot.

        Computed as the configured concurrency limit minus the semaphore's free
        slots, plus the number of waiters. Returns 0 before the semaphore exists.
        """
        if model_semaphore is None:
            return 0
        waiters = model_semaphore._waiters
        waiting = len(waiters) if waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + waiting

    def get_status(self):
        """Return the status dict sent to the controller."""
        return {
            'model_names': [self.model_name],
            'speed': 1,
            'queue_length': self.get_queue_length(),
        }

    def _build_pixel_values(self, pil_images, tile_limits):
        """Tile and transform the request images into one float16 batch.

        Returns ``None`` when there are no images. ``tile_limits`` holds one
        maximum tile count per entry in ``pil_images``.
        """
        if len(pil_images) == 0:
            return None
        transform = build_transform(input_size=self.image_size)
        image_tiles = []
        for current_max_input_tiles, pil_image in zip(tile_limits, pil_images):
            if self.model.config.dynamic_image_size:
                tiles = dynamic_preprocess(
                    pil_image, image_size=self.image_size, max_num=current_max_input_tiles,
                    use_thumbnail=self.model.config.use_thumbnail)
            else:
                tiles = [pil_image]
            image_tiles += tiles
        pixel_values = [transform(item) for item in image_tiles]
        pixel_values = torch.stack(pixel_values).to(self.model.device, dtype=torch.float16)
        logger.info(f'Split images to {pixel_values.shape}')
        return pixel_values

    @torch.inference_mode()
    def generate_stream(self, params):
        """Run one chat generation and yield the growing reply.

        Args:
            params: Request dict. Keys read: ``prompt`` (list of messages; the
                first one's ``content`` is the system message, the rest carry
                ``role``, ``content`` and optionally ``image``, a list of
                base64 images), ``max_input_tiles``, ``temperature``, ``top_p``,
                ``max_new_tokens`` and ``repetition_penalty``.

        Yields:
            ``bytes``: JSON ``{'text': <text so far>, 'error_code': 0}`` followed
            by a NUL byte, once per chunk produced by the streamer.

        Raises:
            Exception: Failures are re-raised after the CUDA cache is emptied,
                so the caller can report them. The model's previous system
                message is restored in every case.
        """
        system_message = params['prompt'][0]['content']
        send_messages = params['prompt'][1:]
        max_input_tiles = params['max_input_tiles']
        temperature = params['temperature']
        top_p = params['top_p']
        max_new_tokens = params['max_new_tokens']
        repetition_penalty = params['repetition_penalty']
        do_sample = temperature > 0.0

        global_image_cnt = 1
        history, pil_images, max_input_tile_list = [], [], []
        for message in send_messages:
            if message['role'] == 'user':
                prefix = ''
                if 'image' in message:
                    max_input_tile_temp = []
                    for image_str in message['image']:
                        pil_images.append(load_image_from_base64(image_str))
                        prefix += f'Image-{global_image_cnt}: <image>\n\n'
                        global_image_cnt += 1
                        # Split the tile budget evenly across this message's images.
                        max_input_tile_temp.append(max(1, max_input_tiles // len(message['image'])))
                    if len(max_input_tile_temp) > 0:
                        max_input_tile_list.append(max_input_tile_temp)
                content = prefix + message['content']
                history.append([content, ])
            else:
                # Non-user message: attach it as the reply to the latest user turn.
                history[-1].append(message['content'])
        # The last turn is the question to answer; earlier turns are history.
        question, history = history[-1][0], history[:-1]

        max_input_tile_list = self._flatten_tile_limits(max_input_tile_list)
        assert len(max_input_tile_list) == len(pil_images), 'The number of max_input_tile_list and pil_images should be the same.'
        logger.info(f'max_input_tile_list: {max_input_tile_list}')

        old_system_message = self.model.system_message
        self.model.system_message = system_message
        try:
            pixel_values = self._build_pixel_values(pil_images, max_input_tile_list)

            streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=False, timeout=10)
            generation_config = dict(
                num_beams=1,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                max_length=self.context_len,
                top_p=top_p,
                streamer=streamer,
            )
            logger.info(history)
            logger.info(f'Generation config: {generation_config}')

            # Generation runs in a separate thread; this generator consumes the
            # streamer and forwards the cumulative text.
            thread = Thread(target=self.model.chat, kwargs=dict(
                tokenizer=self.tokenizer,
                pixel_values=pixel_values,
                question=question,
                history=history,
                return_history=False,
                generation_config=generation_config,
            ))
            thread.start()

            generated_text = ''
            for new_text in streamer:
                generated_text += new_text
                yield json.dumps({'text': generated_text.replace(self.model.conv_template.sep, ''),
                                  'error_code': 0}).encode() + b'\0'
        except Exception as e:
            # Free cached GPU memory, then let the caller turn the failure into
            # an error chunk instead of silently ending the stream.
            logger.error(f'generation error: {e}')
            torch.cuda.empty_cache()
            raise
        finally:
            # Restore the system message on success, failure and early close.
            self.model.system_message = old_system_message

    @staticmethod
    def _error_chunk():
        """Return the encoded error payload sent to clients when generation fails."""
        ret = {
            'text': server_error_msg,
            'error_code': 1,
        }
        return json.dumps(ret).encode() + b'\0'

    def generate_stream_gate(self, params):
        """Stream ``generate_stream`` output, replacing failures with an error chunk.

        Yields the same byte chunks as ``generate_stream``; if it raises, one
        final chunk with ``error_code`` 1 and the generic server error text is
        yielded instead of propagating the exception.
        """
        try:
            yield from self.generate_stream(params)
        except ValueError as e:
            print('Caught ValueError:', e)
            yield self._error_chunk()
        except torch.cuda.CudaError as e:
            print('Caught torch.cuda.CudaError:', e)
            yield self._error_chunk()
        except Exception as e:
            print('Caught Unknown Error', e)
            yield self._error_chunk()
|
|
|
|
# FastAPI application object; the worker's HTTP endpoints are registered on it.
app = FastAPI()
|
|
|
|
def release_model_semaphore(fn=None):
    """Free one model slot, then run an optional follow-up callback.

    Args:
        fn: Zero-argument callable invoked after the release, or ``None`` to
            skip the callback.
    """
    model_semaphore.release()
    if fn is None:
        return
    fn()
|
|
|
|
@app.post('/worker_generate_stream')
async def generate_stream(request: Request):
    """Handle a generation request: take a model slot and stream the reply.

    The request body is parsed as JSON and handed to
    ``worker.generate_stream_gate``. The slot is released by a background task
    that runs after the response, followed by a heart beat.

    Returns:
        A ``StreamingResponse`` over the worker's generator.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    # The semaphore is created on the first request rather than at import time.
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    try:
        worker.send_heart_beat()
        generator = worker.generate_stream_gate(params)
        background_tasks = BackgroundTasks()
        background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat))
    except BaseException:
        # Nothing will run the background release task if we fail here, so
        # give the slot back before propagating; otherwise it leaks for good.
        model_semaphore.release()
        raise
    return StreamingResponse(generator, background=background_tasks)
|
|
|
|
@app.post('/worker_get_status')
async def get_status(request: Request):
    """Return the worker's current status dict; the request body is not read."""
    status = worker.get_status()
    return status
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse options, build the worker, serve HTTP.
    # `args` and `worker` are module globals read by the endpoints and by
    # ModelWorker.get_queue_length.
    cli_options = (
        ('--host', dict(type=str, default='0.0.0.0')),
        ('--port', dict(type=int, default=21002)),
        ('--worker-address', dict(type=str, default='http://localhost:21002')),
        ('--controller-address', dict(type=str, default='http://localhost:21001')),
        ('--model-path', dict(type=str, default='facebook/opt-350m')),
        ('--model-name', dict(type=str)),
        ('--device', dict(type=str, default='cuda')),
        ('--limit-model-concurrency', dict(type=int, default=5)),
        ('--stream-interval', dict(type=int, default=1)),
        ('--load-8bit', dict(action='store_true')),
    )
    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    logger.info(f'args: {args}')

    worker = ModelWorker(
        controller_addr=args.controller_address,
        worker_addr=args.worker_address,
        worker_id=worker_id,
        model_path=args.model_path,
        model_name=args.model_name,
        load_8bit=args.load_8bit,
        device=args.device,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level='info')
|
|