import requests

HOST = '0.0.0.0:5000'


# Generate a completion for `prompt`, returning the generated text
# (or None if the request fails).
def generate(prompt, tokens=200):
    request = {'prompt': prompt, 'max_new_tokens': tokens}
    response = requests.post(f'http://{HOST}/api/v1/generate', json=request)

    if response.status_code == 200:
        return response.json()['results'][0]['text']

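# Usage sketch (hypothetical prompt; assumes the server is running with a
# model already loaded and its API enabled):
#
#   print(generate("Once upon a time, ", tokens=20))
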

# Send a request to the model-management endpoint and return the parsed JSON.
def model_api(request):
    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
    return response.json()

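# The model endpoint dispatches on the 'action' key. These are the actions
# this script actually uses ('my-model' is a hypothetical name):
#
#   model_api({'action': 'list'})                             # available models
#   model_api({'action': 'info'})                             # current model
#   model_api({'action': 'load', 'model_name': 'my-model'})   # load a model
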

# Print some common settings from a model-endpoint response.
def print_basic_model_info(response):
    basic_settings = ['truncation_length', 'instruction_template']
    print("Model: ", response['result']['model_name'])
    print("Lora(s): ", response['result']['lora_names'])
    for setting in basic_settings:
        print(setting, "=", response['result']['shared.settings'][setting])


# Print info about the currently loaded model.
def model_info():
    response = model_api({'action': 'info'})
    print_basic_model_info(response)


# Simple loader: load a model by name with the server's current settings.
def model_load(model_name):
    return model_api({'action': 'load', 'model_name': model_name})

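# For example (hypothetical model name; it must match a name returned by the
# 'list' action above):
#
#   resp = model_load('llama-7b')
#   print_basic_model_info(resp)
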

# Complex loader: pick loader arguments based on hints in the model name.
def complex_model_load(model):

    # Guess the GPTQ group size from the model name; -1 means "no group size".
    def guess_groupsize(model_name):
        if '1024g' in model_name:
            return 1024
        elif '128g' in model_name:
            return 128
        elif '32g' in model_name:
            return 32
        else:
            return -1

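    # For example, guess_groupsize('wizard-30b-4bit-128g') returns 128, and a
    # name with no group-size suffix falls through to -1 (hypothetical name).
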
    req = {
        'action': 'load',
        'model_name': model,
        'args': {
            'loader': 'AutoGPTQ',

            'bf16': False,
            'load_in_8bit': False,
            'groupsize': 0,
            'wbits': 0,

            # llama.cpp / GGML options
            'threads': 0,
            'n_batch': 512,
            'no_mmap': False,
            'mlock': False,
            'cache_capacity': None,
            'n_gpu_layers': 0,
            'n_ctx': 2048,

            # RWKV options
            'rwkv_strategy': None,
            'rwkv_cuda_on': False,
        },
    }

    model = model.lower()

    # Quantized GPTQ-style models: set bit width and group size from the name.
    if '4bit' in model or 'gptq' in model or 'int4' in model:
        req['args']['wbits'] = 4
        req['args']['groupsize'] = guess_groupsize(model)
    elif '3bit' in model:
        req['args']['wbits'] = 3
        req['args']['groupsize'] = guess_groupsize(model)
    else:
        req['args']['gptq_for_llama'] = False

    if '8bit' in model:
        req['args']['load_in_8bit'] = True
    elif '-hf' in model or 'fp16' in model:
        if '7b' in model:
            req['args']['bf16'] = True
        elif '13b' in model:
            req['args']['load_in_8bit'] = True
    elif 'ggml' in model:
        # llama.cpp models: offload as many layers to the GPU as will fit.
        if '7b' in model:
            req['args']['n_gpu_layers'] = 100
        elif '13b' in model:
            req['args']['n_gpu_layers'] = 100
        elif '30b' in model or '33b' in model:
            req['args']['n_gpu_layers'] = 59
        elif '65b' in model:
            req['args']['n_gpu_layers'] = 42
    elif 'rwkv' in model:
        req['args']['rwkv_cuda_on'] = True
        if '14b' in model:
            req['args']['rwkv_strategy'] = 'cuda f16i8'
        else:
            req['args']['rwkv_strategy'] = 'cuda f16'

    return model_api(req)

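# A worked example of the heuristic (hypothetical name): for
# 'llama-13b-4bit-128g', the '4bit' hint sets wbits=4, the '128g' suffix sets
# groupsize=128, and the AutoGPTQ loader from `req` is kept, so the request
# sent to the server is effectively:
#
#   {'action': 'load', 'model_name': 'llama-13b-4bit-128g',
#    'args': {'loader': 'AutoGPTQ', 'wbits': 4, 'groupsize': 128, ...}}
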

if __name__ == '__main__':
    for model in model_api({'action': 'list'})['result']:
        try:
            resp = complex_model_load(model)

            if 'error' in resp:
                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
                continue
            else:
                print_basic_model_info(resp)

            # Smoke test: the prompt is the start of the Fibonacci sequence,
            # so a working model should continue with 21.
            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

            if '21' in ans:
                print(f"✅ {model} PASS ({ans})")
            else:
                print(f"❌ {model} FAIL ({ans})")

        except Exception as e:
            print(f"❌ {model} FAIL Exception: {repr(e)}")


# Sample output:
""" $ ./model-api-example.py
Model:  4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
Lora(s):  []
truncation_length = 2048
instruction_template = Alpaca
✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
Model:  4bit_WizardLM-13B-Uncensored-4bit-128g
Lora(s):  []
truncation_length = 2048
instruction_template = WizardLM
✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
Model:  Aeala_VicUnlocked-alpaca-30b-4bit
Lora(s):  []
truncation_length = 2048
instruction_template = Alpaca
✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
Model:  alpaca-30b-4bit
Lora(s):  []
truncation_length = 2048
instruction_template = Alpaca
✅ alpaca-30b-4bit PASS (21)
"""