# Hugging Face Space app (scrape residue removed: status "Sleeping", file size 4,969 bytes)
import gradio
import huggingface_hub
import spaces
import psaiops.common.model
import psaiops.common.style
import psaiops.common.tokenizer
import psaiops.score.human.ux as _ux
import psaiops.score.human.app as _app
# META #########################################################################
# extra keyword arguments forwarded to the model constructor
_MODEL_CFG = {}
# snapshot options: skip the non-torch weight formats to save disk and time
_LOAD_CFG = {
    'repo_type': 'model',
    'ignore_patterns': ['*.onnx', '*.tflite', '*.msgpack'],}
_app.MODELS = ['qwen/qwen3.5-9b', 'qwen/qwen3.5-27b']
# prefetch every checkpoint while on the CPU so the GPU slot never has to download
for __name in _app.MODELS:
    huggingface_hub.snapshot_download(repo_id=__name, **_LOAD_CFG)
# defer the actual model instantiation until it is first needed
_MODEL = None
# the tokenizer is cheap enough to build eagerly for the default model
_TOKENIZER = psaiops.common.tokenizer.get_tokenizer(name=_app.MODELS[0])
# CURRENT ######################################################################
def current_selection() -> dict:
    """Return a gradio update that selects the active model in the dropdown."""
    # index 0 of the list always holds the currently selected model
    return gradio.update(choices=_app.MODELS, value=_app.MODELS[0])
def switch_selection(name: str) -> None:
    """Move `name` to index 0 of `_app.MODELS`.

    Keeps the relative order of the remaining models: building the tail from a
    `set` difference makes their ordering non-deterministic, which would shuffle
    the dropdown choices between calls.
    """
    _app.MODELS = [name] + [__m for __m in _app.MODELS if __m != name]
# LAZY #########################################################################
def fetch_model() -> object:
    """Lazily instantiate the active model on the GPU and return it (or None)."""
    global _MODEL
    # only instantiate inside the GPU slot, never during a CPU phase
    if _MODEL is None:
        # the first item in the list is always the selected model
        _MODEL = psaiops.common.model.get_model(name=_app.MODELS[0], device='cuda', **_MODEL_CFG)
    # report the outcome to the UI on every call
    if hasattr(_MODEL, 'name_or_path'):
        __label = getattr(_MODEL, 'name_or_path', 'None')
        gradio.Info(title='Info', message='Successfully switched to `{}`.'.format(__label), duration=2)
    else:
        gradio.Warning(title='Warning', message='The GPU time slot expired before the model could be loaded.', duration=4)
    # model object or `None`
    return _MODEL
def fetch_tokenizer() -> object:
    """Return the cached tokenizer, rebuilding it for the active model if unset."""
    global _TOKENIZER
    # mirrors fetch_model for symmetry, even though tokenizer loading is cheap
    if _TOKENIZER is None:
        _TOKENIZER = psaiops.common.tokenizer.get_tokenizer(name=_app.MODELS[0])
    # tokenizer object or `None`
    return _TOKENIZER
# SWITCH #######################################################################
@spaces.GPU(duration=30)
def switch_model(
    model_str: str
) -> dict:
    """Swap the active model for `model_str` and return the updated dropdown state.

    Frees the memory of the previous model, resets the lazy caches, then tries
    to load the new model inside the GPU allocation. The reordered choice list
    is returned even if the load fails, so the UI stays consistent.
    """
    global _MODEL, _TOKENIZER
    # end early if the selection isn't changed
    if model_str == _app.MODELS[0]:
        return current_selection()
    # reorder the model list so that the selected model is at index 0
    switch_selection(name=model_str)
    # free the memory allocated to the previous model
    psaiops.common.model.free_memory(model=_MODEL)
    # reset the pointers so fetch_* reloads the new selection
    _MODEL = None
    _TOKENIZER = None
    # load the selected model
    try:
        fetch_model()
    except Exception:
        # narrowed from a bare `except`: never swallow SystemExit / KeyboardInterrupt,
        # and surface the failure instead of hiding it silently
        gradio.Warning(title='Warning', message='The GPU time slot expired before the model could be loaded.', duration=4)
    # return the reordered list of models even if the loading failed
    return current_selection()
# TOKENS #######################################################################
def compute_tokens(
    prompt_str: str,
    export_str: str,
) -> object:
    """Tokenize the prompt on the CPU, without triggering a model download."""
    # the tokenizer never needs the GPU wrapper; pass it in because it cannot be pickled
    return _ux.update_tokens_state(
        prompt_str=prompt_str,
        export_str=export_str,
        tokenizer_obj=fetch_tokenizer())
# INDICES ######################################################################
def compute_indices(
    prompt_str: str,
    export_str: str,
) -> object:
    """Convert the prompt to token indices on the CPU, without loading the model."""
    # the tokenizer never needs the GPU wrapper; pass it in because it cannot be pickled
    return _ux.update_indices_state(
        prompt_str=prompt_str,
        export_str=export_str,
        tokenizer_obj=fetch_tokenizer())
# LOGITS #######################################################################
@spaces.GPU(duration=30)
def compute_logits(
    indices_arr: object,
    export_str: str,
) -> object:
    """Run the model on `indices_arr` and return the logits, or None on failure.

    The model is loaded lazily inside the GPU allocation; if the allocation
    expires mid-computation, the failure is reported through the UI and None
    is returned instead of raising.
    """
    __logits = None
    # load the model inside the GPU wrapper (not before)
    __model = fetch_model()
    # the allocation might expire before the calculations are finished
    try:
        __logits = _ux.update_logits_state(
            indices_arr=indices_arr,
            export_str=export_str,
            model_obj=__model)
    except Exception:
        # narrowed from a bare `except`: do not swallow SystemExit / KeyboardInterrupt
        gradio.Warning(title='Warning', message='Calculations aborted because the GPU allocation expired.', duration=4)
    # tensor or None
    return __logits
# CREATE #######################################################################
# wire the callbacks defined above into the UI built by the app module
demo = _app.create_app(
    current=current_selection,
    switch=switch_model,
    partition=compute_tokens,
    convert=compute_indices,
    compute=compute_logits,
    models=_app.MODELS,
    export='')
# LAUNCH #######################################################################
# enable request queuing before launch; presumably required by the spaces GPU
# scheduler — TODO confirm against the spaces documentation
demo.queue()
demo.launch(theme=gradio.themes.Soft(), css=psaiops.common.style.ALL, share=False, debug=False)