from mistralrs import ChatCompletionRequest, Runner, Which


# Describe which X-LoRA model to load. The `...` values are placeholders left
# to be filled in before this example is run.
xlora_selection = Which.XLora(
    # Model ID of the base model (local path or HF model ID).
    model_id=...,
    # Model ID of the X-LoRA model (local path or HF model ID).
    xlora_model_id=...,
    # Ordering file to ensure compatibility with PEFT.
    order=...,
    # None: automatically determine from the ordering file.
    tok_model_id=None,
    # Only generate scalings for the first 3 decoding tokens, and then use
    # the last generated one.
    tgt_non_granular_index=3,
)

# The runner is the object that chat completion requests are sent through.
runner = Runner(which=xlora_selection)

# Build a single-message chat request with explicit sampling settings.
user_message = {
    "role": "user",
    "content": "Tell me a story about 2 low rank matrices.",
}
chat_request = ChatCompletionRequest(
    model="mistral",
    messages=[user_message],
    max_tokens=256,
    presence_penalty=1.0,
    top_p=0.1,
    temperature=0.5,
)

# Send the request through the runner and keep the response.
res = runner.send_chat_completion_request(chat_request)

# Print the text of the first returned choice, then the response's usage field.
first_choice = res.choices[0]
print(first_choice.message.content)
print(res.usage)