# Example: run an X-LoRA adapter model with mistral.rs and send a chat request.
# Replace the `...` placeholders with real model IDs / paths before running.
from mistralrs import ChatCompletionRequest, Runner, Which

runner = Runner(
    which=Which.XLora(
        tok_model_id=None,  # Automatically determine from ordering file
        model_id=...,  # Model ID of the base model (local path or HF model ID)
        xlora_model_id=...,  # X-LoRA Model ID of the base model (local path or HF model ID)
        order=...,  # Ordering file to ensure compatibility with PEFT
        tgt_non_granular_index=3,  # Only generate scalings for the first 3 decoding tokens, and then use the last generated one
    )
)

# Send a single chat completion request and print the reply plus token usage.
res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="mistral",
        messages=[{"role": "user", "content": "Tell me a story about 2 low rank matrices."}],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.5,
    )
)
print(res.choices[0].message.content)
print(res.usage)