Instructions to use dizza01/medalpaca-13b with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use dizza01/medalpaca-13b with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="dizza01/medalpaca-13b")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("dizza01/medalpaca-13b")
model = AutoModelForCausalLM.from_pretrained("dizza01/medalpaca-13b")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use dizza01/medalpaca-13b with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "dizza01/medalpaca-13b"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "dizza01/medalpaca-13b", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker
docker model run hf.co/dizza01/medalpaca-13b
- SGLang
How to use dizza01/medalpaca-13b with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "dizza01/medalpaca-13b" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "dizza01/medalpaca-13b", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "dizza01/medalpaca-13b" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "dizza01/medalpaca-13b", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
- Docker Model Runner
How to use dizza01/medalpaca-13b with Docker Model Runner:
docker model run hf.co/dizza01/medalpaca-13b
| import os | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
class EndpointHandler:
    """Custom inference handler that serves a causal LM for text generation.

    The handler is constructed once with the model directory and is then
    called with a request payload of the form
    ``{"inputs": <str | list of chat messages>, "parameters": {...}}``.
    It returns ``{"generated_text": <str>}`` containing only the newly
    generated continuation (the prompt is stripped).
    """

    def __init__(self, path: str = ""):
        """Load the tokenizer and the fp16 model from *path*.

        Args:
            path: Directory holding the checkpoint. Falls back to
                ``/repository`` when empty.
        """
        model_dir = path or "/repository"
        # NOTE(security): trust_remote_code=True runs Python code shipped
        # with the checkpoint; only point this at repositories you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir,
            trust_remote_code=True,
        )
        # generate() needs a pad token id; reuse EOS when none is defined.
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
        )
        self.model.eval()

    def _messages_to_prompt(self, inputs):
        """Render a list of ``{"role", "content"}`` messages as one prompt.

        Uses the tokenizer's chat template when one is configured; otherwise
        falls back to a plain ``[ROLE]\\ncontent`` layout terminated by an
        open ``[ASSISTANT]`` turn.
        """
        # Use the chat template only if both the method and a non-empty
        # template exist.
        template = getattr(self.tokenizer, "chat_template", None)
        if template and hasattr(self.tokenizer, "apply_chat_template"):
            return self.tokenizer.apply_chat_template(
                inputs,
                tokenize=False,
                add_generation_prompt=True,
            )
        # Fallback for plain causal LMs with no chat_template (e.g. MedAlpaca).
        parts = []
        for msg in inputs:
            role = (msg.get("role") or "user").upper()
            content = msg.get("content")
            # A null content must not be rendered as the literal text "None".
            if content is None:
                content = ""
            parts.append(f"[{role}]\n{content}")
        parts.append("[ASSISTANT]\n")
        return "\n\n".join(parts)

    @staticmethod
    def _as_bool(value):
        """Interpret JSON-ish flags; the strings "false"/"0"/"no" are False."""
        if isinstance(value, str):
            return value.strip().lower() in ("1", "true", "yes", "on")
        return bool(value)

    @staticmethod
    def _generation_kwargs(params):
        """Translate request parameters into keyword arguments for generate().

        Missing or null values fall back to the defaults (128 new tokens,
        temperature 0.0, top_p 1.0, repetition_penalty 1.0,
        no_repeat_ngram_size 0). Sampling is enabled by default only when
        temperature > 0, and a non-positive temperature always forces greedy
        decoding. ``temperature`` and ``top_p`` are forwarded only when
        sampling, so greedy requests do not carry sampling-only arguments.
        """

        def read(key, default, cast):
            value = params.get(key)
            return default if value is None else cast(value)

        temperature = read("temperature", 0.0, float)
        do_sample = read("do_sample", temperature > 0, EndpointHandler._as_bool)
        if temperature <= 0:
            # Sampling needs a strictly positive temperature; treat 0 as greedy.
            do_sample = False

        kwargs = {
            "max_new_tokens": read("max_new_tokens", 128, int),
            "do_sample": do_sample,
            "repetition_penalty": read("repetition_penalty", 1.0, float),
            "no_repeat_ngram_size": read("no_repeat_ngram_size", 0, int),
        }
        if do_sample:
            kwargs["temperature"] = temperature
            kwargs["top_p"] = read("top_p", 1.0, float)
        return kwargs

    def __call__(self, data):
        """Generate a continuation for ``data["inputs"]``.

        Args:
            data: Request payload. ``inputs`` is either a prompt string or a
                list of chat messages; ``parameters`` optionally overrides
                the generation settings.

        Returns:
            ``{"generated_text": text}`` with the decoded new tokens only.
        """
        inputs = data.get("inputs", "")
        params = data.get("parameters", {}) or {}

        if isinstance(inputs, list):
            prompt = self._messages_to_prompt(inputs)
        else:
            prompt = str(inputs)

        enc = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            out = self.model.generate(
                **enc,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                **self._generation_kwargs(params),
            )
        # Drop the prompt tokens so only the continuation is returned.
        prompt_len = enc["input_ids"].shape[-1]
        generated_ids = out[0][prompt_len:]
        text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        return {"generated_text": text}