import argparse
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Run the local phi2-merged model.")
    parser.add_argument(
        "prompt",
        nargs="?",
        # Default prompt is Turkish: "Give a short greeting:"
        default="Kısa bir selam ver:",
        help="Prompt to send to the model.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=120,
        help="Maximum number of new tokens to generate.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Sampling temperature. Default 0 uses deterministic greedy decoding.",
    )
    parser.add_argument(
        "--top-p",
        type=float,
        default=0.9,
        help="Nucleus sampling threshold. Only used when --temperature > 0.",
    )
    return parser


def main() -> None:
    args = build_parser().parse_args()
    # The merged model files are expected to live next to this script.
    model_path = Path(__file__).resolve().parent
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Half precision on GPU; CPU inference generally needs full float32.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=dtype,
        device_map="auto" if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True,
    )
    inputs = tokenizer(args.prompt, return_tensors="pt")
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    generate_kwargs = {
        "max_new_tokens": args.max_new_tokens,
        # The tokenizer defines no pad token, so reuse EOS for padding.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if args.temperature > 0:
        generate_kwargs["do_sample"] = True
        generate_kwargs["temperature"] = args.temperature
        generate_kwargs["top_p"] = args.top_p
    else:
        generate_kwargs["do_sample"] = False
    with torch.no_grad():
        output = model.generate(**inputs, **generate_kwargs)
    print(tokenizer.decode(output[0], skip_special_tokens=True))


if __name__ == "__main__":
    main()
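# Example invocations (the script name below is illustrative; substitute the
# actual file name):
#   python run_model.py                       # greedy decoding, default prompt
#   python run_model.py "Bir hikaye anlat." --max-new-tokens 256 \
#       --temperature 0.7 --top-p 0.95        # nucleus sampling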