"""Run the local phi2-merged model from the command line."""

import argparse
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Run the local phi2-merged model.")
    parser.add_argument(
        "prompt",
        nargs="?",
        default="Kısa bir selam ver:",  # Turkish: "Give a short greeting:"
        help="Prompt to send to the model.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=120,
        help="Maximum number of new tokens to generate.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Sampling temperature. Default 0 uses deterministic greedy decoding.",
    )
    parser.add_argument(
        "--top-p",
        type=float,
        default=0.9,
        help="Nucleus sampling threshold.",
    )
    return parser


def main() -> None:
    args = build_parser().parse_args()

    # The merged model weights are expected to live next to this script.
    model_path = Path(__file__).resolve().parent
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Half precision on GPU to save memory; full precision on CPU, where
    # float16 matmuls are poorly supported.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=dtype,  # renamed to `dtype` in newer transformers releases
        device_map="auto" if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True,
    )

    # Move input tensors to whichever device the (possibly sharded) model is on.
    inputs = tokenizer(args.prompt, return_tensors="pt")
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    generate_kwargs = {
        "max_new_tokens": args.max_new_tokens,
        # Phi-2 defines no pad token; reuse EOS to silence the generate() warning.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if args.temperature > 0:
        generate_kwargs["do_sample"] = True
        generate_kwargs["temperature"] = args.temperature
        generate_kwargs["top_p"] = args.top_p
    else:
        # temperature == 0 is not a valid sampling temperature; fall back to
        # deterministic greedy decoding instead.
        generate_kwargs["do_sample"] = False

    with torch.no_grad():
        output = model.generate(**inputs, **generate_kwargs)
    print(tokenizer.decode(output[0], skip_special_tokens=True))


if __name__ == "__main__":
    main()
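
# Example invocations, assuming this script is saved as run_model.py inside the
# merged-model directory (the filename is illustrative, not prescribed):
#
#   python run_model.py
#       -> greedy decoding with the default Turkish prompt
#   python run_model.py "Bana kısa bir hikaye anlat." --max-new-tokens 200 --temperature 0.7
#       -> nucleus sampling (top-p 0.9) at temperature 0.7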