| import torch |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
| |
# Hugging Face Hub repository id of the checkpoint used by this script.
model_id = "dill-dev/Momo-336M-sft"
print("🐸 Loading Momo from Hugging Face...")

# NOTE: trust_remote_code=True allows transformers to import and execute
# Python code shipped inside the model repository. Only keep it enabled for
# repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Load the weights as float32 and place the whole model on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    dtype=torch.float32,
    device_map="cpu",
)
# Evaluation mode: this script only generates text, it never trains.
model.eval()
print("✅ Momo is ready! Type 'exit' or 'quit' to stop.\n")
|
|
|
|
| |
def format_prompt(instruction):
    """Wrap *instruction* in the instruction/response prompt template.

    The result is a fixed preamble, an ``### Instruction:`` section holding
    the given text, and an empty ``### Response:`` section, each separated
    by a blank line.
    """
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )
    sections = [
        preamble,
        f"### Instruction:\n{instruction}",
        "### Response:\n",
    ]
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)
|
|
|
|
| |
| |
| |
def chat(user_input, max_new_tokens=200, temperature=0.75,
         top_k=50, top_p=0.92, rep_penalty=1.1):
    """Generate a reply to a single user message.

    The message is wrapped with ``format_prompt``, tokenized with the
    module-level ``tokenizer`` and passed to the module-level ``model``'s
    ``generate`` method. Only the positions after the prompt length are
    decoded.

    Args:
        user_input: Text placed in the instruction slot of the prompt.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        rep_penalty: Forwarded to ``model.generate`` under the same name.

    Returns:
        The decoded text with special tokens skipped and surrounding
        whitespace stripped. If the text contains ``"### Response:"``, only
        the part after its last occurrence is returned.
    """
    prompt = format_prompt(user_input)
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

    # Compare against None rather than using `or`: a pad token id of 0 is
    # valid but falsy, and `or` would silently replace it with the EOS id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            # NOTE(review): the stock transformers `generate` calls this
            # option `repetition_penalty` (and only samples when
            # `do_sample=True`). `rep_penalty` presumably matches a custom
            # `generate` shipped with the remote model code -- confirm.
            rep_penalty=rep_penalty,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
        )

    # Drop the first `prompt length` positions of the first returned sequence
    # so that only the continuation is decoded.
    prompt_length = input_ids.shape[1]
    new_tokens = output_ids[0][prompt_length:]
    full_output = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # split() returns the whole string as the only element when the marker is
    # absent, so taking the last element covers both cases.
    return full_output.split("### Response:")[-1].strip()
|
|
|
|
| |
# Interactive loop: read a line, answer it with chat(), repeat until the user
# types 'exit' or 'quit' or closes the input stream.
while True:
    try:
        user_input = input("🧑 You: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt: leave cleanly instead of crashing
        # with a traceback. The leading newline moves off the prompt line.
        print("\n🐸 Momo: Bye bye! See you soon.")
        break

    command = user_input.strip()

    # Ignore empty or whitespace-only lines.
    if not command:
        continue

    # Compare the stripped text so that e.g. "exit " still ends the session.
    if command.lower() in ("exit", "quit"):
        print("🐸 Momo: Bye bye! See you soon.")
        break

    # The indicator ends with '\r' (no newline), so flush explicitly;
    # otherwise line-buffered stdout would not show it before chat() blocks.
    print("🐸 Momo is thinking...", end="\r", flush=True)
    response = chat(user_input)
    # Overwrite the indicator with spaces and return to the line start.
    print(" " * 35, end="\r")
    print(f"🐸 Momo: {response}\n")