| | from transformers import AutoModelForCausalLM, AutoTokenizer |
| |
|
| | import torch |
| |
|
| | |
# Default sampling settings shared by every generation stage.
temperature = 0.7  # Controls the randomness of the generated text
top_k = 50  # Only consider the top k most likely tokens when generating text
repetition_penalty = 1.2  # Penalizes the repetition of tokens in the generated text

# Pick the device once; fall back to the CPU so that loading the models does
# not crash on machines without a usable CUDA runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Main model and the tokenizer that is used for all encoding/decoding.
phi_model_name = "microsoft/phi-1_5"
tokenizer_name = phi_model_name
phi_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name).to(device)

# Small assistant model used for the draft and refinement stages.
assistant_model_name = "roneneldan/TinyStories-33M"
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_model_name).to(device)
| |
|
| | |
def _continue_text(model, prompt, max_new_tokens, generation_kwargs, drop_attention_mask=False):
    """Encode *prompt*, let *model* continue it, and return the decoded text.

    The decoded string contains the prompt followed by the newly generated
    tokens, because the full output sequence is decoded.
    """
    # Place the inputs on whatever device the model lives on instead of
    # assuming CUDA.
    encoded = phi_tokenizer(prompt, return_tensors="pt").to(model.device)
    if drop_attention_mask:
        encoded.pop("attention_mask", None)
    output_ids = model.generate(**encoded, max_new_tokens=max_new_tokens, **generation_kwargs)
    return phi_tokenizer.decode(output_ids[0], skip_special_tokens=True)


def generate_response(user_input, assistant_model, phi_model, temperature=temperature, top_k=top_k, repetition_penalty=repetition_penalty):
    """Run the four-stage assistant -> phi -> assistant -> phi generation chain.

    Each stage feeds the text produced by the previous stage to the next
    model, which continues it.

    Args:
        user_input: Prompt text typed by the user.
        assistant_model: Causal LM used for the draft and refinement stages.
        phi_model: Causal LM used for the two "cleanup" stages.
        temperature: Sampling temperature; a falsy or non-positive value
            disables sampling and falls back to the default decoding.
        top_k: Number of highest-probability tokens considered when sampling.
        repetition_penalty: Penalty applied to already generated tokens.

    Returns:
        The decoded text of the final phi stage, or an empty string when
        *user_input* is empty or whitespace-only.
    """
    # An empty prompt encodes to zero tokens, which gives generate() nothing
    # to continue from.
    if not user_input or not user_input.strip():
        return ""

    generation_kwargs = {"repetition_penalty": repetition_penalty}
    if temperature and temperature > 0:
        # temperature/top_k only take effect when sampling is enabled.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_k=top_k)

    # Budgets are expressed as *new* tokens. The previous max_length limits
    # counted the prompt as well, so a stage receiving text longer than its
    # limit (e.g. 500-token phi output fed into the 100-token refinement
    # stage) had no room left to generate.
    # NOTE(review): the assistant model is fed ids produced by phi_tokenizer;
    # confirm both checkpoints share a compatible vocabulary.

    # Assistant generates initial story
    story_text = _continue_text(assistant_model, user_input, 25, generation_kwargs)

    # Phi cleans it up
    # The attention mask is dropped for the phi stages, as in the original
    # code -- presumably for compatibility with the phi checkpoint; TODO confirm.
    cleaned_text = _continue_text(phi_model, story_text, 500, generation_kwargs, drop_attention_mask=True)

    # Assistant refines it
    refined_text = _continue_text(assistant_model, cleaned_text, 100, generation_kwargs)

    # Final cleanup by Phi
    final_text = _continue_text(phi_model, refined_text, 500, generation_kwargs, drop_attention_mask=True)

    return final_text
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
# Interactive chat loop: read a prompt, run the generation chain, print the
# result. Typing "exit" or "quit" (any case) ends the session.
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C or a closed stdin ends the session cleanly instead
        # of surfacing a traceback.
        print()
        print("Goodbye!")
        break

    if user_input.lower() in ("exit", "quit"):
        print("Goodbye!")
        break

    # Nothing to generate from; ask again rather than calling the models
    # with an empty prompt.
    if not user_input:
        continue

    response = generate_response(user_input, assistant_model, phi_model)
    print("BartPhi-2.8:", response)