| import os
|
| import random
|
| from datasets import load_dataset
|
|
|
|
|
# Paths used by the rest of the script: an optional local text file that is
# blended into the corpus, and the combined training file that gets written.
movie_script = "input.txt"
output_file = "mixed_train.txt"

# Request the "train" split of the tatsu-lab/alpaca dataset; the progress
# message is printed first because this call can take a while.
print("Downloading Alpaca Dataset from Hugging Face...")
alpaca = load_dataset(path="tatsu-lab/alpaca", split="train")
|
|
|
def format_entry(entry):
    """Render one Alpaca record as a plain-text User/Assistant exchange.

    Args:
        entry: Mapping with ``instruction`` and ``output`` values and an
            optional ``input`` value carrying extra context for the
            instruction.

    Returns:
        A string made of a ``User:`` line, an optional ``Context:`` line
        (only when the record has non-blank context) and an ``Assistant:``
        line, terminated by a blank line so entries can be concatenated.

    Raises:
        KeyError: If ``instruction`` or ``output`` is missing.
    """
    instruction = entry['instruction']
    response = entry['output']

    # ``input`` is optional: tolerate records that omit the key entirely.
    context = entry.get('input')
    # A whitespace-only string is truthy but carries no information; treat it
    # like an absent context so no empty "Context:" line is emitted.
    if isinstance(context, str) and not context.strip():
        context = None

    if context:
        return f"User: {instruction}\nContext: {context}\nAssistant: {response}\n\n"
    return f"User: {instruction}\nAssistant: {response}\n\n"
|
|
|
|
|
# Use at most 50,000 chat examples, but never more than the dataset holds:
# indexing past the end of a smaller split would raise IndexError part-way
# through the run.
num_chat_examples = min(50000, len(alpaca))

print(f"Processing {num_chat_examples:,} chat examples...")

# One formatted text block per selected record; further text blocks are
# appended to this list later in the script.
mixed_data = [format_entry(alpaca[i]) for i in range(num_chat_examples)]
|
|
|
|
|
# Blend the optional local script into the corpus. When the file is absent
# the run continues with the chat examples alone.
if not os.path.exists(movie_script):
    print(f" Warning: {movie_script} not found. Proceeding with Chat data only.")
else:
    print(f"Mixing in your {movie_script}")
    with open(movie_script, encoding='utf-8') as script_file:
        script_text = script_file.read()

    # Split on blank lines into blocks; drop blocks that are empty after
    # trimming and end each kept block with a blank line, matching the
    # chat entries already in mixed_data.
    for raw_block in script_text.split('\n\n'):
        trimmed = raw_block.strip()
        if trimmed:
            mixed_data.append(trimmed + "\n\n")
|
|
|
|
|
# Randomise the order in place so the chat examples and any script blocks
# are interleaved in the output.
print(" Shuffling data for better learning...")
random.shuffle(mixed_data)

# Every element already ends with its own blank-line separator, so the
# pieces are written back-to-back with nothing inserted between them.
with open(output_file, mode='w', encoding='utf-8') as sink:
    sink.writelines(mixed_data)

print(f"SUCCESS! Created '{output_file}'.")
|
|
|