Spaces:
Running
Running
| """ | |
| Data Preparation Script for TinyLlama Training | |
| This script helps prepare data in the right format for TinyLlama training. | |
| """ | |
| import os | |
| import json | |
| import argparse | |
| import pandas as pd | |
| from tqdm import tqdm | |
| import logging | |
# Configure the root logger once so INFO-level progress messages are visible,
# and create a module-scoped logger for namespaced output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def parse_args(argv=None):
    """Parse command-line options for the data-preparation script.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case argparse falls back to ``sys.argv[1:]`` — so existing
            callers are unaffected; passing an explicit list makes the
            function usable programmatically and testable.

    Returns:
        argparse.Namespace with attributes ``input_files``, ``output_file``,
        ``format``, ``chunk_size`` and ``overlap``.
    """
    parser = argparse.ArgumentParser(description="Prepare data for TinyLlama training")
    parser.add_argument(
        "--input_files",
        nargs="+",
        required=True,
        help="Path to input text files (accepts multiple files)"
    )
    parser.add_argument(
        "--output_file",
        type=str,
        default="training_data.json",
        help="Output JSON file with prepared data"
    )
    parser.add_argument(
        "--format",
        type=str,
        choices=["instruction", "completion", "plain"],
        default="plain",
        help="Data format: instruction (instruction-response pairs), completion (text completion), or plain text"
    )
    parser.add_argument(
        "--chunk_size",
        type=int,
        default=512,
        help="Size of text chunks for plain text format"
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=50,
        help="Overlap between chunks for plain text format"
    )
    return parser.parse_args(argv)
def chunk_text(text, chunk_size, overlap):
    """Split *text* into overlapping chunks of roughly *chunk_size* characters.

    Consecutive chunks share *overlap* characters. A text shorter than half a
    chunk yields no chunks at all, and a would-be final fragment shorter than
    half a chunk is merged into the previous chunk rather than emitted alone.

    Args:
        text: Input string to split.
        chunk_size: Target characters per chunk; must be > 0.
        overlap: Characters shared between consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings (possibly empty).

    Raises:
        ValueError: If the parameters cannot make forward progress
            (previously ``overlap >= chunk_size`` caused an infinite loop,
            because the scan position advanced by ``chunk_size - overlap``).
    """
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError("require chunk_size > 0 and 0 <= overlap < chunk_size")
    # Skip empty or very short texts
    if len(text) < chunk_size / 2:
        return []
    step = chunk_size - overlap  # positive by the guard above
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        # Don't create a tiny chunk at the end: extend the last chunk instead.
        if end - start < chunk_size / 2 and chunks:
            # The previous chunk began at start - step; re-slice it out to end.
            chunks[-1] = text[start - step:end]
            break
        chunks.append(text[start:end])
        start += step
    return chunks
def process_instruction_data(file_paths):
    """Load instruction/response JSON files and render TinyLlama chat texts.

    Each file is expected to hold a JSON array of objects; items carrying
    both an "instruction" and a "response" key are formatted with the
    chat-template markers, and all other items are skipped. A file that
    fails to load or iterate is logged and skipped rather than aborting
    the whole run.

    Returns:
        List of ``{"text": ...}`` dicts.
    """
    examples = []
    for path in file_paths:
        logger.info(f"Processing instruction data from: {path}")
        try:
            # Each input is assumed to be a JSON array of instruction items.
            with open(path, "r", encoding="utf-8") as handle:
                records = json.load(handle)
            for record in tqdm(records):
                if "instruction" not in record or "response" not in record:
                    continue
                # Render in TinyLlama's chat-template prompt format.
                prompt = (
                    f"<|im_start|>user\n{record['instruction']}<|im_end|>\n"
                    f"<|im_start|>assistant\n{record['response']}<|im_end|>"
                )
                examples.append({"text": prompt})
        except Exception as exc:
            logger.error(f"Error processing {path}: {exc}")
    return examples
def process_completion_data(file_paths):
    """Load prompt/completion JSON files into concatenated training texts.

    Each file is expected to hold a JSON array of objects; items carrying
    both a "prompt" and a "completion" key are joined into a single string,
    and all other items are skipped. Files that fail to load or iterate are
    logged and skipped.

    Returns:
        List of ``{"text": ...}`` dicts.
    """
    examples = []
    for path in file_paths:
        logger.info(f"Processing completion data from: {path}")
        try:
            # Each input is assumed to be a JSON array of prompt/completion items.
            with open(path, "r", encoding="utf-8") as handle:
                records = json.load(handle)
            for record in tqdm(records):
                if "prompt" not in record or "completion" not in record:
                    continue
                examples.append({"text": f"{record['prompt']}{record['completion']}"})
        except Exception as exc:
            logger.error(f"Error processing {path}: {exc}")
    return examples
def process_plain_text(file_paths, chunk_size, overlap):
    """Read whole text files and split each into overlapping chunks.

    Delegates the splitting to ``chunk_text``; every resulting chunk becomes
    one training example. Files that fail to read are logged and skipped.

    Args:
        file_paths: Iterable of paths to UTF-8 text files.
        chunk_size: Target characters per chunk.
        overlap: Characters shared between consecutive chunks.

    Returns:
        List of ``{"text": ...}`` dicts.
    """
    examples = []
    for path in file_paths:
        logger.info(f"Processing plain text from: {path}")
        try:
            with open(path, "r", encoding="utf-8") as handle:
                contents = handle.read()
            pieces = chunk_text(contents, chunk_size, overlap)
            logger.info(f"Created {len(pieces)} chunks from {path}")
            examples.extend({"text": piece} for piece in pieces)
        except Exception as exc:
            logger.error(f"Error processing {path}: {exc}")
    return examples
def main():
    """Entry point: parse CLI args, build examples, and write them to JSON.

    Dispatches to the processor matching ``--format``, then serializes the
    collected examples as a UTF-8 JSON array at ``--output_file``.
    """
    args = parse_args()

    # Process data based on the requested format.
    if args.format == "instruction":
        data = process_instruction_data(args.input_files)
    elif args.format == "completion":
        data = process_completion_data(args.input_files)
    else:  # plain text
        data = process_plain_text(args.input_files, args.chunk_size, args.overlap)

    logger.info(f"Total processed examples: {len(data)}")

    # Ensure the output directory exists. exist_ok=True replaces the old
    # check-then-create pair, which raced if the directory appeared between
    # the os.path.exists test and the makedirs call.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    logger.info(f"Data saved to {args.output_file}")


if __name__ == "__main__":
    main()