# Custom Training Data
# Add high-quality, manually created QA pairs
import json

# Hand-written records. Every entry is a dict with exactly three string
# keys: "instruction" (the question), "input" (context passed along with
# the question) and "output" (the expected answer).
custom_qa_pairs = [
    {
        "instruction": "Which prefecture has the largest population in Japan?",
        "input": "Context: Japan Census data.",
        "output": "According to the 2020 Population Census, Tokyo has the largest population with 14,047,594 residents.",
    },
    {
        "instruction": "What is the population density of Tokyo?",
        "input": "Context: Japan Census data.",
        "output": "Tokyo has a population density of approximately 6,402 people per square kilometer, calculated from its population of 14,047,594 and area of 2,194 km².",
    },
    {
        "instruction": "Compare the population of Tokyo and Osaka.",
        "input": "Context: Japan Census data.",
        "output": "Tokyo has a population of 14,047,594 while Osaka has 8,837,685. Tokyo's population is approximately 5.2 million larger than Osaka's.",
    },
    {
        "instruction": "What percentage of Japan's population lives in Tokyo?",
        "input": "Context: Japan Census data.",
        "output": "Tokyo accounts for approximately 11.1% of Japan's total population of 126.1 million people.",
    },
    # Add more custom pairs here
]

# Save to JSONL: one JSON object per line, in list order. The file is opened
# in "w" mode, so any existing custom_train.jsonl is overwritten.
with open('custom_train.jsonl', 'w', encoding='utf-8') as f:
    for pair in custom_qa_pairs:
        # ensure_ascii=False keeps non-ASCII characters (e.g. the "²" in
        # "km²") as literal UTF-8 text instead of \uXXXX escapes.
        json.dump(pair, f, ensure_ascii=False)
        f.write('\n')

print(f"Created {len(custom_qa_pairs)} custom QA pairs")