Spaces:
Running
Running
| from datasets import load_dataset | |
| # Load the dataset | |
| dataset = load_dataset("phyloforfun/HLT_MICH_Angiospermae_SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05") | |
| # Define the directory where you want to save the files | |
| save_dir = "D:/Dropbox/VoucherVision/datasets/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05" | |
| # Save each split as a JSONL file in the specified directory | |
| for split, split_dataset in dataset.items(): | |
| split_dataset.to_json(f"{save_dir}/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-{split}.jsonl") | |
| '''import json # convert to google | |
| # Load the JSONL file | |
| input_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train.jsonl' | |
| output_file_path = '/mnt/data/SLTPvC_v1-0_medium_OCR-C25-L25-E50-R05-train-converted.jsonl' | |
| # Define the conversion function | |
| def convert_record(record): | |
| return { | |
| "input_text": record.get('instruction', '') + ' ' + record.get('input', ''), | |
| "target_text": record.get('output', '') | |
| } | |
| # Convert and save the new JSONL file | |
| with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile: | |
| for line in infile: | |
| record = json.loads(line) | |
| converted_record = convert_record(record) | |
| outfile.write(json.dumps(converted_record) + '\n') | |
| output_file_path''' | |