| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
| |
|
| |
|
| | import json |
| | from datasets import load_dataset |
| |
|
def load_processed_jsonl(file_path):
    """Load a processed JSONL file with the `datasets` library and return it.

    On success the loaded dataset is printed and returned; on any failure the
    error is printed together with a full traceback and None is returned.
    """
    try:
        ds = load_dataset('json', data_files=file_path)
    except Exception as err:
        print(f"加载数据集时出现错误: {err},详细错误信息如下:")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
        return None
    print("数据集加载成功:")
    print(ds)
    return ds
| |
|
| | input_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/biology.jsonl' |
| | output_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/match_bio.jsonl' |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | selected_fields=[ 'content_length', 'title', |
| | 'dataset_name', 'pdf_type','content', |
| | 'url', 'html', |
| | 'author', 'abstract', 'category', |
| | 'relation_id', 'subject', 'processed'] |
| |
|
| | try: |
| | with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile: |
| | for line in infile: |
| |
|
| | try: |
| | data = json.loads(line.strip()) |
| | new_data = {field: data[field] for field in selected_fields if field in data} |
| |
|
| | |
| | content = new_data.get('content', '') |
| | chunk_size = 6000 |
| | chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)] |
| |
|
| | for index, chunk in enumerate(chunks): |
| | new_data['content'] = chunk |
| | new_data['content_chunk_index'] = index |
| | outfile.write(json.dumps(new_data) + '\n') |
| | except json.JSONDecodeError as e: |
| | print(f"解析 JSON 时出错: {e},跳过该行。") |
| |
|
| | print(f"已成功处理并保存到 {output_file}") |
| | |
| | load_processed_jsonl(output_file) |
| | except FileNotFoundError: |
| | print(f"文件 {input_file} 未找到。") |