# File size: 3,451 Bytes
# fd95ace
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# import json

# def process_jsonl(file_path):
#     info_count = 0
#     first_entries = []
#     try:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             for line in file:
#                 try:
#                     entry = json.loads(line)
#                     info_count += 1
#                     if len(first_entries) < 5:
#                         first_entries.append(entry)
#                 except json.JSONDecodeError:
#                     print("错误:无法解析某一行的JSON数据。")
#     except FileNotFoundError:
#         print("错误:未找到指定的JSONL文件。")

#     print(f"JSONL文件中的信息数量为: {info_count}")
#     # print("前五个条目信息如下:")
#     # for i, entry in enumerate(first_entries, start=1):
#     #     print(f"条目 {i}: {entry}")


# if __name__ == "__main__":
#     file_path = 'matched_records.jsonl'
#     process_jsonl(file_path)




import json
from datasets import load_dataset

def load_processed_jsonl(file_path):
    """Load *file_path* as a Hugging Face 'json' dataset and print a summary.

    Returns the loaded DatasetDict on success. On any failure the error and
    a full traceback are printed and None is returned (best-effort check).
    """
    try:
        ds = load_dataset('json', data_files=file_path)
        print("数据集加载成功:")
        print(ds)
        return ds
    except Exception as err:
        # Deliberately broad: this is a post-processing sanity check, so any
        # loading failure is reported rather than propagated.
        print(f"加载数据集时出现错误: {err},详细错误信息如下:")
        import traceback
        traceback.print_exc()
        return None

# Path of the raw biology JSONL to read (replace with your input path).
input_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/biology.jsonl'
# Path of the filtered/chunked JSONL to write (replace with your output path).
output_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/match_bio.jsonl'

# Fields copied from each input record into the output. Order matters: it
# fixes the key order of the emitted JSON objects.
selected_fields = [
    'content_length',
    'title',
    'dataset_name',
    'pdf_type',
    'content',
    'url',
    'html',
    'author',
    'abstract',
    'category',
    'relation_id',
    'subject',
    'processed',
]

try:
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:

            try:
                data = json.loads(line.strip())
                new_data = {field: data[field] for field in selected_fields if field in data}

                # 分割 content
                content = new_data.get('content', '')
                chunk_size = 6000  # 每段的长度
                chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]

                for index, chunk in enumerate(chunks):
                    new_data['content'] = chunk
                    new_data['content_chunk_index'] = index  # 添加一个字段表示这是第几个分块
                    outfile.write(json.dumps(new_data) + '\n')
            except json.JSONDecodeError as e:
                print(f"解析 JSON 时出错: {e},跳过该行。")

    print(f"已成功处理并保存到 {output_file}")
    # 尝试加载处理后的文件
    load_processed_jsonl(output_file)
except FileNotFoundError:
    print(f"文件 {input_file} 未找到。")