yuccaaa committed on
Commit
c3e8669
·
verified ·
1 Parent(s): b1768ca

Upload datasets-jiaocai/predata.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. datasets-jiaocai/predata.py +86 -0
datasets-jiaocai/predata.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
def process_jsonl(file_path, max_preview=5):
    """Count the JSON records in a JSONL file and collect the first few.

    Reads *file_path* line by line, parsing each non-blank line as JSON.
    Lines that fail to parse are reported and skipped; a missing file is
    reported and treated as empty rather than raising.

    Args:
        file_path: Path to the JSONL file to scan.
        max_preview: Maximum number of leading entries to keep
            (default 5, preserving the original hard-coded limit).

    Returns:
        ``(info_count, first_entries)`` — the number of successfully parsed
        records and up to ``max_preview`` of the first parsed entries.
        (The original implementation returned ``None``; callers that ignore
        the return value are unaffected.)
    """
    info_count = 0
    first_entries = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank lines are not records — skip them instead of
                # reporting them as JSON parse errors.
                if not line.strip():
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    print("错误:无法解析某一行的JSON数据。")
                    continue
                info_count += 1
                if len(first_entries) < max_preview:
                    first_entries.append(entry)
    except FileNotFoundError:
        print("错误:未找到指定的JSONL文件。")

    print(f"JSONL文件中的信息数量为: {info_count}")
    # print("前五个条目信息如下:")
    # for i, entry in enumerate(first_entries, start=1):
    #     print(f"条目 {i}: {entry}")
    return info_count, first_entries
23
+
24
+
25
# Script entry point: report how many records the matched dump contains.
if __name__ == "__main__":
    process_jsonl('matched_records.jsonl')
28
+
29
+
30
+
31
+
32
+ import json
33
+ from datasets import load_dataset
34
+
35
def load_processed_jsonl(file_path):
    """Load a JSONL file through the ``datasets`` library and report the result.

    On success the loaded dataset is printed and returned; on any failure the
    error plus a full traceback is printed and ``None`` is returned implicitly.
    """
    try:
        dataset = load_dataset('json', data_files=file_path)
        print("数据集加载成功:")
        print(dataset)
        return dataset
    except Exception as err:
        # Boundary-level catch-all: report the failure (with traceback)
        # instead of aborting the surrounding script.
        print(f"加载数据集时出现错误: {err},详细错误信息如下:")
        import traceback
        traceback.print_exc()
45
+
46
# ---------------------------------------------------------------------------
# Chunking pass: keep only the selected fields of each record and split the
# (potentially very long) 'content' field into fixed-size pieces — one output
# line per piece — so downstream loaders never see an oversized single record.
# ---------------------------------------------------------------------------
input_file = 'matched_records.jsonl'  # input JSONL path (adjust as needed)
output_file = 'matched_bio.jsonl'  # output JSONL path (adjust as needed)

# Fields copied through to the output; everything else is dropped.
# NOTE(review): 'date' and 'remarkle' were deliberately removed from an
# earlier version of this list.
selected_fields = [
    'track_id', 'file_type', 'content_type', 'content_length', 'title',
    'dataset_id', 'dataset_name', 'pdf_type', 'content', 'labels',
    'sub_path', 'url', '__unimernet_version', 'html',
    'author', 'abstract', 'category', 'source',
    'relation_id', 'file_source', 'subject', 'processed',
]

CHUNK_SIZE = 8000  # characters of 'content' per output record

try:
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Narrow try: only json.loads can raise JSONDecodeError here.
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError as e:
                print(f"解析 JSON 时出错: {e},跳过该行。")
                continue

            new_data = {field: data[field]
                        for field in selected_fields if field in data}

            # Split 'content' into CHUNK_SIZE-character pieces.  BUG FIX:
            # the original silently dropped records whose content was empty
            # or missing (the chunk list came out empty so nothing was
            # written); emit a single empty chunk for those instead.
            content = new_data.get('content', '')
            chunks = [content[i:i + CHUNK_SIZE]
                      for i in range(0, len(content), CHUNK_SIZE)] or ['']

            for index, chunk in enumerate(chunks):
                new_data['content'] = chunk
                new_data['content_chunk_index'] = index  # position of this piece
                outfile.write(json.dumps(new_data) + '\n')

    print(f"已成功处理并保存到 {output_file}")
    # Sanity-check that the produced file loads through `datasets`.
    load_processed_jsonl(output_file)
except FileNotFoundError:
    print(f"文件 {input_file} 未找到。")