Upload datasets-jiaocai/predata.py with huggingface_hub
Browse files- datasets-jiaocai/predata.py +86 -0
datasets-jiaocai/predata.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
def process_jsonl(file_path: str) -> int:
    """Count the JSON records in a JSONL file.

    Reads *file_path* line by line, parsing each line as a standalone JSON
    document.  Lines that fail to parse are reported and skipped; a missing
    file is reported and treated as empty rather than raising.

    Args:
        file_path: Path to the JSONL file to inspect.

    Returns:
        The number of successfully parsed records (0 if the file is missing).
    """
    info_count = 0
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    json.loads(line)
                    info_count += 1
                except json.JSONDecodeError:
                    # Best-effort: one malformed line must not abort the count.
                    print("错误:无法解析某一行的JSON数据。")
    except FileNotFoundError:
        print("错误:未找到指定的JSONL文件。")

    print(f"JSONL文件中的信息数量为: {info_count}")
    return info_count
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
    # Run as a script: count the records in the matched JSONL dump.
    file_path = 'matched_records.jsonl'  # output of the earlier matching step
    process_jsonl(file_path)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
import json
|
| 33 |
+
from datasets import load_dataset
|
| 34 |
+
|
| 35 |
+
def load_processed_jsonl(file_path):
    """Best-effort load of *file_path* via ``datasets.load_dataset``.

    On success the loaded dataset is printed and returned.  On any failure
    the error and a full traceback are printed and the function falls
    through, implicitly returning None — this is a sanity check for a
    one-off script, not a hard requirement.
    """
    try:
        ds = load_dataset('json', data_files=file_path)
        print("数据集加载成功:")
        print(ds)
        return ds
    except Exception as e:
        print(f"加载数据集时出现错误: {e},详细错误信息如下:")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
|
| 45 |
+
|
| 46 |
+
# I/O paths for the filtering step below.
input_file = 'matched_records.jsonl'  # source JSONL (replace with your path)
output_file = 'matched_bio.jsonl'     # destination JSONL (replace with your path)

# Fields copied into each output record; anything not listed is dropped.
# Order is preserved in the output objects.
selected_fields = [
    'track_id', 'file_type', 'content_type', 'content_length', 'title',
    'dataset_id', 'dataset_name', 'pdf_type', 'content', 'labels',
    'sub_path', 'url', '__unimernet_version', 'html',
    'author', 'abstract', 'category', 'source',
    'relation_id', 'file_source', 'subject', 'processed',
]
|
| 61 |
+
|
| 62 |
+
# Filter each input record down to selected_fields and split its 'content'
# into fixed-size slices, writing one output row per slice.
try:
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for raw_line in infile:
            try:
                record = json.loads(raw_line.strip())
                kept = {key: record[key] for key in selected_fields if key in record}

                # Slice the (possibly very long) content into 8000-char pieces.
                # NOTE: a record with empty/missing content yields no slices,
                # so no row is written for it — matches the original behavior.
                text = kept.get('content', '')
                step = 8000  # characters per slice
                for chunk_no, start in enumerate(range(0, len(text), step)):
                    kept['content'] = text[start:start + step]
                    kept['content_chunk_index'] = chunk_no  # which slice this row holds
                    outfile.write(json.dumps(kept) + '\n')
            except json.JSONDecodeError as e:
                print(f"解析 JSON 时出错: {e},跳过该行。")

    print(f"已成功处理并保存到 {output_file}")
    # Sanity-check the freshly written file by loading it back.
    load_processed_jsonl(output_file)
except FileNotFoundError:
    print(f"文件 {input_file} 未找到。")
|