Upload datasets-jiaocai/predata.py with huggingface_hub
Browse files- datasets-jiaocai/predata.py +86 -0
datasets-jiaocai/predata.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
def process_jsonl(file_path: str) -> int:
    """Count the JSON records in a JSONL file.

    Reads *file_path* line by line, parsing each line as a standalone JSON
    document.  Lines that fail to parse are reported and skipped; a missing
    file is reported and treated as empty rather than raising.

    Args:
        file_path: Path to the JSONL file to inspect.

    Returns:
        The number of successfully parsed records (0 if the file is missing).
    """
    info_count = 0
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    json.loads(line)
                    info_count += 1
                except json.JSONDecodeError:
                    # Best-effort: one malformed line must not abort the count.
                    print("错误:无法解析某一行的JSON数据。")
    except FileNotFoundError:
        print("错误:未找到指定的JSONL文件。")

    print(f"JSONL文件中的信息数量为: {info_count}")
    return info_count
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
    # Run as a script: count the records in the matched JSONL dump.
    file_path = 'matched_records.jsonl'  # output of the earlier matching step
    process_jsonl(file_path)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
import json
|
| 33 |
+
from datasets import load_dataset
|
| 34 |
+
|
| 35 |
+
def load_processed_jsonl(file_path):
    """Best-effort load of *file_path* via ``datasets.load_dataset``.

    On success the loaded dataset is printed and returned.  On any failure
    the error and a full traceback are printed and the function falls
    through, implicitly returning None — this is a sanity check for a
    one-off script, not a hard requirement.
    """
    try:
        ds = load_dataset('json', data_files=file_path)
        print("数据集加载成功:")
        print(ds)
        return ds
    except Exception as e:
        print(f"加载数据集时出现错误: {e},详细错误信息如下:")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
|
| 45 |
+
|
| 46 |
+
# I/O paths for the filtering step below.
input_file = 'matched_records.jsonl'  # source JSONL (replace with your path)
output_file = 'matched_bio.jsonl'     # destination JSONL (replace with your path)

# Fields copied into each output record; anything not listed is dropped.
# Order is preserved in the output objects.
selected_fields = [
    'track_id', 'file_type', 'content_type', 'content_length', 'title',
    'dataset_id', 'dataset_name', 'pdf_type', 'content', 'labels',
    'sub_path', 'url', '__unimernet_version', 'html',
    'author', 'abstract', 'category', 'source',
    'relation_id', 'file_source', 'subject', 'processed',
]
|
| 61 |
+
|
| 62 |
+
# Filter each input record down to selected_fields and split its 'content'
# into fixed-size slices, writing one output row per slice.
try:
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for raw_line in infile:
            try:
                record = json.loads(raw_line.strip())
                kept = {key: record[key] for key in selected_fields if key in record}

                # Slice the (possibly very long) content into 8000-char pieces.
                # NOTE: a record with empty/missing content yields no slices,
                # so no row is written for it — matches the original behavior.
                text = kept.get('content', '')
                step = 8000  # characters per slice
                for chunk_no, start in enumerate(range(0, len(text), step)):
                    kept['content'] = text[start:start + step]
                    kept['content_chunk_index'] = chunk_no  # which slice this row holds
                    outfile.write(json.dumps(kept) + '\n')
            except json.JSONDecodeError as e:
                print(f"解析 JSON 时出错: {e},跳过该行。")

    print(f"已成功处理并保存到 {output_file}")
    # Sanity-check the freshly written file by loading it back.
    load_processed_jsonl(output_file)
except FileNotFoundError:
    print(f"文件 {input_file} 未找到。")
|