# import json
# def process_jsonl(file_path):
# info_count = 0
# first_entries = []
# try:
# with open(file_path, 'r', encoding='utf-8') as file:
# for line in file:
# try:
# entry = json.loads(line)
# info_count += 1
# if len(first_entries) < 5:
# first_entries.append(entry)
# except json.JSONDecodeError:
# print("错误:无法解析某一行的JSON数据。")
# except FileNotFoundError:
# print("错误:未找到指定的JSONL文件。")
# print(f"JSONL文件中的信息数量为: {info_count}")
# # print("前五个条目信息如下:")
# # for i, entry in enumerate(first_entries, start=1):
# # print(f"条目 {i}: {entry}")
# if __name__ == "__main__":
# file_path = 'matched_records.jsonl'
# process_jsonl(file_path)
import json
from datasets import load_dataset
def load_processed_jsonl(file_path):
    """Load a processed JSONL file with HuggingFace `datasets` and report the outcome.

    Parameters
    ----------
    file_path : str
        Path to the JSONL file to load.

    Returns
    -------
    The loaded dataset object on success, or None if loading failed.
    """
    try:
        result = load_dataset('json', data_files=file_path)
    except Exception as e:
        # Best-effort diagnostics: report the error message plus a full
        # traceback, then signal failure to the caller via None.
        print(f"加载数据集时出现错误: {e},详细错误信息如下:")
        import traceback
        traceback.print_exc()
        return None
    print("数据集加载成功:")
    print(result)
    return result
# Input and output JSONL paths (adjust for your environment).
input_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/biology.jsonl'  # 替换为你的输入 JSONL 文件路径
output_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/match_bio.jsonl'  # 替换为你希望的输出 JSONL 文件路径

# Fields copied from each source record into the output record; a field
# missing from a record is simply skipped. (The original full candidate
# list also had: track_id, file_type, content_type, dataset_id,
# content_list, labels, sub_path, date, __unimernet_version, remarkle,
# source, file_source — intentionally not selected here.)
selected_fields = [
    'content_length',
    'title',
    'dataset_name',
    'pdf_type',
    'content',
    'url',
    'html',
    'author',
    'abstract',
    'category',
    'relation_id',
    'subject',
    'processed',
]
# Read every record from input_file, keep only selected_fields, split each
# record's 'content' into fixed-size chunks, and write one output record per
# chunk (tagged with 'content_chunk_index'). Finally, sanity-load the result.
try:
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Keep the try body minimal: only json.loads can raise here.
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError as e:
                print(f"解析 JSON 时出错: {e},跳过该行。")
                continue
            new_data = {field: data[field] for field in selected_fields if field in data}
            # Split content into chunks of at most chunk_size characters.
            content = new_data.get('content', '')
            chunk_size = 6000  # characters per chunk
            chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
            if not chunks:
                # BUG FIX: a record with empty or missing 'content' produced
                # zero chunks and was silently dropped from the output;
                # preserve it as a single empty chunk instead.
                chunks = ['']
            for index, chunk in enumerate(chunks):
                new_data['content'] = chunk
                new_data['content_chunk_index'] = index  # ordinal of this chunk within the record
                # ensure_ascii=False keeps Chinese text readable in the
                # output instead of \uXXXX escapes.
                outfile.write(json.dumps(new_data, ensure_ascii=False) + '\n')
    print(f"已成功处理并保存到 {output_file}")
    # Verify the written file is loadable.
    load_processed_jsonl(output_file)
except FileNotFoundError:
    print(f"文件 {input_file} 未找到。")