| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
| |
|
| |
|
| | import json |
| | from datasets import load_dataset |
| |
|
def load_processed_jsonl(file_path):
    """Load a processed JSONL file with the `datasets` library and return it.

    On success the loaded dataset is printed and returned; on any failure the
    error is printed together with a full traceback and None is returned.
    """
    try:
        ds = load_dataset('json', data_files=file_path)
    except Exception as err:
        print(f"加载数据集时出现错误: {err},详细错误信息如下:")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
        return None
    print("数据集加载成功:")
    print(ds)
    return ds
| |
|
| | input_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/biology.jsonl' |
| | output_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/match_bio.jsonl' |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | selected_fields=[ 'content_length', 'title', |
| | 'dataset_name', 'pdf_type','content', |
| | 'url', 'html', |
| | 'author', 'abstract', 'category', |
| | 'relation_id', 'subject', 'processed'] |
| |
|
| | try: |
| | with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile: |
| | for line in infile: |
| |
|
| | try: |
| | data = json.loads(line.strip()) |
| | new_data = {field: data[field] for field in selected_fields if field in data} |
| |
|
| | |
| | content = new_data.get('content', '') |
| | chunk_size = 6000 |
| | chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)] |
| |
|
| | for index, chunk in enumerate(chunks): |
| | new_data['content'] = chunk |
| | new_data['content_chunk_index'] = index |
| | outfile.write(json.dumps(new_data) + '\n') |
| | except json.JSONDecodeError as e: |
| | print(f"解析 JSON 时出错: {e},跳过该行。") |
| |
|
| | print(f"已成功处理并保存到 {output_file}") |
| | |
| | load_processed_jsonl(output_file) |
| | except FileNotFoundError: |
| | print(f"文件 {input_file} 未找到。") |