import json

# ---------------------------------------------------------------------------
# This module extracts a selected subset of fields from a JSONL corpus,
# splits each record's long `content` value into fixed-size chunks (one
# output record per chunk), and optionally re-loads the result with
# HuggingFace `datasets` as a sanity check.
# ---------------------------------------------------------------------------


def load_processed_jsonl(file_path):
    """Load the processed JSONL file via HuggingFace `datasets` and print it.

    Args:
        file_path: Path to a JSONL file.

    Returns:
        The loaded dataset object on success, or None when loading failed
        (the error is printed with a traceback rather than raised).
    """
    try:
        # Imported lazily so the rest of this module works without the
        # third-party `datasets` package installed.
        from datasets import load_dataset
        dataset = load_dataset('json', data_files=file_path)
        print("数据集加载成功:")
        print(dataset)
        return dataset
    except Exception as e:
        print(f"加载数据集时出现错误: {e},详细错误信息如下:")
        import traceback
        traceback.print_exc()


def _chunk_text(text, chunk_size):
    """Split *text* into pieces of at most *chunk_size* characters.

    Empty text yields a single empty chunk so the record is still emitted
    (previously an empty `content` produced zero chunks and the whole
    record was silently dropped from the output).
    """
    if not text:
        return ['']
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def process_jsonl(input_path, output_path, fields, chunk_size=6000):
    """Filter each JSONL record down to *fields* and chunk its content.

    Every input record yields one output record per `content` chunk; the
    chunk number is stored under `content_chunk_index`. Lines that are not
    valid JSON are reported and skipped.

    Args:
        input_path: Source JSONL file path.
        output_path: Destination JSONL file path (overwritten).
        fields: Field names to copy from each record when present.
        chunk_size: Maximum characters per content chunk (default 6000).

    Returns:
        True when the input file was processed, False when it was missing.
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as infile, \
                open(output_path, 'w', encoding='utf-8') as outfile:
            for line in infile:
                # Keep the try body minimal: only the parse can raise here.
                try:
                    data = json.loads(line.strip())
                except json.JSONDecodeError as e:
                    print(f"解析 JSON 时出错: {e},跳过该行。")
                    continue
                new_data = {field: data[field] for field in fields if field in data}
                # Split content into fixed-size chunks; each chunk becomes
                # its own output record sharing the other selected fields.
                for index, chunk in enumerate(_chunk_text(new_data.get('content', ''), chunk_size)):
                    new_data['content'] = chunk
                    new_data['content_chunk_index'] = index
                    # ensure_ascii=False keeps Chinese text readable instead
                    # of bloating the file with \uXXXX escapes.
                    outfile.write(json.dumps(new_data, ensure_ascii=False) + '\n')
        print(f"已成功处理并保存到 {output_path}")
        return True
    except FileNotFoundError:
        print(f"文件 {input_path} 未找到。")
        return False


# Input / output JSONL paths — adjust for your environment.
input_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/biology.jsonl'
output_file = '/fs-computility/ai-shen/wangyujia/datasets-jiaocai/category_10/output_new/biology/match_bio.jsonl'

# Subset of the full record schema to keep. Other known fields in the source
# records include: track_id, file_type, content_type, dataset_id,
# content_list, labels, sub_path, date, __unimernet_version, remarkle,
# source, file_source.
selected_fields = [
    'content_length', 'title', 'dataset_name', 'pdf_type', 'content',
    'url', 'html', 'author', 'abstract', 'category',
    'relation_id', 'subject', 'processed',
]

if __name__ == "__main__":
    # Guarded so importing this module no longer triggers the full pipeline
    # (the previous version ran everything at import time).
    if process_jsonl(input_file, output_file, selected_fields):
        # Sanity-check the output by loading it back.
        load_processed_jsonl(output_file)