Spaces:
Build error
Build error
import argparse
import json
import os

from tqdm import tqdm
def process_line(line, old_text, new_text):
    """Replace ``old_text`` with ``new_text`` in every string value of a JSON line.

    The line is parsed as JSON, walked recursively through dicts and lists,
    and serialized back to a single-line JSON string. Only string *values*
    are rewritten; dictionary keys and non-string scalars pass through
    unchanged.

    Args:
        line: One JSON document as text.
        old_text: Substring to search for inside string values.
        new_text: Replacement substring.

    Returns:
        The re-serialized JSON text, with non-ASCII characters kept as-is
        (``ensure_ascii=False``).

    Raises:
        json.JSONDecodeError: If ``line`` is not valid JSON.
    """

    def _swap(node):
        # Strings are the only leaves that change.
        if isinstance(node, str):
            return node.replace(old_text, new_text)
        # Containers are rebuilt so the parsed input is never mutated.
        if isinstance(node, list):
            return list(map(_swap, node))
        if isinstance(node, dict):
            rebuilt = {}
            for key, value in node.items():
                rebuilt[key] = _swap(value)
            return rebuilt
        # Numbers, booleans and None are returned untouched.
        return node

    parsed = json.loads(line)
    return json.dumps(_swap(parsed), ensure_ascii=False)
def main(input_file, output_file, old_text, new_text):
    """Rewrite a JSONL file, replacing ``old_text`` with ``new_text`` per line.

    Each non-blank line of ``input_file`` is stripped, passed to
    ``process_line`` and written to ``output_file`` followed by a newline.
    Both files are handled as UTF-8. A tqdm progress bar tracks progress
    against the total number of input lines.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the file to write; it is created or overwritten.
        old_text: Text to be replaced.
        new_text: Text to replace it with.

    Raises:
        ValueError: If ``output_file`` refers to the same file as
            ``input_file``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Opening the output in 'w' mode truncates it immediately, so writing onto
    # the input would erase the data before a single line has been read.
    if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
        raise ValueError(
            "input_file and output_file refer to the same file; "
            "write to a different path to avoid destroying the input"
        )

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        # Count the lines first so the progress bar knows its total.
        total_lines = sum(1 for _ in infile)
        infile.seek(0)  # Rewind to the start for the real pass.

        for line in tqdm(infile, total=total_lines, desc="Processing"):
            record = line.strip()
            # Blank lines are not JSON documents; skip them rather than
            # letting json.loads fail on an empty string.
            if not record:
                continue
            outfile.write(process_line(record, old_text, new_text) + '\n')
if __name__ == "__main__":
    # Command-line interface: two positional paths plus optional overrides
    # for the text to search for and its replacement.
    cli = argparse.ArgumentParser(
        description="Replace text in a JSONL file.",
    )
    cli.add_argument("input_file", help="Input JSONL file to process")
    cli.add_argument("output_file", help="Output file for processed JSONL")
    cli.add_argument("--old_text", default="尖米", help="Text to be replaced")
    cli.add_argument("--new_text", default="机智流", help="Text to replace with")
    options = cli.parse_args()

    main(
        options.input_file,
        options.output_file,
        options.old_text,
        options.new_text,
    )