| import json |
| import csv |
| import os |
| from tqdm import tqdm |
| import re |
| from utils import * |
| import traceback |
|
|
|
|
def _collect_text_blocks(json_data):
    """Gather (word, bbox) pairs and target translations from text blocks.

    Only top-level entries whose ``"attribute"`` equals ``'text_block'`` are
    considered. For each item of such an entry's ``'text'`` list, every source
    word is paired with the bounding box at the same index, and the
    ``'tgt_text.zh-CN'`` string is collected.

    Raises:
        KeyError: if a text item lacks one of the expected keys.
        IndexError: if a text item has fewer boxes than words.
    """
    words_with_boxes = []
    translations = []
    for block in json_data.values():
        if block.get("attribute") != 'text_block':
            continue
        for text_ in block.get('text', []):
            # Index into the box list (rather than zip) so that a box list
            # shorter than the word list still raises instead of silently
            # truncating the words.
            pairs = [
                (word, text_['src_word_bboxes'][i])
                for i, word in enumerate(text_['src_words'])
            ]
            words_with_boxes.extend(pairs)
            translations.append(text_['tgt_text.zh-CN'])
    return words_with_boxes, translations


def _build_record(json_path, img_path):
    """Build one output record from an annotation JSON and its image.

    Returns a dict with keys ``img_path``, ``text_src``, ``layout_src`` and
    ``tgt_sen_trans``.

    Raises:
        FileNotFoundError: if ``json_path`` does not exist.
        json.JSONDecodeError: if the annotation file is not valid JSON.
        ValueError: if the image cannot be read.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # NOTE(review): cv2 is not imported explicitly in this file; presumably it
    # is re-exported by `from utils import *` -- confirm.
    image = cv2.imread(img_path)
    if image is None:
        # imread signals failure by returning None; fail with a clear message
        # instead of an AttributeError on `.shape`.
        raise ValueError(f"Could not read image: {img_path}")
    shape = image.shape

    words_with_boxes, translations = _collect_text_blocks(json_data)

    # resize_box / tblr_reading_order_detector come from outside this block
    # (presumably utils); the third tuple element is the box rescaled against
    # the image shape, and the detector returns the tuples in reading order.
    processed_list = [
        (src_w, src_w_boxes, resize_box(src_w_boxes, shape))
        for (src_w, src_w_boxes) in words_with_boxes
    ]
    sorted_tuple_list = tblr_reading_order_detector(processed_list)

    return {
        "img_path": img_path,
        "text_src": ' '.join(atuple[0] for atuple in sorted_tuple_list),
        "layout_src": [atuple[2] for atuple in sorted_tuple_list],
        "tgt_sen_trans": ''.join(translations),
    }


def process_json_files(csv_path, output_dir):
    """Convert the annotation files listed in a CSV into one JSONL file.

    The CSV is expected to have a header row followed by rows whose first
    column is an annotation JSON path and whose second column is the matching
    image path. One JSON line per successfully processed row is written to
    ``<output_dir>/output1.jsonl`` (the file is truncated on every call).

    Per-row failures are reported on stdout and the row is skipped; a missing
    or unreadable CSV is reported and ends processing. Nothing is returned.

    Args:
        csv_path: path to the index CSV.
        output_dir: directory for the output file; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'output1.jsonl')

    # The context manager guarantees the output is flushed and closed even if
    # processing stops early.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        try:
            # newline='' lets the csv module handle embedded newlines itself.
            with open(csv_path, 'r', encoding='utf-8',
                      newline='') as csv_file:
                csv_reader = csv.reader(csv_file)
                # Skip the header; tolerate a completely empty CSV.
                next(csv_reader, None)

                for row in tqdm(csv_reader,
                                desc="Processing JSON files",
                                unit="file"):
                    if len(row) < 2:
                        # Blank or short rows must not abort the whole run.
                        print(f"Skipping malformed CSV row: {row}")
                        continue
                    json_path, img_path = row[0], row[1]

                    try:
                        data_dict = _build_record(json_path, img_path)
                        json_line = json.dumps(data_dict, ensure_ascii=False)
                        json_file.write(json_line + '\n')
                    except FileNotFoundError:
                        print(f"File not found: {json_path}")
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON in file: {json_path}")
                    except KeyError as e:
                        print(f"Missing key {e} in file: {json_path}")
                    except Exception as e:
                        print(f"Unexpected error processing {json_path}: {str(e)}")
                        traceback.print_exc()

        except FileNotFoundError:
            print(f"CSV file not found: {csv_path}")
        except Exception as e:
            print(f"Error reading CSV file: {str(e)}")

    print("Processing completed!")
|
|
|
|
| |
# Script entry: hard-coded dataset locations for this conversion run.
# The JSONL output is written into the same directory that holds the CSV.
output_dir = '/home/zychen/hwproject/my_modeling_phase_1/dataset'
csv_path = '/home/zychen/hwproject/my_modeling_phase_1/dataset/output.csv'

process_json_files(csv_path=csv_path, output_dir=output_dir)
|
|