ocr / convert_format.py
longtc's picture
Upload 3 files
2b45a96 verified
import json
def _iter_transcriptions(ann_list):
    """Yield the stripped, non-empty "transcription" text of each annotation.

    Entries that are not dicts, and entries whose "transcription" value is
    missing, not a string (e.g. JSON null) or blank after stripping, are
    skipped instead of raising AttributeError.
    """
    for ann in ann_list:
        if not isinstance(ann, dict):
            continue
        text = ann.get("transcription", "")
        # .get() only falls back to "" when the key is absent; a key that is
        # present with a null / numeric value must be filtered explicitly.
        if not isinstance(text, str):
            continue
        text = text.strip()
        if text:
            yield text


def convert_labels(src_path, dst_path):
    """Flatten a tab-separated label file into one "path<TAB>text" line per text.

    Each input line is expected to hold two tab-separated fields: an image
    path and a JSON list of annotation objects. For every annotation with a
    non-empty "transcription" string, one line "<image path>\\t<text>" is
    written to the output file.

    Args:
        src_path: Path of the UTF-8 input label file.
        dst_path: Path of the UTF-8 output file (overwritten if it exists).

    Lines that do not split into exactly two fields are skipped silently.
    Lines whose second field is not valid JSON, or is valid JSON but not a
    list, are reported on stdout and skipped.
    """
    with open(src_path, 'r', encoding='utf-8') as f, \
            open(dst_path, 'w', encoding='utf-8') as out_f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue  # skip malformed line
            img_path, annotations = parts
            # Keep the try body minimal: only json.loads can raise here.
            try:
                ann_list = json.loads(annotations)
            except json.JSONDecodeError:
                print(f"Lỗi JSON ở dòng: {line}")
                continue
            if not isinstance(ann_list, list):
                # Valid JSON of the wrong shape (object, string, number...):
                # report it like a decode error rather than crash mid-file.
                print(f"Lỗi JSON ở dòng: {line}")
                continue
            # NOTE(review): a transcription containing an embedded tab or
            # newline would still break the one-record-per-line output;
            # confirm whether such labels can occur in this dataset.
            for text in _iter_transcriptions(ann_list):
                out_f.write(f"{img_path}\t{text}\n")


# Run the conversion with the original hard-coded dataset locations.
convert_labels(
    r'D:\MyCode\Python\Model\paddleocr\total_text\test\train.txt',
    r'D:\MyCode\Python\Model\paddleocr\total_text\train\train_rec.txt',
)