longtc
/

ocr

Model card Files Files and versions

ocr / convert_format.py

longtc's picture

Upload 3 files

2b45a96 verified 8 months ago

history blame contribute delete

733 Bytes

	import json

	with open(r'D:\MyCode\Python\Model\paddleocr\total_text\test\train.txt', 'r', encoding='utf-8') as f, open(r'D:\MyCode\Python\Model\paddleocr\total_text\train\train_rec.txt', 'w', encoding='utf-8') as out_f:
	for line in f:
	parts = line.strip().split('\t')
	if len(parts) != 2:
	continue # bỏ qua dòng lỗi

	img_path, annotations = parts
	try:
	ann_list = json.loads(annotations)
	for ann in ann_list:
	text = ann.get("transcription", "").strip()
	if text:
	out_f.write(f"{img_path}\t{text}\n")
	except json.JSONDecodeError:
	print(f"Lỗi JSON ở dòng: {line}")