Spaces:
Sleeping
Sleeping
| """ | |
| Author: Khanh Phan | |
| Date: 2023-11-01 | |
| """ | |
| import math | |
| import string | |
| from pathlib import Path | |
| import cv2 | |
| import numpy as np | |
| from PIL import ( | |
| Image, | |
| ImageDraw, | |
| ImageFont, | |
| ) | |
| from src.markdown import Markdown | |
| from src.settings import ( | |
| FONTPATH, | |
| OUT_DIR, | |
| ) | |
def count_characters(str: str) -> int:
    """
    Estimate the display width of a string in full-width character units.

    ASCII letters, digits (as reported by ``isdigit``) and whitespace are
    treated as half-width; every other character counts as one full unit.
    The ``n`` half-width characters together contribute ``floor(n / 2)``
    units, because ``ceil(n / 2)`` is subtracted from the total length.

    args:
        str(string): the input string (the parameter name shadows the
            builtin; it is kept so keyword callers keep working)
    return(int):
        the width of the string measured in full-width characters
    """
    half_width = sum(
        1
        for c in str
        if c in string.ascii_letters or c.isdigit() or c.isspace()
    )
    return len(str) - math.ceil(half_width / 2)
def create_blank_img(img_h: int, img_w: int) -> Image.Image:
    """
    Create a white RGB canvas whose right-most pixel column is black.

    args:
        img_h(int): the height of blank img
        img_w(int): the width of blank img
    return(Image):
        blank PIL image in RGB mode
    """
    # uint8 is the dtype that can actually hold 255; the previous
    # int8-times-255 expression overflowed the element type.
    canvas = np.full((img_h, img_w), 255, dtype=np.uint8)
    # Black last column: acts as a separator when canvases are placed
    # side by side.
    canvas[:, img_w - 1 :] = 0
    return Image.fromarray(canvas).convert("RGB")
def text_visual(
    texts: list[str],
    scores: list[float],
    img_h: int = 400,
    img_w: int = 600,
    threshold: float = 0.0,
    font_path: str = FONTPATH,
) -> np.ndarray:
    """
    Render recognized texts (and their scores) onto blank canvases.

    Each kept text is drawn as a numbered entry; long texts are wrapped
    onto continuation lines. When a canvas is full a new one is started,
    and all canvases are finally joined side by side.

    args:
        texts(list): the texts to draw
        scores(list|None): corresponding score of each text; when None,
            nothing is filtered and no score is printed
        img_h(int): the height of each blank canvas
        img_w(int): the width of each blank canvas
        threshold(float): texts whose score is below this value (or NaN)
            are skipped
        font_path: the path of font which is used to draw text
    return(array):
        image with the recognized text; several canvases are concatenated
        along the width axis
    """
    if scores is not None:
        assert len(texts) == len(
            scores,
        ), "The number of txts and corresponding scores must match"

    font_size = 20
    txt_color = (0, 0, 0)
    font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
    gap = font_size + 5
    # Clamp to 1 so every wrap iteration consumes at least one character;
    # a non-positive width would make the loop below spin forever.
    chars_per_line = max(1, img_w // font_size - 4)
    lines_per_page = img_h // gap - 1

    # The canvas size must be passed explicitly: create_blank_img has no
    # default dimensions.
    blank_img = create_blank_img(img_h, img_w)
    draw_txt = ImageDraw.Draw(blank_img)
    txt_img_list = []
    count, index = 1, 0
    for idx, txt in enumerate(texts):
        score = None if scores is None else scores[idx]
        if score is not None and (score < threshold or math.isnan(score)):
            continue
        # Entries are numbered by what is actually drawn, not by position.
        index += 1
        score_suffix = "" if score is None else " " + "%.3f" % (score)

        first_line = True
        while count_characters(txt) >= chars_per_line:
            head, txt = txt[:chars_per_line], txt[chars_per_line:]
            if first_line:
                new_txt = str(index) + ": " + head
                first_line = False
            else:
                new_txt = " " + head
            draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
            if count >= lines_per_page:
                txt_img_list.append(np.array(blank_img))
                blank_img = create_blank_img(img_h, img_w)
                draw_txt = ImageDraw.Draw(blank_img)
                count = 0
            count += 1

        if first_line:
            new_txt = str(index) + ": " + txt + score_suffix
        else:
            new_txt = " " + txt + score_suffix
        draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
        # Start a new canvas only if more texts remain to be drawn.
        if count >= lines_per_page and idx + 1 < len(texts):
            txt_img_list.append(np.array(blank_img))
            blank_img = create_blank_img(img_h, img_w)
            draw_txt = ImageDraw.Draw(blank_img)
            count = 0
        count += 1

    txt_img_list.append(np.array(blank_img))
    if len(txt_img_list) == 1:
        return np.array(txt_img_list[0])
    return np.concatenate(txt_img_list, axis=1)
def resize_img(img: np.array, input_size: int = 600) -> np.array:
    """
    Scale an image uniformly so that its longest side equals input_size.

    args:
        img(np.array): original image
        input_size(int): target length of the longest side
    return(array):
        the rescaled image
    """
    pixels = np.array(img)
    # Only the first two axes (height, width) take part in the scale.
    longest_side = np.max(pixels.shape[0:2])
    scale = float(input_size) / float(longest_side)
    return cv2.resize(pixels, None, None, fx=scale, fy=scale)
def draw_ocr(
    image: np.array,
    boxes: list,
    txts: list[str] = None,
    scores: list[float] = None,
    drop_score: float = 0.0,
    font_path: str = FONTPATH,
) -> np.array:
    """
    Draw OCR boxes on an image and optionally append a text panel.

    args:
        image(Image|array): RGB image
        boxes(list): boxes, each reshaped to a polygon of (x, y) points
        txts(list): the recognized texts; when None only boxes are drawn
        scores(list): score of each box; defaults to 1 for every box
        drop_score(float): boxes whose score is below this value (or NaN)
            are not drawn
        font_path: the path of font which is used to draw text
    return(Image|array):
        the image with boxes, with the text panel appended on the right
        when txts is given
    """
    if scores is None:
        scores = [1] * len(boxes)

    for idx, raw_box in enumerate(boxes):
        if scores[idx] < drop_score or math.isnan(scores[idx]):
            continue
        polygon = np.reshape(np.array(raw_box), [-1, 1, 2]).astype(np.int64)
        image = cv2.polylines(
            np.array(image), [polygon], True, (255, 0, 0), 2
        )

    if txts is None:
        return image

    canvas = np.array(image)
    text_panel = text_visual(
        txts,
        scores,
        img_h=canvas.shape[0],
        img_w=600,
        threshold=drop_score,
        font_path=font_path,
    )
    return np.concatenate([np.array(canvas), np.array(text_panel)], axis=1)
def draw_ocr_2(
    img: np.array,
    results: list[list, tuple([str, float])],
) -> list[np.array, np.array]:
    """
    Draw OCR boxes on the image and the recognized text on a white canvas.

    args:
        img(Image|array): RGB image
        results(list): one entry per detection; entry[0] holds the four
            box corner points and entry[1][0] the recognized text
    return(list):
        [image with box outlines, white canvas with the texts]
    """
    img = np.asarray(img)
    height, width = img.shape[0], img.shape[1]
    img_text = np.ones((height, width, 3), np.uint8) * 255

    for line in results:
        text = line[1][0]
        corners = line[0]
        top = int(min(corners[0][1], corners[1][1]))
        bottom = int(max(corners[2][1], corners[3][1]))
        left = int(min(corners[0][0], corners[3][0]))
        # Font size follows the box height, never smaller than 1.
        text_size = max(1, int(bottom - top))
        # One random colour per detection, shared by outline and text.
        color = tuple(np.random.randint(0, 255) for _ in range(3))
        outline = np.reshape(np.array(corners), [-1, 1, 2]).astype(np.int64)
        img = cv2.polylines(np.array(img), [outline], True, color, 2)
        img_text = place_text(img_text, text, (left, top), text_size, color)

    return [img, img_text]
def place_text(
    img: np.array,
    text: str,
    top_left_point: list[int, int],
    text_size: int,
    text_color: tuple([int, int, int]),
) -> np.array:
    """
    Draw a text string onto a copy of the image.

    args:
        img(array): image to draw on
        text(str): text to be drawn
        top_left_point(array): top-left point where the text starts
        text_size(int): font size used for the text
        text_color(tuple): fill colour of the text
    return(array):
        a new array holding the image with the text drawn on it
    """
    typeface = ImageFont.truetype(FONTPATH, text_size, encoding="utf-8")
    canvas = Image.fromarray(img)
    ImageDraw.Draw(canvas).text(
        top_left_point,
        text,
        fill=text_color,
        font=typeface,
    )
    return np.array(canvas)
def visualize_result(
    result: list[list, tuple([str, float])],
    img: str,
) -> list:
    """
    Write the OCR result as a markdown file and as a side-by-side image.

    args:
        result(list): OCR output; only its first element is used
        img(str|array): path to the input image, or the image itself as
            an array
    return(list):
        [image with boxes, canvas with texts, value returned by
        Markdown.write]
    """
    result = result[0]
    if isinstance(img, str):
        image = Image.open(img).convert("RGB")
        img_name = Path(img).stem
    else:
        # Normalize to RGB like the file branch, so the 3-channel text
        # canvas can always be concatenated with the image.
        image = Image.fromarray(img).convert("RGB")
        img_name = Path("gradio_input")

    # Append the suffixes instead of using with_suffix, which would
    # replace the tail of a dotted stem ("a.b" -> "a.out.md").
    out_dir = Path(OUT_DIR)

    # Write results to markdown file
    format_md = Markdown(result, image)
    md_path = out_dir / f"{img_name}.out.md"
    markdown = format_md.write(md_path)

    # Write results to image file
    [img_boxes, img_text] = draw_ocr_2(image, result)
    img_combination = np.concatenate(
        [np.array(img_boxes), np.array(img_text)],
        axis=1,
    )
    img_out_path = out_dir / f"{img_name}.out.jpg"
    # cv2.imwrite expects BGR channel order while the arrays here are RGB;
    # convert so the saved file keeps the right colours.
    cv2.imwrite(
        str(img_out_path),
        cv2.cvtColor(img_combination, cv2.COLOR_RGB2BGR),
    )
    return [img_boxes, img_text, markdown]