Spaces:
Sleeping
Sleeping
| """ | |
| Author: Khanh Phan | |
| Date: 2023-11-01 | |
| """ | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| # from mdutils import Html | |
| from mdutils.mdutils import MdUtils | |
| from src.settings import ( | |
| MAX_CHAR, | |
| TEXT_SCALE, | |
| ) | |
class Markdown:
    """
    Convert OCR recognition results into a markdown document.

    Each recognized text box becomes one row of ``self.res``; boxes that sit
    on the same visual line are merged before the markdown file is written.
    """

    def __init__(self, result: list = None, image: object = None) -> None:
        """
        Build the table of text boxes from recognition results
        args:
            result(list): recognition results; each item is indexed as
                (four corner points, (text, score)). None is treated as
                an empty result.
            image: source image. Either an object whose ``size`` is
                (width, height), or a NumPy array whose shape starts
                with (height, width).
        raises:
            ValueError: if no image is given (the page width is needed
                for alignment and merge decisions)
        """
        self.scale = TEXT_SCALE

        if image is None:
            raise ValueError("image is required to determine the page size")
        if isinstance(image, np.ndarray):
            # ndarray.size is a single element count, so read the shape.
            self.im_height, self.im_width = image.shape[:2]
        else:
            self.im_width, self.im_height = image.size

        lines = [] if result is None else result
        rows = [
            (
                get_text_height(line),
                line[0][0][1],
                line[0][0][0],
                line[0][3][1],
                line[0][1][0],
                line[1][0],
                line[1][1],
                "left",
                False,
            )
            for line in lines
        ]
        self.res = pd.DataFrame(
            rows,
            columns=[
                "box_h",  # height of bbox
                "top",  # y of the top-left corner
                "left",  # x of the top-left corner
                "bot",  # y of the bottom-left corner
                "right",  # x of the top-right corner
                "text",  # recognized text
                "score",
                "align",  # left, center, right
                "ignore",  # True once the row was merged into another
            ],
        )

    @staticmethod
    def _resolve_md_path(out_path) -> Path:
        """Return out_path as a Path, appending ".out.md" if it is not a .md file."""
        out_path = Path(out_path)
        if out_path.suffix != ".md":
            # Path does not support "+" with a string; build the name as text.
            out_path = Path(f"{out_path}.out.md")
        return out_path

    def write(self, out_path: Path = Path(".")) -> str:
        """
        Write markdown text to file
        args:
            out_path(Path): path to write md file; ".out.md" is appended
                when the suffix is not ".md"
        return(str):
            content of the markdown file that was written
        """
        self.check_same_line()
        # TODO: add space between line
        out_path = self._resolve_md_path(out_path)

        md_file = MdUtils(file_name=str(out_path))
        for _, row in self.res.iterrows():
            if row["ignore"]:
                continue
            text = row["text"]
            align = self.edit_align(row["left"], row["right"])
            if align == "right":
                # Left-pad so the text ends near column MAX_CHAR.
                text = " " * (MAX_CHAR - len(text)) + text
            md_file.new_paragraph(f"{text}", align=align)
        md_file.create_md_file()

        return out_path.read_text(encoding="utf-8")

    def sort_text(self) -> None:
        """
        Sort rows by their top coordinate (vertical order) and renumber
        the index from 0.
        """
        self.res = self.res.sort_values("top")
        self.res = self.res.reset_index(drop=True)

    def _merge_rows(self, keep, absorb) -> None:
        """Append the text of row `absorb` to row `keep` and mark `absorb` ignored."""
        space = calculate_space(
            self.res.at[keep, "box_h"],
            self.res.at[keep, "right"],
            self.res.at[absorb, "left"],
        )
        self.res.at[keep, "text"] = (
            self.res.at[keep, "text"]
            + " " * int(space)
            + self.res.at[absorb, "text"]
        )
        self.res.at[absorb, "ignore"] = True
        # The merged row now extends to the right edge of the absorbed one.
        self.res.at[keep, "right"] = self.res.at[absorb, "right"]

    def check_same_line(self) -> None:
        """
        Merge strings that can be connected in the same line.

        Each row is compared with the next two rows. When two rows overlap
        vertically and one starts (almost) where the other ends, the right
        one is appended to the left one and flagged as ignored.
        """
        n = len(self.res)
        labels = self.res.index
        self.same_line_pair = []
        # Boxes may overlap horizontally by up to 1% of the page width.
        gap_tolerance = -self.im_width / 100

        for i in range(n):
            cur = labels[i]
            # Truthiness, not "is True": values read from the frame are
            # typically NumPy booleans, which are never identical to True.
            if self.res.at[cur, "ignore"]:
                continue
            for j in range(i + 1, min(i + 3, n)):
                nxt = labels[j]
                if self.res.at[nxt, "ignore"]:
                    continue
                if not check_overlap(
                    self.res.at[cur, "bot"],
                    self.res.at[cur, "top"],
                    self.res.at[nxt, "bot"],
                    self.res.at[nxt, "top"],
                ):
                    continue
                if (
                    self.res.at[nxt, "left"] - self.res.at[cur, "right"]
                    > gap_tolerance
                ):
                    # nxt is to the right of cur.
                    self._merge_rows(cur, nxt)
                elif (
                    self.res.at[cur, "left"] - self.res.at[nxt, "right"]
                    > gap_tolerance
                ):
                    # cur is to the right of nxt; cur is absorbed, so stop
                    # pairing it, otherwise later text would be appended
                    # to an ignored row and dropped from the output.
                    self._merge_rows(nxt, cur)
                    break

    def edit_align(self, left: int, right: int) -> str:
        """
        Find the best alignment from left & right margin
        args:
            left(int): left coordinate of the text
            right(int): right coordinate of the text
        return(str):
            "left", "center" or "right"
        """
        align = "left"
        left_margin = left
        right_margin = self.im_width - right
        if abs(left_margin + right_margin) > self.im_width * 2 / 3:
            align = "center"
        # A margin wider than half the page overrides the guess above.
        if self.im_width / 2 < right_margin:
            align = "left"
        if self.im_width / 2 < left_margin:
            align = "right"
        return align
def calculate_space(height1: int, right1: int, left2: int) -> int:
    """
    Calculate the number of spaces between 2 strings.

    The horizontal gap is measured in units of the text height, so one
    space corresponds to a gap as wide as the box is tall.
    args:
        height1(int): height of the bbox of string 1
        right1(int): right coord of string 1
        left2(int): left coord of string 2
    return(int):
        number of spaces (0 when the height is not positive)
    """
    if height1 <= 0:
        # A degenerate box gives no scale to measure the gap with.
        return 0
    distance = abs(left2 - right1)
    # int() keeps the result usable for string repetition even when the
    # inputs are NumPy scalars.
    return int(round(distance / height1))
def check_overlap(max1: int, min1: int, max2: int, min2: int) -> bool:
    """
    Check whether two intervals overlap enough to count as the same line.

    The intervals are [min1, max1] and [min2, max2]. They match when the
    shared part covers more than half of either interval.
    args:
        max1(int): upper bound of interval 1
        min1(int): lower bound of interval 1
        max2(int): upper bound of interval 2
        min2(int): lower bound of interval 2
    return(bool):
        Same line or not
    """
    overlap = max(0, min(max1, max2) - max(min1, min2))
    for span in (max1 - min1, max2 - min2):
        # Skip empty intervals: they cannot be divided by and can never
        # reach the 50% threshold.
        if span > 0 and overlap / span > 0.5:
            return True
    return False
def get_text_height(line) -> int:
    """
    Measure the height of a recognized text box
    args:
        line(array): recognition result; line[0] holds the four corner
            points, each indexed as (x, y)
    return(int):
        height of bbox in pixels
    """
    box = line[0]
    # Points 0 and 1 form the upper edge, points 2 and 3 the lower edge.
    upper_edge = int(min(box[k][1] for k in (0, 1)))
    lower_edge = int(max(box[k][1] for k in (2, 3)))
    return lower_edge - upper_edge
if __name__ == "__main__":
    # Script entry point: build a Markdown object with its default arguments
    # and write the document to the default output path.
    # NOTE(review): no recognition results or image are passed here; confirm
    # whether this should load sample inputs first.
    Markdown().write()