Spaces:

Raj-Master
/

tabel_ocr

Runtime error

App Files Files Community

Raj-Master committed on May 28, 2023

Commit

4635598

1 Parent(s): e1422df

Create predict.py

Browse files

Files changed (1) hide show

predict.py +151 -0

predict.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import os
+import tempfile
+import random
+import string
+from ultralyticsplus import YOLO
+import streamlit as st
+import numpy as np
+import pandas as pd
+from process import (
+    filter_columns,
+    extract_text_of_col,
+    prepare_cols,
+    process_cols,
+    finalize_data,
+)
+from file_utils import (
+    get_img,
+    save_excel_file,
+    concat_csv,
+    convert_pdf_to_image,
+    filter_color,
+    plot,
+    delete_file,
+)
+def process_img(
+    img,
+    page_enumeration: int = 0,
+    filter=False,
+    foldername: str = "",
+    filename: str = "",
+):
+    tables = PaddleOCR.table_model(img, conf=0.75)
+    tables = tables[0].boxes.xyxy.cpu().numpy()
+    results = []
+    for table in tables:
+        try:
+            # * crop the table as an image from the original image
+            sub_img = img[
+                int(table[1].item()): int(table[3].item()),
+                int(table[0].item()): int(table[2].item()),
+            ]
+            columns_detect = PaddleOCR.column_model(sub_img, conf=0.75)
+            cols_data = columns_detect[0].boxes.data.cpu().numpy()
+            # * Sort columns according to the x coordinate
+            cols_data = np.array(
+                sorted(cols_data, key=lambda x: x[0]), dtype=np.ndarray
+            )
+            # * merge the duplicated columns
+            cols_data = filter_columns(cols_data)
+            st.image(plot(sub_img, cols_data), channels="RGB")
+        except:
+            st.warning("No Detection")
+        try:
+            columns = cols_data[:, 0:4]
+            sub_imgs = []
+            for column in columns:
+                # * Create list of cropped images for each column
+                sub_imgs.append(sub_img[:, int(column[0]): int(column[2])])
+            cols = []
+            thr = 0
+            for image in sub_imgs:
+                if filter:
+                    # * keep only black color in the image
+                    image = filter_color(image)
+                # * extract text of each column and get the length threshold
+                res, threshold = extract_text_of_col(image)
+                thr += threshold
+                # * arrange the rows of each column with respect to row length threshold
+                cols.append(prepare_cols(res, threshold * 0.6))
+            thr = thr / len(sub_imgs)
+            # * append each element in each column to its right place in the dataframe
+            data = process_cols(cols, thr * 0.6)
+            # * merge the related rows together
+            data: pd.DataFrame = finalize_data(data, page_enumeration)
+            results.append(data)
+            print("data : ",data)
+            print("results : ", results)
+        except:
+            st.warning("Text Extraction Failed")
+            continue
+    list(
+        map(
+            lambda x: save_excel_file(
+                *x,
+                foldername,
+                filename,
+                page_enumeration,
+            ),
+            enumerate(results),
+        )
+    )
+class PaddleOCR:
+    # Load Image Detection model
+    table_model = YOLO("table.pt")
+    column_model = YOLO("columns.pt")
+    def __call__(self, uploaded, filter=False):
+        foldername = tempfile.TemporaryDirectory(dir=os.getcwd())
+        filename = uploaded.name.split(".")[0]
+        if uploaded.name.split(".")[1].lower() == "pdf":
+            pdf_pages = convert_pdf_to_image(uploaded.read())
+            for page_enumeration, page in enumerate(pdf_pages, start=1):
+                process_img(
+                    np.asarray(page),
+                    page_enumeration,
+                    filter=filter,
+                    foldername=foldername.name,
+                    filename=filename,
+                )
+        else:
+            img = get_img(uploaded)
+            process_img(
+                img,
+                filter=filter,
+                foldername=foldername.name,
+                filename=filename,
+            )
+        # * concatenate all csv files if many
+        extra = "".join(random.choices(string.ascii_uppercase, k=5))
+        filename = f"{filename}_{extra}.csv"
+        try:
+            concat_csv(foldername, filename)
+        except:
+            st.warning("No results found")
+        foldername.cleanup()
+        if os.path.exists(filename):
+            with open(f"{filename}", "rb") as fp:
+                st.download_button(
+                    label="Download CSV file",
+                    data=fp,
+                    file_name=filename,
+                    mime="text/csv",
+                )
+            delete_file(filename)
+        else:
+            st.warning("No results found")