Spaces:

Raj-Master
/

tabel_ocr

Runtime error

App Files Files Community

Raj-Master committed on May 28, 2023

Commit

e1422df

1 Parent(s): 4911ff5

Create file_utils.py

Browse files

Files changed (1) hide show

file_utils.py +109 -0

file_utils.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import os
+import cv2
+import math
+import numpy as np
+import pandas as pd
+from pdf2image import convert_from_bytes
+import streamlit as st
def get_img(uploaded_file):
    """Decode an uploaded file object into an OpenCV image.

    ``uploaded_file`` must expose ``read()`` returning the encoded image
    bytes. The whole payload is read, wrapped in a ``uint8`` array and
    handed to ``cv2.imdecode`` with the colour flag.

    Returns whatever ``cv2.imdecode`` returns for that buffer.
    """
    raw_bytes = bytearray(uploaded_file.read())
    encoded = np.asarray(raw_bytes, dtype=np.uint8)
    # IMREAD_COLOR is the named form of the flag value 1.
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
def convert_pdf_to_image(filename, dpi=500):
    """Render every page of a PDF and return the pages as a list of images.

    Args:
        filename: Despite the name, this is forwarded unchanged to
            ``convert_from_bytes``, so it has to be the raw PDF content
            (bytes), not a path.
        dpi: Rendering resolution passed to ``convert_from_bytes``.
            Defaults to 500, the value that used to be hard-coded.

    Returns:
        The list produced by ``convert_from_bytes``, one entry per page.
    """
    return convert_from_bytes(filename, dpi)
def filter_color(img):
    """Build an inverted mask of the dark, low-saturation pixels of ``img``.

    The image is converted from BGR to HSV and thresholded with
    ``cv2.inRange``: pixels whose HSV value lies inside the bounds below
    are selected. The mask is then inverted, so selected pixels come out
    as 0 and every other pixel as 255.

    Returns:
        The inverted single-channel mask.
    """
    # HSV bounds: any hue (0-179), saturation up to 100, value up to 130.
    hsv_lower = np.array([0, 0, 0])
    hsv_upper = np.array([179, 100, 130])
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    in_range = cv2.inRange(hsv_img, hsv_lower, hsv_upper)
    # Invert the mask: in-range pixels become 0, the rest 255.
    return cv2.bitwise_not(in_range)
def plot(img, boxes):
    """Draw labelled rectangles for ``boxes`` on a copy of ``img``.

    Args:
        img: Image array. Both 3-D (height, width, channels) and 2-D
            (height, width) arrays are accepted.
        boxes: Iterable of indexable items. Positions 0-3 are used as the
            corner coordinates ``x1, y1, x2, y2`` and position 4 is
            rendered as text rounded to two decimals (presumably a
            confidence score -- confirm against the caller).

    Returns:
        A copy of ``img`` with the rectangles and labels drawn; the input
        array itself is not modified.
    """
    FONT_SCALE = 1e-3
    THICKNESS_SCALE = 1e-3
    TEXT_Y_OFFSET_SCALE = 2.5e-2
    BOX_COLOR = (0, 0, 255)  # red when the image is in BGR channel order

    # Read only the first two dimensions so a single-channel (2-D) image
    # does not fail on a three-way unpack of ``img.shape``.
    height, width = img.shape[:2]
    shorter_side = min(width, height)
    # Font size and line width scale with the image so annotations stay
    # legible at any resolution.
    font_scale = shorter_side * FONT_SCALE
    thickness = math.ceil(shorter_side * THICKNESS_SCALE)
    # Shift the label down from the top-left corner, into the box.
    text_y_offset = int(height * TEXT_Y_OFFSET_SCALE)

    tmp = img.copy()
    for box in boxes:
        x1, y1 = int(box[0]), int(box[1])
        x2, y2 = int(box[2]), int(box[3])
        tmp = cv2.rectangle(tmp, (x1, y1), (x2, y2), BOX_COLOR, thickness)
        text = str(round(float(box[4]), 2))
        cv2.putText(
            tmp,
            text,
            (x1, y1 + text_y_offset),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            BOX_COLOR,
            thickness,
        )
    return tmp
def delete_file(filename):
    """Remove the file at ``filename``; do nothing if it does not exist.

    Other ``OSError`` failures raised by ``os.remove`` (for example a
    permission problem, or ``filename`` being a directory) still propagate.
    """
    # Attempt the removal directly instead of checking existence first:
    # a separate check leaves a window in which the file can disappear
    # between the check and the removal.
    try:
        os.remove(filename)
    except FileNotFoundError:
        pass
def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    """Write ``df`` as a CSV file inside ``foldername``.

    Note that, despite the function name, the output is CSV. The file is
    named ``<filename>page<page_enumeration>table<idx>.csv`` and is
    written without the index column.
    """
    target = f"{foldername}/{filename}page{page_enumeration}table{idx}.csv"
    df.to_csv(target, index=False)
def _page_table_key(name):
    """Return ``(page, table)`` parsed from ``...page<P>table<T>.csv``, else None."""
    # Split on the LAST "page" so a base name that itself contains the
    # word (e.g. "homepage") cannot shift the parse.
    _, page_marker, tail = name.rpartition("page")
    page_part, table_marker, table_part = tail.partition("table")
    if not page_marker or not table_marker:
        return None
    try:
        return int(page_part), int(table_part.split(".")[0])
    except ValueError:
        return None


def concat_csv(folder, filename: str):
    """Merge the per-table CSV files in ``folder`` into one CSV.

    Files are expected to be named ``<base>page<P>table<T>.csv``. They are
    processed in numeric (page, table) order; files whose names do not
    follow that pattern are ignored.

    For every table the first data row is dropped from the body. The first
    data row of the first non-empty table supplies the column names of the
    merged result. If the merged frame is non-empty it is shown with
    ``st.dataframe`` and written to ``filename`` without the index column;
    otherwise nothing is displayed or written.

    Args:
        folder: Object whose ``name`` attribute is the directory path
            (for example a ``tempfile.TemporaryDirectory``).
        filename: Destination path of the merged CSV.

    Raises:
        ValueError: From pandas, if the merged frame does not have as many
            columns as the header row taken from the first table.
    """
    foldername = folder.name

    keyed_names = []
    for name in os.listdir(foldername):
        key = _page_table_key(name)
        if key is not None:
            keyed_names.append((key, name))
    # Integer keys: page 2 sorts before page 10, and tables within a page
    # keep a deterministic order regardless of directory listing order.
    keyed_names.sort()

    columns = None
    parts = []
    for _, name in keyed_names:
        try:
            tmp = pd.read_csv(f"{foldername}/{name}")
        except pd.errors.EmptyDataError:
            # Zero-byte file: there is no table to merge.
            continue
        if tmp.empty:
            # Header-only file: no row to use as header or body.
            continue
        if columns is None:
            columns = tmp.iloc[0]
        parts.append(tmp.iloc[1:])

    if not parts:
        return
    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(parts)
    if not df.empty:
        df.columns = columns
        st.dataframe(df)
        df.to_csv(filename, index=False)