Spaces:
Runtime error
Runtime error
change vietocr to PaddleOCR
Browse files
- README.md +5 -4
- app.py +27 -16
- requirements.txt +3 -1
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title: Table Extraction
|
| 3 |
emoji: π
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: streamlit
|
| 7 |
sdk_version: 1.21.0
|
| 8 |
app_file: app.py
|
|
@@ -12,7 +12,6 @@ pinned: false
|
|
| 12 |
|
| 13 |
Imported from https://huggingface.co/spaces/jurgendn/table-extraction with some adjustments.
|
| 14 |
|
| 15 |
-
|
| 16 |
Current pipeline:
|
| 17 |
|
| 18 |
Table detection: https://huggingface.co/microsoft/table-transformer-detection
|
|
@@ -20,3 +19,5 @@ Table detection: https://huggingface.co/microsoft/table-transformer-detection
|
|
| 20 |
Table recognition: https://huggingface.co/microsoft/table-transformer-structure-recognition
|
| 21 |
|
| 22 |
OCR: https://github.com/pbcquoc/vietocr
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Table Extraction (Table Transformer + PaddleOCR)
|
| 3 |
emoji: π
|
| 4 |
+
colorFrom: while
|
| 5 |
+
colorTo: black
|
| 6 |
sdk: streamlit
|
| 7 |
sdk_version: 1.21.0
|
| 8 |
app_file: app.py
|
|
|
|
| 12 |
|
| 13 |
Imported from https://huggingface.co/spaces/jurgendn/table-extraction with some adjustments.
|
| 14 |
|
|
|
|
| 15 |
Current pipeline:
|
| 16 |
|
| 17 |
Table detection: https://huggingface.co/microsoft/table-transformer-detection
|
|
|
|
| 19 |
Table recognition: https://huggingface.co/microsoft/table-transformer-structure-recognition
|
| 20 |
|
| 21 |
OCR: https://github.com/pbcquoc/vietocr
|
| 22 |
+
|
| 23 |
+
OCR-new: https://github.com/PaddlePaddle/PaddleOCR
|
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import asyncio
|
| 2 |
import string
|
|
|
|
| 3 |
from collections import Counter
|
| 4 |
from itertools import count, tee
|
| 5 |
|
|
@@ -11,8 +12,9 @@ import streamlit as st
|
|
| 11 |
import torch
|
| 12 |
from PIL import Image
|
| 13 |
from transformers import DetrImageProcessor, TableTransformerForObjectDetection
|
| 14 |
-
from
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 18 |
st.set_page_config(layout='wide')
|
|
@@ -20,13 +22,6 @@ st.title("Table Detection and Table Structure Recognition")
|
|
| 20 |
st.write(
|
| 21 |
"Implemented by MSFT team: https://github.com/microsoft/table-transformer")
|
| 22 |
|
| 23 |
-
# config = Cfg.load_config_from_name('vgg_transformer')
|
| 24 |
-
config = Cfg.load_config_from_name('vgg_seq2seq')
|
| 25 |
-
config['cnn']['pretrained'] = False
|
| 26 |
-
config['device'] = 'cpu'
|
| 27 |
-
config['predictor']['beamsearch'] = False
|
| 28 |
-
detector = Predictor(config)
|
| 29 |
-
|
| 30 |
table_detection_model = TableTransformerForObjectDetection.from_pretrained(
|
| 31 |
"microsoft/table-transformer-detection")
|
| 32 |
|
|
@@ -43,10 +38,20 @@ def cv_to_PIL(cv_img):
|
|
| 43 |
|
| 44 |
|
| 45 |
async def pytess(cell_pil_img, threshold: float = 0.5):
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def sharpen_image(pil_img):
|
|
@@ -179,8 +184,8 @@ class TableExtractionPipeline():
|
|
| 179 |
|
| 180 |
# colors = ["red", "blue", "green", "red", "red", "red"]
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
top,
|
| 185 |
right,
|
| 186 |
bottom,
|
|
@@ -449,11 +454,17 @@ class TableExtractionPipeline():
|
|
| 449 |
|
| 450 |
c3.dataframe(df)
|
| 451 |
csv = self.convert_df(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
c3.download_button("Download table",
|
| 453 |
csv,
|
| 454 |
"file.csv",
|
| 455 |
"text/csv",
|
| 456 |
-
key='download-csv-' +
|
| 457 |
|
| 458 |
return df
|
| 459 |
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import string
|
| 3 |
+
import random
|
| 4 |
from collections import Counter
|
| 5 |
from itertools import count, tee
|
| 6 |
|
|
|
|
| 12 |
import torch
|
| 13 |
from PIL import Image
|
| 14 |
from transformers import DetrImageProcessor, TableTransformerForObjectDetection
|
| 15 |
+
from paddleocr import PaddleOCR
|
| 16 |
+
|
| 17 |
+
ocr = PaddleOCR(use_angle_cls=True, lang="en",use_gpu=False)
|
| 18 |
|
| 19 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 20 |
st.set_page_config(layout='wide')
|
|
|
|
| 22 |
st.write(
|
| 23 |
"Implemented by MSFT team: https://github.com/microsoft/table-transformer")
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
table_detection_model = TableTransformerForObjectDetection.from_pretrained(
|
| 26 |
"microsoft/table-transformer-detection")
|
| 27 |
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
async def pytess(cell_pil_img, threshold: float = 0.5):
|
| 41 |
+
cell_pil_img = TableExtractionPipeline.add_padding(pil_img=cell_pil_img, top=50, right=30, bottom=50, left=30, color=(255, 255, 255))
|
| 42 |
+
result = ocr.ocr(np.asarray(cell_pil_img), cls=True)[0]
|
| 43 |
+
|
| 44 |
+
#Debug
|
| 45 |
+
# filename = str(random.random())
|
| 46 |
+
# cell_pil_img.save("dump/" + filename + ".png")
|
| 47 |
+
# print(filename)
|
| 48 |
+
# print(result)
|
| 49 |
+
|
| 50 |
+
text = ""
|
| 51 |
+
if result != None:
|
| 52 |
+
txts = [line[1][0] for line in result]
|
| 53 |
+
text = " ".join(txts)
|
| 54 |
+
return text
|
| 55 |
|
| 56 |
|
| 57 |
def sharpen_image(pil_img):
|
|
|
|
| 184 |
|
| 185 |
# colors = ["red", "blue", "green", "red", "red", "red"]
|
| 186 |
|
| 187 |
+
@staticmethod
|
| 188 |
+
def add_padding(pil_img,
|
| 189 |
top,
|
| 190 |
right,
|
| 191 |
bottom,
|
|
|
|
| 454 |
|
| 455 |
c3.dataframe(df)
|
| 456 |
csv = self.convert_df(df)
|
| 457 |
+
|
| 458 |
+
try:
|
| 459 |
+
numkey = df.iloc[0, 0]
|
| 460 |
+
except:
|
| 461 |
+
numkey = str(0)
|
| 462 |
+
|
| 463 |
c3.download_button("Download table",
|
| 464 |
csv,
|
| 465 |
"file.csv",
|
| 466 |
"text/csv",
|
| 467 |
+
key='download-csv-' + numkey)
|
| 468 |
|
| 469 |
return df
|
| 470 |
|
requirements.txt
CHANGED
|
@@ -6,4 +6,6 @@ vietocr==0.3.11
|
|
| 6 |
streamlit==1.21.0
|
| 7 |
pandas
|
| 8 |
transformers==4.29.1
|
| 9 |
-
Pillow==9.5.0
|
|
|
|
|
|
|
|
|
| 6 |
streamlit==1.21.0
|
| 7 |
pandas
|
| 8 |
transformers==4.29.1
|
| 9 |
+
Pillow==9.5.0
|
| 10 |
+
paddlepaddle
|
| 11 |
+
paddleocr
|