Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,109 +1,102 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
-
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
# =========================
|
| 8 |
-
# Initialisation OCR
|
| 9 |
-
# =========================
|
| 10 |
-
ocr = PaddleOCR(use_angle_cls=True, lang="fr")
|
| 11 |
-
table_engine = PPStructure(show_log=False)
|
| 12 |
-
|
| 13 |
-
# =========================
|
| 14 |
-
# Fonction principale
|
| 15 |
-
# =========================
|
| 16 |
-
def process_images(images):
|
| 17 |
-
all_rows = []
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
html = block["res"]["html"]
|
| 28 |
|
| 29 |
-
|
| 30 |
-
try:
|
| 31 |
-
tables = pd.read_html(html)
|
| 32 |
-
except:
|
| 33 |
-
continue
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
continue
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
if pd.isna(cell):
|
| 44 |
-
continue
|
| 45 |
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
"colonne_2": text
|
| 56 |
-
})
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
output_txt = "/app/resultats_colonne_2.txt"
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
for
|
| 74 |
-
|
| 75 |
-
current_image = row["image"]
|
| 76 |
-
f.write(f"\n===== {current_image} =====\n")
|
| 77 |
-
f.write(row["colonne_2"] + "\n")
|
| 78 |
|
| 79 |
-
return df,
|
| 80 |
|
| 81 |
|
| 82 |
-
# =========================
|
| 83 |
# Interface Gradio
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
gr.Markdown("## 📄 Extraction OCR – 2ᵉ colonne (textes uniquement)")
|
| 87 |
|
| 88 |
-
|
|
|
|
| 89 |
file_types=[".jpg", ".jpeg", ".png"],
|
| 90 |
-
file_count="multiple"
|
| 91 |
-
label="📤 Importer les images"
|
| 92 |
)
|
| 93 |
|
| 94 |
-
btn = gr.Button("
|
| 95 |
|
| 96 |
-
table_output = gr.Dataframe(label="
|
| 97 |
-
csv_output = gr.File(label="
|
| 98 |
-
txt_output = gr.File(label="
|
| 99 |
|
| 100 |
btn.click(
|
| 101 |
-
fn=
|
| 102 |
-
inputs=
|
| 103 |
outputs=[table_output, csv_output, txt_output]
|
| 104 |
)
|
| 105 |
|
| 106 |
demo.launch()
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
|
|
|
|
| 1 |
import os
|
| 2 |
+
import cv2
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
+
import gradio as gr
|
| 5 |
+
from paddleocr import PaddleOCR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
# OCR engine shared by every request: French language model with the
# text-angle classifier enabled, library logging silenced, GPU disabled.
ocr = PaddleOCR(use_angle_cls=True, lang="fr", show_log=False, use_gpu=False)
|
| 14 |
|
| 15 |
+
def _group_cells_into_rows(cells, threshold=20):
    """Group OCR cells into visual rows, each ordered left to right.

    Args:
        cells: iterable of ``(y, x, text)`` tuples, where ``y`` and ``x`` are
            the coordinates of the detected box's first corner.
        threshold: a cell whose ``y`` differs from the previously visited
            cell's ``y`` by at least this many pixels starts a new row.

    Returns:
        A list of rows; each row is a list of ``(x, text)`` tuples sorted by
        ``x`` (ties broken by ``text``).
    """
    rows = []
    current_row = []
    last_y = None

    # Visit cells top-to-bottom, then left-to-right.
    for y, x, text in sorted(cells, key=lambda cell: (cell[0], cell[1])):
        # The comparison is against the previous cell (not the first cell of
        # the row), so a row may drift vertically by less than `threshold`
        # between consecutive cells.
        if last_y is not None and abs(y - last_y) >= threshold:
            rows.append(sorted(current_row))
            current_row = []
        current_row.append((x, text))
        last_y = y

    if current_row:
        rows.append(sorted(current_row))
    return rows


def extract_second_column(images):
    """Run OCR on the uploaded images and collect the second cell of each row.

    For every image, the recognised text boxes are grouped into rows by
    vertical position; the text of the second box (index 1, left to right) of
    each row that has at least two boxes is kept.

    Args:
        images: uploaded files, given either as objects exposing the file
            path in ``.name`` or as plain path strings. ``None`` or an empty
            list yields empty outputs instead of an error.

    Returns:
        A ``(df, csv_path, txt_path)`` tuple: a DataFrame with the single
        column ``"colonne_2"``, and the paths of the CSV and TXT files written
        under ``output/`` (overwritten on every call).
    """
    results_col2 = []

    # `images` is None when the callback fires without any uploaded file.
    for image_file in images or []:
        # Accept plain path strings as well as file objects carrying `.name`.
        if isinstance(image_file, str):
            image_path = image_file
        else:
            image_path = image_file.name

        ocr_result = ocr.ocr(image_path, cls=True)

        # Nothing recognised on this image.
        if not ocr_result or not ocr_result[0]:
            continue

        # Each OCR line is [box, (text, score)]; keep the first corner of the
        # box as the cell position.
        cells = [
            (line[0][0][1], line[0][0][0], line[1][0])
            for line in ocr_result[0]
        ]

        # Second column = index 1 of every row that has at least two cells.
        results_col2.extend(
            row[1][1]
            for row in _group_cells_into_rows(cells)
            if len(row) >= 2
        )

    # Write the output files.
    os.makedirs("output", exist_ok=True)
    csv_path = "output/resultats_colonne_2.csv"
    txt_path = "output/resultats_colonne_2.txt"

    df = pd.DataFrame({"colonne_2": results_col2})
    df.to_csv(csv_path, index=False, encoding="utf-8")

    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(f"{item}\n" for item in results_col2)

    return df, csv_path, txt_path
|
| 72 |
|
| 73 |
|
|
|
|
| 74 |
# Gradio interface: upload images, run the extraction, then show the table
# and offer the CSV/TXT files for download.
with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
    gr.Markdown("## 📄 Extraction OCR – Deuxième colonne des tableaux")

    # Multi-file upload restricted to JPEG/PNG extensions.
    images_input = gr.File(
        file_count="multiple",
        file_types=[".jpg", ".jpeg", ".png"],
        label="Téléverser les images (JPEG/PNG)",
    )

    btn = gr.Button("Extraire la colonne 2")

    # Output widgets, in the order returned by the callback.
    table_output = gr.Dataframe(label="Résultat – Colonne 2")
    csv_output = gr.File(label="Télécharger CSV")
    txt_output = gr.File(label="Télécharger TXT")

    # Wire the button to the extraction callback.
    btn.click(
        fn=extract_second_column,
        inputs=images_input,
        outputs=[table_output, csv_output, txt_output],
    )

demo.launch()
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
|
| 101 |
|
| 102 |
|