Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,79 +1,106 @@
|
|
| 1 |
-
import Sys
|
| 2 |
-
sys.exit("Arret forcé")
|
| 3 |
import os
|
| 4 |
import cv2
|
| 5 |
import pandas as pd
|
| 6 |
-
import gradio as gr
|
| 7 |
from paddleocr import PaddleOCR
|
|
|
|
| 8 |
|
| 9 |
-
#
|
|
|
|
|
|
|
| 10 |
ocr = PaddleOCR(
|
| 11 |
use_angle_cls=True,
|
| 12 |
-
lang="
|
| 13 |
show_log=False
|
| 14 |
)
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
image = cv2.imread(image_path)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
#
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
df = pd.DataFrame({
|
| 48 |
-
"
|
| 49 |
})
|
| 50 |
|
| 51 |
-
#
|
| 52 |
-
|
| 53 |
-
|
|
|
|
| 54 |
|
| 55 |
df.to_csv(csv_path, index=False, encoding="utf-8")
|
|
|
|
| 56 |
with open(txt_path, "w", encoding="utf-8") as f:
|
| 57 |
-
for
|
| 58 |
-
f.write(
|
| 59 |
|
| 60 |
return df, csv_path, txt_path
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
#
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
csv_file = gr.File(label="Télécharger CSV")
|
| 69 |
-
txt_file = gr.File(label="Télécharger TXT")
|
| 70 |
-
|
| 71 |
-
btn.click(
|
| 72 |
-
extract_second_column_image0,
|
| 73 |
-
outputs=[table, csv_file, txt_file]
|
| 74 |
-
)
|
| 75 |
|
| 76 |
-
demo.launch()
|
| 77 |
|
| 78 |
|
| 79 |
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import cv2
|
| 3 |
import pandas as pd
|
|
|
|
| 4 |
from paddleocr import PaddleOCR
|
| 5 |
+
import gradio as gr
|
| 6 |
|
| 7 |
+
# =========================
|
| 8 |
+
# INITIALISATION OCR
|
| 9 |
+
# =========================
|
| 10 |
# Options for the single OCR engine shared by the whole module.
_OCR_OPTIONS = {
    "use_angle_cls": True,
    "lang": "en",  # English, for the invoice
    "show_log": False,
}
ocr = PaddleOCR(**_OCR_OPTIONS)
|
| 15 |
|
| 16 |
+
# =========================
|
| 17 |
+
# FONCTION PRINCIPALE
|
| 18 |
+
# =========================
|
| 19 |
+
def _error_result(message):
    """Wrap an error message in the same 3-tuple shape as a successful run."""
    # The first output is rendered as a table, so the message is placed in a
    # one-cell DataFrame instead of being returned as a bare string.
    return pd.DataFrame({"Erreur": [message]}), None, None


def _collect_text_boxes(result):
    """Flatten an OCR result into ``(x_center, y_center, text)`` tuples."""
    boxes = []
    for page in result or []:
        # Some PaddleOCR versions report a page without detected text as
        # None (or an empty list); skip it instead of iterating over None.
        if not page:
            continue
        for box, (text, _conf) in page:
            x_center = sum(p[0] for p in box) / len(box)
            y_center = sum(p[1] for p in box) / len(box)
            boxes.append((x_center, y_center, text.strip()))
    return boxes


def _group_into_columns(boxes, tolerance):
    """Cluster boxes into columns by x position and return them left to right."""
    columns = []
    # Visit boxes left to right so every column is anchored on its leftmost box.
    for x, y, text in sorted(boxes, key=lambda b: b[0]):
        for col in columns:
            if abs(col["x"] - x) < tolerance:
                col["items"].append((y, text))
                break
        else:
            columns.append({"x": x, "items": [(y, text)]})
    columns.sort(key=lambda c: c["x"])
    return columns


def extract_second_column(image_path):
    """Extract only the second column (Description) of a table image.

    The image is run through the module-level ``ocr`` engine, the detected
    text boxes are clustered into columns by horizontal position, and the
    texts of the second column from the left are written to a CSV and a
    plain-text file.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A ``(df, csv_path, txt_path)`` tuple. ``df`` is a DataFrame with one
        ``"Description"`` column. When the image cannot be read or fewer than
        two columns are found, ``df`` holds a single ``"Erreur"`` cell with
        the message and both paths are ``None``.
    """
    if not image_path:
        return _error_result("Erreur : aucune image fournie")

    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable file by returning None, not by raising.
    if image is None:
        return _error_result("Erreur : image illisible")

    result = ocr.ocr(image, cls=True)

    tolerance = 50  # pixels
    columns = _group_into_columns(_collect_text_boxes(result), tolerance)

    if len(columns) < 2:
        return _error_result("Erreur : colonne 2 non détectée")

    # Column 2 = Description. Restore top-to-bottom reading order, which the
    # left-to-right sort used for clustering does not preserve.
    rows = sorted(columns[1]["items"], key=lambda item: item[0])

    # Cleanup: drop very short fragments (2 characters or fewer).
    colonne_2 = [text for _y, text in rows if len(text) > 2]

    df = pd.DataFrame({"Description": colonne_2})

    # Output files.
    # NOTE(review): these paths are fixed, so concurrent requests would
    # overwrite each other's files - confirm whether that matters here.
    os.makedirs("outputs", exist_ok=True)
    csv_path = "outputs/description_colonne_2.csv"
    txt_path = "outputs/description_colonne_2.txt"

    df.to_csv(csv_path, index=False, encoding="utf-8")

    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in colonne_2)

    return df, csv_path, txt_path
|
| 82 |
|
| 83 |
+
# =========================
|
| 84 |
+
# INTERFACE GRADIO
|
| 85 |
+
# =========================
|
| 86 |
+
# Input widget: the uploaded image is handed to the callback as a file path.
_image_input = gr.Image(type="filepath", label="Image de facture")

# Output widgets, in the order of the values returned by the callback.
_result_outputs = [
    gr.Dataframe(label="Colonne 2 : Description"),
    gr.File(label="Télécharger CSV"),
    gr.File(label="Télécharger TXT (Bloc-notes)"),
]

interface = gr.Interface(
    fn=extract_second_column,
    inputs=_image_input,
    outputs=_result_outputs,
    title="Extraction OCR – Colonne Description",
    description="Extraction uniquement de la colonne texte (Description) d'une facture",
)
|
| 97 |
|
| 98 |
+
# =========================
|
| 99 |
+
# LANCEMENT
|
| 100 |
+
# =========================
|
| 101 |
+
# Start the web UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
|