Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,11 +4,22 @@ import cv2
|
|
| 4 |
import pytesseract
|
| 5 |
import numpy as np
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def extract_descriptions(image: Image.Image):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 11 |
|
|
|
|
| 12 |
data = pytesseract.image_to_data(
|
| 13 |
img,
|
| 14 |
output_type=pytesseract.Output.DICT,
|
|
@@ -24,42 +35,88 @@ def extract_descriptions(image: Image.Image):
|
|
| 24 |
"x": data["left"][i],
|
| 25 |
"y": data["top"][i],
|
| 26 |
"w": data["width"][i],
|
| 27 |
-
"h": data["height"][i]
|
| 28 |
})
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
|
|
|
| 34 |
x_min = header["x"] - 10
|
| 35 |
x_max = header["x"] + header["w"] + 350
|
| 36 |
y_min = header["y"] + header["h"] + 10
|
| 37 |
|
| 38 |
-
|
|
|
|
| 39 |
w for w in words
|
| 40 |
if x_min <= w["x"] <= x_max and w["y"] > y_min
|
| 41 |
]
|
| 42 |
|
|
|
|
| 43 |
lines = {}
|
| 44 |
-
for w in
|
| 45 |
key = w["y"] // 15
|
| 46 |
lines.setdefault(key, []).append(w)
|
| 47 |
|
| 48 |
-
|
| 49 |
-
for
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
continue
|
| 53 |
-
results.append(line)
|
| 54 |
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
demo = gr.Interface(
|
| 59 |
fn=extract_descriptions,
|
| 60 |
-
inputs=gr.Image(type="pil"),
|
| 61 |
-
outputs=gr.Textbox(lines=20),
|
| 62 |
-
title="Extraction colonne Description
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
)
|
| 64 |
|
| 65 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pytesseract
|
| 5 |
import numpy as np
|
| 6 |
|
| 7 |
+
# 🔴 IMPORTANT : chemin ABSOLU vers tesseract (HF Docker)
|
| 8 |
+
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
| 9 |
+
|
| 10 |
|
| 11 |
def extract_descriptions(image: Image.Image):
|
| 12 |
+
"""
|
| 13 |
+
Extrait uniquement le contenu de la colonne 'Description'
|
| 14 |
+
depuis une image de facture (tableau).
|
| 15 |
+
"""
|
| 16 |
+
if image is None:
|
| 17 |
+
return "Aucune image fournie."
|
| 18 |
+
|
| 19 |
+
# Conversion PIL -> OpenCV
|
| 20 |
img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 21 |
|
| 22 |
+
# OCR avec positions
|
| 23 |
data = pytesseract.image_to_data(
|
| 24 |
img,
|
| 25 |
output_type=pytesseract.Output.DICT,
|
|
|
|
| 35 |
"x": data["left"][i],
|
| 36 |
"y": data["top"][i],
|
| 37 |
"w": data["width"][i],
|
| 38 |
+
"h": data["height"][i]
|
| 39 |
})
|
| 40 |
|
| 41 |
+
# 1️⃣ Détection de l'en-tête "Description"
|
| 42 |
+
header = next(
|
| 43 |
+
(w for w in words if w["text"].lower() == "description"),
|
| 44 |
+
None
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
if header is None:
|
| 48 |
+
return "❌ Colonne 'Description' non détectée."
|
| 49 |
|
| 50 |
+
# 2️⃣ Définition de la zone de la colonne Description
|
| 51 |
x_min = header["x"] - 10
|
| 52 |
x_max = header["x"] + header["w"] + 350
|
| 53 |
y_min = header["y"] + header["h"] + 10
|
| 54 |
|
| 55 |
+
# 3️⃣ Filtrage des mots dans cette colonne
|
| 56 |
+
column_words = [
|
| 57 |
w for w in words
|
| 58 |
if x_min <= w["x"] <= x_max and w["y"] > y_min
|
| 59 |
]
|
| 60 |
|
| 61 |
+
# 4️⃣ Regroupement par lignes (Y proche)
|
| 62 |
lines = {}
|
| 63 |
+
for w in column_words:
|
| 64 |
key = w["y"] // 15
|
| 65 |
lines.setdefault(key, []).append(w)
|
| 66 |
|
| 67 |
+
extracted_lines = []
|
| 68 |
+
for key in sorted(lines.keys()):
|
| 69 |
+
line_words = sorted(lines[key], key=lambda x: x["x"])
|
| 70 |
+
line_text = " ".join(w["text"] for w in line_words)
|
| 71 |
+
|
| 72 |
+
# Filtrage des éléments non désirés
|
| 73 |
+
if any(k in line_text.lower() for k in ["vat", "gross", "net", "each"]):
|
| 74 |
+
continue
|
| 75 |
+
if line_text.replace(".", "").replace(",", "").isdigit():
|
| 76 |
continue
|
|
|
|
| 77 |
|
| 78 |
+
extracted_lines.append(line_text)
|
| 79 |
+
|
| 80 |
+
# 5️⃣ Fusion des cellules multilignes
|
| 81 |
+
final_descriptions = []
|
| 82 |
+
buffer = ""
|
| 83 |
|
| 84 |
+
for line in extracted_lines:
|
| 85 |
+
# Détection de début de nouvelle ligne de cellule (ex: "1.")
|
| 86 |
+
if line[:2].replace(".", "").isdigit():
|
| 87 |
+
if buffer:
|
| 88 |
+
final_descriptions.append(buffer.strip())
|
| 89 |
+
buffer = line.split(".", 1)[-1].strip()
|
| 90 |
+
else:
|
| 91 |
+
buffer += " " + line
|
| 92 |
+
|
| 93 |
+
if buffer:
|
| 94 |
+
final_descriptions.append(buffer.strip())
|
| 95 |
+
|
| 96 |
+
# Résultat final
|
| 97 |
+
if not final_descriptions:
|
| 98 |
+
return "⚠️ Aucun contenu détecté dans la colonne Description."
|
| 99 |
+
|
| 100 |
+
return "\n".join(final_descriptions)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# =========================
|
| 104 |
+
# Interface Gradio
|
| 105 |
+
# =========================
|
| 106 |
|
| 107 |
demo = gr.Interface(
|
| 108 |
fn=extract_descriptions,
|
| 109 |
+
inputs=gr.Image(type="pil", label="Image de facture"),
|
| 110 |
+
outputs=gr.Textbox(lines=20, label="Descriptions extraites"),
|
| 111 |
+
title="Extraction de la colonne Description (Factures)",
|
| 112 |
+
description=(
|
| 113 |
+
"Charge une image de facture contenant un tableau "
|
| 114 |
+
"et récupère uniquement le contenu de la colonne 'Description', "
|
| 115 |
+
"cellule par cellule."
|
| 116 |
+
)
|
| 117 |
)
|
| 118 |
|
| 119 |
+
demo.launch(
|
| 120 |
+
server_name="0.0.0.0",
|
| 121 |
+
server_port=7860
|
| 122 |
+
)
|