# Source: app.py, commit ed975bc ("Update app.py", verified) by kebson, 5.2 kB.
import gradio as gr
import numpy as np
import unicodedata
from paddleocr import PaddleOCR
# -------------------------------------------------
# OCR (compatible Hugging Face)
# -------------------------------------------------
# Module-level OCR engine, built once at import time and reused by every
# request: French recognition with text-line orientation handling enabled.
ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
# -------------------------------------------------
# Normalisation texte (casse + accents)
# -------------------------------------------------
def normalize(text: str) -> str:
    """Return a case- and accent-insensitive form of *text*.

    The text is lower-cased, decomposed with Unicode NFD so that accents
    become separate combining marks, those marks (category ``Mn``) are
    dropped, and every run of whitespace is collapsed to a single space
    (leading and trailing whitespace is removed).
    """
    decomposed = unicodedata.normalize("NFD", text.lower())
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    words = "".join(base_chars).split()
    return " ".join(words)
# -------------------------------------------------
# Titres valides de la colonne 2
# -------------------------------------------------
# Accepted header texts of the target column, stored in the form produced by
# normalize() (lower case, no accents, single spaces).
COL_TITLES = {
    "description",
    "description des services",
    "designation",
    "designations",
}
# -------------------------------------------------
# Mots / lignes à ignorer
# -------------------------------------------------
# Keywords (already normalized: lower case, no accents) marking price and
# total lines that must not be reported as column cells.
IGNORE_KEYWORDS = {
    "general",
    "generale",
    "ht",
    "htva",
    "prix",
    "total",
    "ttc",
    "tva",
}
# -------------------------------------------------
# Métadonnées à exclure (hors tableau)
# -------------------------------------------------
# Keywords (already normalized) marking document metadata that sits outside
# the table; the ":" entry discards any "label: value" style line.
# NOTE(review): "nme" is presumably an OCR/abbreviated form of a number
# label -- confirm against real documents.
META_KEYWORDS = {
    ":",
    "dao",
    "date",
    "dpo",
    "nme",
    "ref",
    "reference",
}
# -------------------------------------------------
# Fonction principale
# -------------------------------------------------
# Maximum horizontal distance between a block centre and the title centre for
# the block to belong to the target column (OCR coordinate units, presumably
# pixels -- the value is resolution dependent).
_X_THRESHOLD = 45
# Maximum vertical distance between two consecutive block centres for them to
# be merged into the same cell (same units as above).
_Y_THRESHOLD = 22


def _has_keyword(normalized, keywords):
    """Return True when the normalized text contains one of *keywords*.

    Purely alphabetic keywords must match a whole word, so that "ref" rejects
    "ref. 12" but not "refection" and "date" does not reject "candidate".
    Keywords containing any other character (such as ":") are still matched
    as plain substrings.
    """
    # Split on every non-letter so "tva18%" or "ref." still yield "tva"/"ref".
    words = set(
        "".join(ch if ch.isalpha() else " " for ch in normalized).split()
    )
    return any(
        (keyword in words) if keyword.isalpha() else (keyword in normalized)
        for keyword in keywords
    )


def _text_blocks(data):
    """Return ``(text, x, y)`` for every usable text of one OCR result.

    ``x`` and ``y`` are the mean coordinates of the text polygon's points.
    Texts shorter than two characters once stripped are discarded.
    """
    texts = data.get("rec_texts", [])
    # Per the PaddleOCR 3.x result documentation, "rec_polys" is the polygon
    # list aligned with "rec_texts" (after score filtering); "dt_polys" holds
    # every detected polygon and may be longer, which would shift coordinates
    # onto the wrong text. Fall back to it only when "rec_polys" is absent.
    polys = data.get("rec_polys")
    if polys is None:
        polys = data.get("dt_polys", [])

    blocks = []
    for text, poly in zip(texts, polys):
        stripped = text.strip()
        if len(stripped) < 2:
            continue
        center_x = np.mean([point[0] for point in poly])
        center_y = np.mean([point[1] for point in poly])
        blocks.append((stripped, center_x, center_y))
    return blocks


def _merge_rows(column_blocks):
    """Merge vertically close blocks (already sorted top to bottom) into cells.

    Blocks matching IGNORE_KEYWORDS or META_KEYWORDS are skipped entirely.
    A block starts a new cell when its centre is more than _Y_THRESHOLD below
    the previous kept block; otherwise it is appended to the current cell.
    """
    merged = []
    fragments = []
    last_y = None
    for text, _x, y in column_blocks:
        normalized = normalize(text)
        if _has_keyword(normalized, IGNORE_KEYWORDS):
            continue  # totals / price lines
        if _has_keyword(normalized, META_KEYWORDS):
            continue  # residual metadata
        if last_y is None or abs(y - last_y) > _Y_THRESHOLD:
            if fragments:
                merged.append(" ".join(fragments).strip())
            fragments = [text]
        else:
            fragments.append(text)
        # Distance is always measured from the last kept block, so a cell can
        # chain several close lines.
        last_y = y
    if fragments:
        merged.append(" ".join(fragments).strip())
    return merged


def extract_second_column(image):
    """Extract the text cells of the target table column from an image.

    The image is run through the module-level ``ocr`` engine, the column is
    located from the first text whose normalized form is in ``COL_TITLES``,
    and the texts horizontally aligned with and below that title are merged
    into cells, filtered, and numbered.

    Args:
        image: Anything ``np.array`` can convert to an image array (the
            Gradio input of this app supplies a PIL image), or ``None``.

    Returns:
        A string: either the numbered cells, one per line ("1. ...") or a
        French diagnostic message when no image, text, title or valid cell
        is available.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    if not result:
        return "OCR : aucun texte détecté."

    # Only the first result entry is used.
    blocks = _text_blocks(result[0])
    if len(blocks) < 5:
        return "Pas assez de texte exploitable."

    # 1. Locate the column from its title: the first matching block wins.
    title = next(
        ((x, y) for text, x, y in blocks if normalize(text) in COL_TITLES),
        None,
    )
    if title is None:
        return "Titre de la colonne cible non détecté."
    col_x, title_y = title

    # 2./3. Keep blocks aligned with the title and below it, top to bottom.
    column_blocks = sorted(
        (
            block for block in blocks
            if abs(block[1] - col_x) < _X_THRESHOLD and block[2] > title_y
        ),
        key=lambda block: block[2],
    )
    if not column_blocks:
        return "Colonne détectée mais vide."

    # 4. Controlled merge of OCR lines into cells.
    merged = _merge_rows(column_blocks)

    # 5. Final clean-up: drop very short cells and mostly numeric cells.
    final = [
        line for line in merged
        if len(normalize(line)) >= 4
        and sum(ch.isdigit() for ch in line) <= len(line) / 2
    ]
    if not final:
        return "Aucune cellule texte valide trouvée."

    # 6. Numbered result.
    return "\n".join(
        f"{index}. {line}" for index, line in enumerate(final, start=1)
    )
# -------------------------------------------------
# Interface Gradio (Hugging Face)
# -------------------------------------------------
# Gradio front end: one image input, one text output, wired to
# extract_second_column.
_table_image = gr.Image(label="Image du tableau", type="pil")
_column_text = gr.Textbox(label="Contenu de la colonne 2")

demo = gr.Interface(
    title="Extraction fiable de la colonne 2",
    description=(
        "Extraction robuste de la deuxième colonne des tableaux scannés "
        "(Désignation, DESIGNATIONS, Description, Description des services)."
    ),
    fn=extract_second_column,
    inputs=_table_image,
    outputs=_column_text,
)

# Listen on all interfaces, port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")