Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 24, 2025

Commit

9697857

verified ·

1 Parent(s): ecf1403

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -26

app.py CHANGED Viewed

@@ -4,12 +4,11 @@ import unicodedata
 from paddleocr import PaddleOCR
 # =================================================
-# OCR Paddle
 # =================================================
 ocr = PaddleOCR(
     lang="fr",
-    use_textline_orientation=True,
-    show_log=False
 )
 # =================================================
@@ -23,7 +22,6 @@ def normalize(text: str) -> str:
 # =================================================
 # Titres possibles de la colonne 2
-# (casse ignorée automatiquement)
 # =================================================
 COL_TITLES = {
     "designation",
@@ -33,7 +31,7 @@ COL_TITLES = {
 }
 # =================================================
-# Mots à ignorer absolument
 # =================================================
 IGNORE_KEYWORDS = {
     "prix", "ht", "htva", "tva", "ttc",
@@ -50,14 +48,14 @@ def extract_second_column(image):
     img = np.array(image)
     # -------------------------------------------------
-    # 0. Rotation automatique si image couchée
     # -------------------------------------------------
     h, w = img.shape[:2]
     if w > h:
         img = np.rot90(img, 1)
     # -------------------------------------------------
-    # 1. OCR
     # -------------------------------------------------
     result = ocr.predict(img)
     if not result:
@@ -82,7 +80,7 @@ def extract_second_column(image):
         return "Pas assez de texte exploitable."
     # -------------------------------------------------
-    # 2. Détection robuste du X de la colonne 2
     # -------------------------------------------------
     col_x = None
     for text, x, y in blocks:
@@ -98,7 +96,7 @@ def extract_second_column(image):
         return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
-    # 3. Sélection des blocs de la colonne 2
     # -------------------------------------------------
     X_THRESHOLD = 55
     column_blocks = [
@@ -110,12 +108,12 @@ def extract_second_column(image):
         return "Colonne détectée mais vide."
     # -------------------------------------------------
-    # 4. Tri vertical (haut → bas)
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
-    # 5. Fusion intelligente des lignes OCR
     # -------------------------------------------------
     merged = []
     current = ""
@@ -141,25 +139,18 @@ def extract_second_column(image):
         merged.append(current.strip())
     # -------------------------------------------------
-    # 6. Nettoyage final des cellules texte
     # -------------------------------------------------
     final = []
     for line in merged:
         nt = normalize(line)
-        # ignorer le titre de colonne
         if nt in COL_TITLES:
             continue
-        # longueur minimale
         if len(nt) < 5:
             continue
-        # ignorer lignes trop numériques
         if sum(c.isdigit() for c in line) > len(line) / 3:
             continue
-        # règle métier : commence par majuscule
         if not line[0].isupper():
             continue
@@ -168,13 +159,10 @@ def extract_second_column(image):
     if not final:
         return "Aucune cellule texte valide trouvée."
-    # -------------------------------------------------
-    # 7. Résultat numéroté
-    # -------------------------------------------------
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # =================================================
-# Interface Gradio (Hugging Face)
 # =================================================
 demo = gr.Interface(
     fn=extract_second_column,
@@ -182,9 +170,8 @@ demo = gr.Interface(
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
     title="Extraction fiable de la colonne 2",
     description=(
-        "Extraction automatique et adaptative de la deuxième colonne "
-        "(Désignation, DESIGNATIONS, Description, Description des services) "
-        "à partir de tableaux scannés."
     )
 )

 from paddleocr import PaddleOCR
 # =================================================
+# OCR Paddle (HF compatible)
 # =================================================
 ocr = PaddleOCR(
     lang="fr",
+    use_textline_orientation=True
 )
 # =================================================
 # =================================================
 # Titres possibles de la colonne 2
 # =================================================
 COL_TITLES = {
     "designation",
 }
 # =================================================
+# Mots à ignorer
 # =================================================
 IGNORE_KEYWORDS = {
     "prix", "ht", "htva", "tva", "ttc",
     img = np.array(image)
     # -------------------------------------------------
+    # Rotation automatique si image couchée
     # -------------------------------------------------
     h, w = img.shape[:2]
     if w > h:
         img = np.rot90(img, 1)
     # -------------------------------------------------
+    # OCR
     # -------------------------------------------------
     result = ocr.predict(img)
     if not result:
         return "Pas assez de texte exploitable."
     # -------------------------------------------------
+    # Détection du X de la colonne 2 via le titre
     # -------------------------------------------------
     col_x = None
     for text, x, y in blocks:
         return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
+    # Sélection des blocs de la colonne
     # -------------------------------------------------
     X_THRESHOLD = 55
     column_blocks = [
         return "Colonne détectée mais vide."
     # -------------------------------------------------
+    # Tri vertical
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
+    # Fusion intelligente des lignes OCR
     # -------------------------------------------------
     merged = []
     current = ""
         merged.append(current.strip())
     # -------------------------------------------------
+    # Nettoyage final
     # -------------------------------------------------
     final = []
     for line in merged:
         nt = normalize(line)
         if nt in COL_TITLES:
             continue
         if len(nt) < 5:
             continue
         if sum(c.isdigit() for c in line) > len(line) / 3:
             continue
         if not line[0].isupper():
             continue
     if not final:
         return "Aucune cellule texte valide trouvée."
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # =================================================
+# Interface Gradio
 # =================================================
 demo = gr.Interface(
     fn=extract_second_column,
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
     title="Extraction fiable de la colonne 2",
     description=(
+        "Extraction adaptative de la deuxième colonne "
+        "(Désignation, DESIGNATIONS, Description, Description des services)."
     )
 )