Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

App Files Community

kebson committed on Dec 18, 2025

Commit

3a9f6ca

verified ·

1 Parent(s): 4fbaf2b

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -53

app.py CHANGED Viewed

@@ -1,79 +1,106 @@
import sys

# Deliberate kill switch: stop the app before anything else is loaded.
# Module names are case-sensitive; the previous ``import Sys`` failed with
# ModuleNotFoundError, so the exit message below was never shown.
sys.exit("Arret forcé")
 import os
 import cv2
 import pandas as pd
-import gradio as gr
 from paddleocr import PaddleOCR
# OCR engine setup (French recognition model), created once at import time
# and reused by extract_second_column_image0.
ocr = PaddleOCR(use_angle_cls=True, lang="fr", show_log=False)
def extract_second_column_image0():
    """Extract the right-hand column of ``images/image0.jpeg`` via OCR.

    Every recognised text box whose centre lies to the right of the image's
    vertical midline is treated as part of the second column. The texts are
    ordered top to bottom and written to a CSV and a TXT file under ``/tmp``.

    Returns:
        ``(df, csv_path, txt_path)`` on success, where ``df`` is a one-column
        DataFrame named "Colonne 2". On failure ``(message, None, None)``,
        ``message`` being a user-facing error string.
    """
    image_path = "images/image0.jpeg"
    if not os.path.exists(image_path):
        return "❌ image0.jpeg introuvable dans le dossier images", None, None

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an undecodable file by returning None rather
        # than raising, so check before touching ``image.shape``.
        return "❌ image0.jpeg illisible", None, None
    mid_x = image.shape[1] / 2  # vertical midline separates the two columns

    result = ocr.ocr(image_path, cls=True)
    # Only the first page is used. Some PaddleOCR releases return None for a
    # page on which nothing was detected, so fall back to an empty list.
    detections = (result[0] if result else None) or []

    # Keep (y_center, text) for boxes right of the midline.
    right_side = []
    for box, (text, _score) in detections:
        x_center = sum(p[0] for p in box) / len(box)
        if x_center > mid_x:
            y_center = sum(p[1] for p in box) / len(box)
            right_side.append((y_center, text))
    right_side.sort(key=lambda cell: cell[0])  # top to bottom
    second_column = [text for _y, text in right_side]

    if not second_column:
        return "❌ Aucun texte détecté dans la 2ᵉ colonne", None, None

    df = pd.DataFrame({"Colonne 2": second_column})

    # Outputs go under /tmp (noted by the original author as a location
    # Gradio is allowed to serve files from).
    csv_path = "/tmp/resultats_colonne_2_image0.csv"
    txt_path = "/tmp/resultats_colonne_2_image0.txt"
    df.to_csv(csv_path, index=False, encoding="utf-8")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(f"{text}\n" for text in second_column)

    return df, csv_path, txt_path
# Gradio interface: one button that runs the extraction and fills a table
# plus two download slots (CSV and TXT).
with gr.Blocks() as demo:
    gr.Markdown("## 🧾 Extraction OCR – Colonne 2 (image0)")
    btn = gr.Button("Extraire la 2ᵉ colonne")

    table = gr.Dataframe()
    csv_file = gr.File(label="Télécharger CSV")
    txt_file = gr.File(label="Télécharger TXT")

    # The handler takes no inputs; its three return values map onto the
    # three output components in order.
    btn.click(
        fn=extract_second_column_image0,
        inputs=None,
        outputs=[table, csv_file, txt_file],
    )

demo.launch()

 import os
 import cv2
 import pandas as pd
 from paddleocr import PaddleOCR
+import gradio as gr
# =========================
# OCR ENGINE SETUP
# =========================
# Module-level engine, created once and reused by extract_second_column.
# English recognition model, because the invoices are in English.
ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)
# =========================
# MAIN FUNCTION
# =========================
def _collect_boxes(result):
    """Flatten OCR output into ``(x_center, y_center, text)`` tuples."""
    boxes = []
    for page in result or []:
        # Some PaddleOCR releases return None for a page on which nothing
        # was detected; skip it instead of crashing on iteration.
        if not page:
            continue
        for box, (text, _conf) in page:
            n_points = len(box)
            x_center = sum(p[0] for p in box) / n_points
            y_center = sum(p[1] for p in box) / n_points
            boxes.append((x_center, y_center, text.strip()))
    return boxes


def _group_into_columns(boxes, tolerance):
    """Cluster boxes into columns by x centre, returned left to right.

    A box joins the current column when its x centre is less than
    ``tolerance`` pixels to the right of that column's first (leftmost) box;
    otherwise it opens a new column. Because boxes are visited in increasing
    x order, only the most recent column can ever match.
    """
    columns = []
    for x, y, text in sorted(boxes, key=lambda b: b[0]):
        if columns and x - columns[-1]["x"] < tolerance:
            columns[-1]["cells"].append((y, text))
        else:
            columns.append({"x": x, "cells": [(y, text)]})
    return columns


def extract_second_column(image_path):
    """Extract only the second column (Description) of a table image.

    The image is run through OCR, text boxes are grouped into columns by
    horizontal position, and the second column from the left is exported.
    Its cells are ordered top to bottom, and texts of two characters or
    fewer are dropped. The result is written to ``outputs/`` as CSV and TXT.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        ``(df, csv_path, txt_path)`` on success, where ``df`` is a one-column
        DataFrame named "Description". On failure ``(message, None, None)``,
        ``message`` being a user-facing error string.
    """
    # cv2.imread returns None (it does not raise) when the file cannot be
    # decoded; an empty path is treated the same way.
    image = cv2.imread(image_path) if image_path else None
    if image is None:
        return "Erreur : image illisible", None, None

    result = ocr.ocr(image, cls=True)
    columns = _group_into_columns(_collect_boxes(result), tolerance=50)

    if len(columns) < 2:
        # NOTE(review): this message is returned in the slot the UI binds to
        # a Dataframe component; confirm it is displayed as intended.
        return "Erreur : colonne 2 non détectée", None, None

    # Column 2 = Description. Restore reading order (top to bottom): the
    # clustering pass visits boxes by x position, not by row.
    cells = sorted(columns[1]["cells"], key=lambda cell: cell[0])
    # Drop very short fragments (two characters or fewer).
    colonne_2 = [text for _y, text in cells if len(text) > 2]

    df = pd.DataFrame({"Description": colonne_2})

    # Output files
    os.makedirs("outputs", exist_ok=True)
    csv_path = "outputs/description_colonne_2.csv"
    txt_path = "outputs/description_colonne_2.txt"
    df.to_csv(csv_path, index=False, encoding="utf-8")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in colonne_2)

    return df, csv_path, txt_path
# =========================
# GRADIO INTERFACE
# =========================
# Components are built first so the Interface call itself stays short.
# The image input uses type="filepath", matching the handler's
# ``image_path`` parameter.
_invoice_input = gr.Image(type="filepath", label="Image de facture")
_result_outputs = [
    gr.Dataframe(label="Colonne 2 : Description"),
    gr.File(label="Télécharger CSV"),
    gr.File(label="Télécharger TXT (Bloc-notes)"),
]

interface = gr.Interface(
    fn=extract_second_column,
    inputs=_invoice_input,
    outputs=_result_outputs,
    title="Extraction OCR – Colonne Description",
    description="Extraction uniquement de la colonne texte (Description) d'une facture",
)

# =========================
# LAUNCH
# =========================
if __name__ == "__main__":
    interface.launch()