Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

App Files Files Community

kebson committed on Dec 18, 2025

Commit

a6c6224

verified ·

1 Parent(s): 0a0c26d

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -73

app.py CHANGED Viewed

@@ -1,102 +1,70 @@
 import os
-import re
 import cv2
 import pandas as pd
 import gradio as gr
-from paddleocr import PaddleOCR
-# ===============================
-# OCR INITIALISATION (ONCE ONLY)
-# ===============================
-# The PaddleOCR instance is created at module import and reused by
-# extract_second_column() on every call.
-ocr = PaddleOCR(
-    use_angle_cls=False,
-    lang="en",
-    show_log=False,
-    use_gpu=False
-)
-# ===============================
-# MAIN FUNCTION
-# ===============================
-def extract_second_column():
-    """
-    OCR one image from the ``images`` directory and export its second column.
-
-    The image used is the first matching entry returned by ``os.listdir``
-    (extensions .jpg/.jpeg/.png, case-insensitive). Detections are grouped
-    into rows by vertical position; for each row with at least two cells the
-    second cell (left to right) is kept unless it contains a digit.
-
-    Returns:
-        On success, ``(DataFrame, csv_path, txt_path)``. When no image is
-        found, the image cannot be read, or no valid text remains, a tuple
-        ``(message, None, None)`` whose first item is a user-facing string.
-    """
-    image_dir = "images"
-    image_files = [f for f in os.listdir(image_dir) if f.lower().endswith((".jpg", ".jpeg", ".png"))]
-    if not image_files:
-        return "❌ Aucune image trouvée", None, None
-    image_path = os.path.join(image_dir, image_files[0])
-    img = cv2.imread(image_path)
-    # cv2.imread signals failure by returning None rather than raising.
-    if img is None:
-        return "❌ Impossible de lire l'image", None, None
-    result = ocr.ocr(img, cls=False)
-    # Collect bounding boxes + text
-    # NOTE(review): assumes result[0] is an iterable of (box, (text, score))
-    # entries and that each box has four corner points — confirm against the
-    # PaddleOCR version in use, including what it returns when no text is found.
-    rows = []
-    for line in result[0]:
-        box = line[0]
-        text = line[1][0]
-        x_center = sum([p[0] for p in box]) / 4
-        y_center = sum([p[1] for p in box]) / 4
-        rows.append((y_center, x_center, text))
-    # Sort by row (vertical centre)
-    rows.sort(key=lambda x: x[0])
-    # Group into rows: detections whose y / 25 rounds to the same integer
-    # share a row key.
-    lines = {}
-    for y, x, text in rows:
-        line_key = round(y / 25)
-        lines.setdefault(line_key, []).append((x, text))
-    second_column_texts = []
-    for line in lines.values():
-        # Order the cells of the row from left to right.
-        line.sort(key=lambda x: x[0])
-        if len(line) >= 2:
-            candidate = line[1][1]
-            # Keep text-only cells (no digits)
-            if not re.search(r"\d", candidate):
-                second_column_texts.append(candidate)
-    if not second_column_texts:
-        return "⚠️ Aucun texte valide trouvé", None, None
-    # ===============================
-    # OUTPUTS
-    # ===============================
-    df = pd.DataFrame({"Colonne 2 (Texte)": second_column_texts})
-    txt_path = "/tmp/resultats_colonne_2.txt"
-    csv_path = "/tmp/resultats_colonne_2.csv"
-    df.to_csv(csv_path, index=False)
-    # Plain-text export: one value per line.
-    with open(txt_path, "w", encoding="utf-8") as f:
-        for t in second_column_texts:
-            f.write(t + "\n")
     return df, csv_path, txt_path
-# ===============================
-# GRADIO INTERFACE
-# ===============================
-# A single button triggers extract_second_column(); its three return values
-# feed the table and the two download widgets, in that order.
-with gr.Blocks() as demo:
-    gr.Markdown("## 📄 Extraction OCR – Colonne 2 (Texte uniquement)")
-    btn = gr.Button("🔍 Extraire la colonne 2")
-    table = gr.Dataframe()
-    csv_file = gr.File(label="📥 Télécharger CSV")
-    txt_file = gr.File(label="📥 Télécharger TXT (Bloc-notes)")
     btn.click(
-        extract_second_column,
-        outputs=[table, csv_file, txt_file]
     )
 demo.launch()

 import os
 import cv2
+import easyocr
 import pandas as pd
 import gradio as gr
+from PIL import Image
+# EasyOCR initialisation (English + French if needed).
+# The reader is built once at module import and reused by
+# extract_second_column(); gpu=False is passed explicitly.
+reader = easyocr.Reader(['en', 'fr'], gpu=False)
+def extract_second_column(image):
+    """
+    Run OCR on an image and naively extract the text of the second column.
+
+    A detection is attributed to the second column when the X coordinate of
+    the first point of its bounding box is strictly greater than the median
+    of that coordinate over all detections.
+
+    Args:
+        image: RGB image as a NumPy array, or None when no image was supplied.
+
+    Returns:
+        A list of stripped, non-empty strings ordered from top to bottom.
+        The list is empty when there is no image or nothing was detected.
+    """
+    # No upload: return an empty column instead of crashing inside OpenCV.
+    if image is None:
+        return []
+    img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+    results = reader.readtext(img)
+    # No detection at all: the median lookup below would index an empty list.
+    if not results:
+        return []
+    # Each result unpacks as (bbox, text, confidence); bbox[0] is used as the
+    # reference corner for both the X threshold and the vertical ordering.
+    xs = sorted(bbox[0][0] for bbox, _text, _conf in results)
+    median_x = xs[len(xs) // 2]
+    # Keep detections right of the median and restore reading order by
+    # sorting on Y; sorting on X would interleave the rows of the table.
+    right_side = sorted(
+        (item for item in results if item[0][0][0] > median_x),
+        key=lambda item: item[0][0][1],
+    )
+    return [text.strip() for _bbox, text, _conf in right_side if text.strip()]
+def process_image(image):
+    """
+    Extract the second column of an image and export it as CSV and TXT.
+
+    Args:
+        image: value forwarded unchanged to extract_second_column().
+
+    Returns:
+        A tuple ``(df, csv_path, txt_path)``: the one-column DataFrame and
+        the paths of the two files written under ``/tmp/results``.
+    """
+    texts = extract_second_column(image)
+    df = pd.DataFrame({"Colonne 2 (Texte)": texts})
+    os.makedirs("/tmp/results", exist_ok=True)
+    csv_path = "/tmp/results/colonne_2.csv"
+    txt_path = "/tmp/results/colonne_2.txt"
+    df.to_csv(csv_path, index=False, encoding="utf-8")
+    # The TXT export is plain text, one raw value per line. Writing it via
+    # to_csv would wrap values containing commas or quotes in CSV quoting.
+    with open(txt_path, "w", encoding="utf-8") as handle:
+        handle.writelines(f"{text}\n" for text in texts)
     return df, csv_path, txt_path
+# Gradio interface: an image input, a trigger button, and three outputs
+# (result table, CSV download, TXT download). The order of the outputs list
+# matches the tuple returned by process_image().
+with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
+    gr.Markdown("## 📄 Extraction OCR – Colonne 2 (EasyOCR)")
+    # type="numpy" is requested so the callback receives an array
+    # (extract_second_column passes it straight to cv2.cvtColor).
+    image_input = gr.Image(type="numpy", label="Télécharger une image")
+    btn = gr.Button("Extraire la colonne 2")
+    df_output = gr.Dataframe(label="Résultat")
+    csv_file = gr.File(label="Télécharger CSV")
+    txt_file = gr.File(label="Télécharger TXT")
     btn.click(
+        process_image,
+        inputs=image_input,
+        outputs=[df_output, csv_file, txt_file]
     )
 demo.launch()