Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

File size: 1,828 Bytes

import os
import cv2
import easyocr
import pandas as pd
import gradio as gr
from PIL import Image

# Initialise the EasyOCR reader for English and French; gpu=False requests CPU.
# Created once at module level so extract_second_column reuses the same
# instance on every call instead of rebuilding it per request.
reader = easyocr.Reader(['en', 'fr'], gpu=False)


def extract_second_column(image):
    """
    Run OCR on an image and naively extract the text of the second column.

    Detections are split at the (upper) median of their left-edge x
    coordinates: every detection whose top-left corner lies strictly to the
    right of that median is treated as belonging to the second column.

    Args:
        image: Image as a NumPy array, treated as RGB (it is converted to BGR
            before OCR), or ``None`` when no image was provided.

    Returns:
        list[str]: Stripped, non-empty texts of the second column, ordered
        top to bottom (ties broken left to right). Empty when ``image`` is
        ``None`` or when OCR detects no text.
    """
    # The UI passes None when the button is clicked without an upload.
    if image is None:
        return []

    img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    # Each result is (bbox, text, confidence); bbox[0] is the top-left corner.
    results = reader.readtext(img)
    if not results:
        # Nothing detected: there is no median to compute.
        return []

    # Estimate the column boundary from the left edges of all detections.
    left_edges = sorted(bbox[0][0] for bbox, _text, _conf in results)
    median_x = left_edges[len(left_edges) // 2]

    # Keep (y, x, text) so the column can be emitted in reading order rather
    # than in the left-edge order used only for estimating the boundary.
    second_column = [
        (bbox[0][1], bbox[0][0], text.strip())
        for bbox, text, _conf in results
        if bbox[0][0] > median_x and text.strip()
    ]
    second_column.sort(key=lambda cell: cell[:2])

    return [text for _y, _x, text in second_column]


def process_image(image):
    """
    Extract the second column of an image and export it as CSV and TXT files.

    Args:
        image: Image passed through unchanged to ``extract_second_column``.

    Returns:
        tuple: ``(df, csv_path, txt_path)`` where ``df`` is a one-column
        DataFrame named "Colonne 2 (Texte)" and the two paths are the files
        written under ``/tmp/results``.
    """
    texts = extract_second_column(image)

    df = pd.DataFrame({"Colonne 2 (Texte)": texts})

    os.makedirs("/tmp/results", exist_ok=True)

    csv_path = "/tmp/results/colonne_2.csv"
    txt_path = "/tmp/results/colonne_2.txt"

    df.to_csv(csv_path, index=False, encoding="utf-8")

    # Write the plain-text export directly, one raw line per cell. Routing it
    # through to_csv would CSV-quote any cell containing a comma or a quote
    # (e.g. the decimal "12,50" would be written as "\"12,50\"").
    with open(txt_path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{text}\n" for text in texts)

    return df, csv_path, txt_path


# Gradio UI: one image input, a trigger button, and three outputs
# (a table preview plus downloadable CSV and TXT files).
with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
    gr.Markdown("## 📄 Extraction OCR – Colonne 2 (EasyOCR)")

    # type="numpy" so process_image receives the image as a NumPy array.
    image_input = gr.Image(type="numpy", label="Télécharger une image")
    btn = gr.Button("Extraire la colonne 2")

    # Output components, listed in the same order as process_image's return
    # tuple: (DataFrame, csv_path, txt_path).
    df_output = gr.Dataframe(label="Résultat")
    csv_file = gr.File(label="Télécharger CSV")
    txt_file = gr.File(label="Télécharger TXT")

    # Run the OCR pipeline when the button is clicked.
    btn.click(
        process_image,
        inputs=image_input,
        outputs=[df_output, csv_file, txt_file]
    )

# Start the app; this runs at module level, so it also fires on import.
demo.launch()