Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

App Files Files Community

paddleocr-table-extraction / app.py

kebson

Update app.py

a6c6224 verified about 2 months ago

raw

history blame contribute delete

1.83 kB

	import os
	import cv2
	import easyocr
	import pandas as pd
	import gradio as gr
	from PIL import Image

	# Build the shared EasyOCR reader once at import time: English + French
	# language models, running on CPU (gpu=False).
	reader = easyocr.Reader(['en', 'fr'], gpu=False)


	def _column_split_x(left_edges):
	    """Return the x position that separates the first column from the rest.

	    The split is the midpoint of the widest horizontal gap between
	    consecutive (sorted) left edges. When there is no gap at all (a single
	    detection, or all edges identical) the largest edge is returned, so a
	    strict ``x > split`` test selects nothing.

	    Args:
	        left_edges: Non-empty sequence of left-edge x coordinates.
	    """
	    ordered = sorted(left_edges)
	    gap_start, gap_end = max(
	        zip(ordered, ordered[1:]),
	        key=lambda pair: pair[1] - pair[0],
	        default=(ordered[-1], ordered[-1]),
	    )
	    return (gap_start + gap_end) / 2


	def extract_second_column(image):
	    """Run OCR on *image* and naively extract the text of the second column.

	    Detections whose left edge lies to the right of the widest horizontal
	    gap between left edges are treated as "column 2". This is a heuristic
	    intended for two-column layouts.

	    Args:
	        image: RGB image as a NumPy array, or ``None`` when nothing was
	            provided.

	    Returns:
	        List of stripped, non-empty text strings from the second column,
	        ordered top-to-bottom (then left-to-right). Empty when *image* is
	        ``None`` or when OCR finds no text.
	    """
	    # Nothing to process (e.g. the button was pressed without an image).
	    if image is None:
	        return []

	    # OpenCV-style BGR channel order is what the reader is fed here.
	    img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

	    results = reader.readtext(img)
	    # Indexing an empty list of positions would raise IndexError.
	    if not results:
	        return []

	    # Each detection is (bbox, text, confidence); bbox[0] is the top-left
	    # corner as [x, y].
	    split_x = _column_split_x([bbox[0][0] for bbox, _text, _conf in results])

	    # Emit cells in reading order rather than by horizontal position, which
	    # would interleave rows arbitrarily.
	    reading_order = sorted(results, key=lambda det: (det[0][0][1], det[0][0][0]))

	    column_2 = []
	    for bbox, text, _conf in reading_order:
	        cleaned = text.strip()
	        if cleaned and bbox[0][0] > split_x:
	            column_2.append(cleaned)
	    return column_2


	def process_image(image):
	    """Extract the second column from *image* and export it as CSV and TXT.

	    Args:
	        image: Image forwarded unchanged to ``extract_second_column``.

	    Returns:
	        Tuple ``(df, csv_path, txt_path)``: a single-column DataFrame
	        ("Colonne 2 (Texte)") with the extracted strings, the path of the
	        CSV export, and the path of the plain-text export (one entry per
	        line).
	    """
	    import tempfile

	    texts = extract_second_column(image)

	    df = pd.DataFrame({"Colonne 2 (Texte)": texts})

	    os.makedirs("/tmp/results", exist_ok=True)
	    # One directory per call: fixed file names shared by every request
	    # would let concurrent users overwrite each other's downloads.
	    out_dir = tempfile.mkdtemp(prefix="ocr_", dir="/tmp/results")

	    csv_path = os.path.join(out_dir, "colonne_2.csv")
	    txt_path = os.path.join(out_dir, "colonne_2.txt")

	    df.to_csv(csv_path, index=False, encoding="utf-8")

	    # Write the TXT export as raw lines; going through to_csv would wrap
	    # entries containing commas or quotes in CSV quoting.
	    with open(txt_path, "w", encoding="utf-8") as txt_handle:
	        txt_handle.writelines(f"{entry}\n" for entry in texts)

	    return df, csv_path, txt_path


	# Gradio UI: one image input, a trigger button, and three outputs (result
	# table plus downloadable CSV and TXT files) wired to process_image.
	with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
	    gr.Markdown("## 📄 Extraction OCR – Colonne 2 (EasyOCR)")

	    # type="numpy" hands the uploaded image to the callback as an array.
	    image_input = gr.Image(type="numpy", label="Télécharger une image")
	    btn = gr.Button("Extraire la colonne 2")

	    df_output = gr.Dataframe(label="Résultat")
	    csv_file = gr.File(label="Télécharger CSV")
	    txt_file = gr.File(label="Télécharger TXT")

	    # The callback's (df, csv_path, txt_path) tuple maps onto the outputs
	    # in this order.
	    btn.click(
	        process_image,
	        inputs=image_input,
	        outputs=[df_output, csv_file, txt_file],
	    )

	# Start the server only when run as a script, so importing this module
	# (tests, tooling) does not launch the app as a side effect.
	if __name__ == "__main__":
	    demo.launch()