Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

table_second_column_extractor / app.py

kebson

Update app.py

ed975bc verified 4 months ago

raw

history blame

5.2 kB

	import gradio as gr
	import numpy as np
	import unicodedata
	from paddleocr import PaddleOCR

	# -------------------------------------------------
	# OCR engine (Hugging Face compatible)
	# -------------------------------------------------
	# Built once at import time and shared by every call to
	# extract_second_column(); configured for French with text-line
	# orientation handling switched on.
	ocr = PaddleOCR(lang="fr", use_textline_orientation=True)

	# -------------------------------------------------
	# Text normalisation (case + accents)
	# -------------------------------------------------
	def normalize(text: str) -> str:
	    """Return *text* lower-cased, without accents, with whitespace collapsed.

	    The string is lower-cased, decomposed with Unicode NFD so that accents
	    become separate combining marks, stripped of those marks (category
	    "Mn"), and finally every run of whitespace is reduced to one space
	    with leading/trailing whitespace removed.
	    """
	    decomposed = unicodedata.normalize("NFD", text.lower())
	    base_chars = [
	        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
	    ]
	    words = "".join(base_chars).split()
	    return " ".join(words)

	# -------------------------------------------------
	# Accepted header labels for the target (second) column
	# -------------------------------------------------
	# Entries are lower-case and accent-free because OCR text is passed
	# through normalize() before being looked up here.
	COL_TITLES = {
	    "description",
	    "description des services",
	    "designation",
	    "designations",
	}

	# -------------------------------------------------
	# Words marking price / total lines to be skipped
	# -------------------------------------------------
	# Compared against normalize()d text, hence lower-case without accents.
	IGNORE_KEYWORDS = {
	    "general",
	    "generale",
	    "ht",
	    "htva",
	    "prix",
	    "total",
	    "ttc",
	    "tva",
	}

	# -------------------------------------------------
	# Metadata markers to exclude (text outside the table)
	# -------------------------------------------------
	# Compared against normalize()d text; the colon entry catches
	# "label: value" style lines.
	META_KEYWORDS = {
	    ":",
	    "dao",
	    "date",
	    "dpo",
	    "nme",
	    "ref",
	    "reference",
	}

	# -------------------------------------------------
	# Main function
	# -------------------------------------------------
	def _has_keyword(normalized, keywords):
	    """Tell whether *normalized* text mentions any of *keywords*.

	    Purely alphabetic keywords must appear as a whole word, so that "ref"
	    does not discard "prefabrique" or "refection" and "date" does not
	    discard "candidate". Keywords containing any other character (such as
	    ":") are searched as plain substrings.
	    """
	    # Digits and punctuation act as separators: "tva18%" still yields "tva".
	    words = set(
	        "".join(ch if ch.isalpha() else " " for ch in normalized).split()
	    )
	    return any(
	        (kw in words) if kw.isalpha() else (kw in normalized)
	        for kw in keywords
	    )


	def _to_ocr_array(image):
	    """Return *image* as a NumPy array, forcing PIL images to 3-channel RGB."""
	    # RGBA / palette / grayscale PIL images would otherwise produce arrays
	    # that are not H x W x 3. This is a no-op copy for RGB input.
	    if hasattr(image, "convert"):
	        image = image.convert("RGB")
	    return np.array(image)


	def extract_second_column(image):
	    """Extract the text cells of the target table column from a scanned image.

	    The column is located through its header (one of ``COL_TITLES`` after
	    ``normalize``); OCR fragments horizontally aligned with that header and
	    lying below it are kept, merged into cells and cleaned up.

	    Args:
	        image: A PIL image (or anything ``numpy.array`` accepts), or None.

	    Returns:
	        A string with one numbered line per extracted cell
	        ("1. ...", "2. ..."), or a French status message when no image was
	        given, OCR found nothing, the header was not found, or no usable
	        cell remained.
	    """
	    if image is None:
	        return "Aucune image fournie."

	    result = ocr.predict(_to_ocr_array(image))
	    if not result:
	        return "OCR : aucun texte détecté."

	    data = result[0]
	    texts = data.get("rec_texts", [])
	    # rec_polys holds the boxes that survived recognition-score filtering
	    # and is therefore index-aligned with rec_texts; dt_polys (every
	    # detected box) is only a fallback when rec_polys is missing.
	    boxes = data.get("rec_polys")
	    if boxes is None:
	        boxes = data.get("dt_polys", [])

	    # Each block is (text, centre x, centre y) of one OCR fragment.
	    blocks = []
	    for raw_text, box in zip(texts, boxes):
	        fragment = raw_text.strip()
	        if len(fragment) < 2:
	            continue
	        center_x = np.mean([point[0] for point in box])
	        center_y = np.mean([point[1] for point in box])
	        blocks.append((fragment, center_x, center_y))

	    if len(blocks) < 5:
	        return "Pas assez de texte exploitable."

	    # 1. Locate the column through the first fragment that is a known title.
	    header = next(
	        ((x, y) for text, x, y in blocks if normalize(text) in COL_TITLES),
	        None,
	    )
	    if header is None:
	        return "Titre de la colonne cible non détecté."
	    col_x, title_y = header

	    # 2. Keep fragments horizontally close to the title and BELOW it.
	    x_threshold = 45
	    column_blocks = [
	        (text, x, y)
	        for text, x, y in blocks
	        if abs(x - col_x) < x_threshold and y > title_y
	    ]
	    if not column_blocks:
	        return "Colonne détectée mais vide."

	    # 3. Order top to bottom.
	    column_blocks.sort(key=lambda block: block[2])

	    # 4. Merge fragments that are vertically close into a single cell.
	    y_threshold = 22
	    merged = []
	    current = ""
	    last_y = None
	    for text, _x, y in column_blocks:
	        normalized = normalize(text)

	        # Skip price/total lines and leftover metadata. last_y is left
	        # untouched so the gap is measured from the previous kept fragment.
	        if _has_keyword(normalized, IGNORE_KEYWORDS):
	            continue
	        if _has_keyword(normalized, META_KEYWORDS):
	            continue

	        if last_y is None or abs(y - last_y) > y_threshold:
	            # Vertical gap large enough: start a new cell.
	            if current:
	                merged.append(current.strip())
	            current = text
	        else:
	            current += " " + text

	        last_y = y

	    if current:
	        merged.append(current.strip())

	    # 5. Final clean-up: drop very short cells and mostly-numeric cells.
	    final = [
	        line
	        for line in merged
	        if len(normalize(line)) >= 4
	        and sum(ch.isdigit() for ch in line) <= len(line) / 2
	    ]
	    if not final:
	        return "Aucune cellule texte valide trouvée."

	    # 6. Numbered result.
	    return "\n".join(
	        f"{index}. {line}" for index, line in enumerate(final, start=1)
	    )

	# -------------------------------------------------
	# Gradio interface (Hugging Face)
	# -------------------------------------------------
	# One image input (delivered to the callback as a PIL image) and one
	# text output holding the numbered cells or a status message.
	demo = gr.Interface(
	    title="Extraction fiable de la colonne 2",
	    description=(
	        "Extraction robuste de la deuxième colonne "
	        "des tableaux scannés "
	        "(Désignation, DESIGNATIONS, Description, Description des services)."
	    ),
	    fn=extract_second_column,
	    inputs=gr.Image(label="Image du tableau", type="pil"),
	    outputs=gr.Textbox(label="Contenu de la colonne 2"),
	)

	# Serve on every network interface, port 7860.
	demo.launch(server_port=7860, server_name="0.0.0.0")