Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import numpy as np | |
| import unicodedata | |
| from paddleocr import PaddleOCR | |
# -------------------------------------------------
# OCR engine (Hugging Face compatible)
# -------------------------------------------------
# Built once at import time and shared by every request.
ocr = PaddleOCR(
    lang="fr",
    use_textline_orientation=True,
)
# -------------------------------------------------
# Text normalisation (case + accents)
# -------------------------------------------------
def normalize(text: str) -> str:
    """Return *text* lower-cased, stripped of accents and whitespace-collapsed.

    The string is decomposed with NFD so that combining marks (Unicode
    category ``Mn``) can be dropped, turning e.g. "Désignation" into
    "designation". Runs of whitespace are collapsed to a single space and
    leading/trailing whitespace is removed.
    """
    text = text.lower()
    text = unicodedata.normalize("NFD", text)
    # After NFD decomposition, accents are separate "Mn" code points.
    text = "".join(c for c in text if unicodedata.category(c) != "Mn")
    return " ".join(text.split())
# -------------------------------------------------
# Accepted header titles for column 2
# -------------------------------------------------
# Entries are compared against normalize() output, so they must be
# lower-case and accent-free.
COL_TITLES = {
    "designation",
    "designations",
    "description",
    "description des services",
}
# -------------------------------------------------
# Words marking lines to ignore (prices / totals)
# -------------------------------------------------
# Entries are compared against normalize() output (lower-case, no accents).
IGNORE_KEYWORDS = {
    "prix", "total", "ht", "htva", "tva",
    "ttc", "general", "generale",
}
# -------------------------------------------------
# Metadata to exclude (text outside the table)
# -------------------------------------------------
# Entries are compared against normalize() output (lower-case, no accents).
# ":" is listed so that any text containing a colon is excluded.
# NOTE(review): "nme" is presumably a recurring OCR misread - confirm.
META_KEYWORDS = {
    "dpo", "dao", "ref", "reference",
    "date", "nme", ":",
}
# -------------------------------------------------
# Main function
# -------------------------------------------------
def _has_keyword(normalized_text, keywords):
    """Return True when *normalized_text* contains one of *keywords*.

    Alphanumeric keywords must match a whole word, so short entries such as
    "ht" or "ref" no longer discard legitimate cells like "achat" or
    "prefabrique". Keywords containing punctuation (e.g. ":") are still
    matched as plain substrings.
    """
    words = set(
        "".join(c if c.isalnum() else " " for c in normalized_text).split()
    )
    return any(
        (kw in words) if kw.isalnum() else (kw in normalized_text)
        for kw in keywords
    )


def _text_blocks(data):
    """Return ``(text, center_x, center_y)`` for each usable OCR text line."""
    texts = data.get("rec_texts")
    if texts is None:
        texts = []
    # Prefer "rec_polys": it is the polygon list documented as aligned with
    # "rec_texts". "dt_polys" (raw detections) is kept as a fallback.
    # Explicit None checks: the values may be numpy arrays, whose truth
    # value is ambiguous.
    polys = data.get("rec_polys")
    if polys is None:
        polys = data.get("dt_polys")
    if polys is None:
        polys = []

    blocks = []
    for text, poly in zip(texts, polys):
        stripped = text.strip()
        if len(stripped) < 2:
            continue  # single characters are treated as OCR noise
        points = np.asarray(poly, dtype=float).reshape(-1, 2)
        center_x, center_y = points.mean(axis=0)
        blocks.append((stripped, center_x, center_y))
    return blocks


def _merge_lines(column_blocks, y_threshold):
    """Join vertically adjacent OCR lines (sorted top to bottom) into cells."""
    merged = []
    current = ""
    last_y = None
    for text, _x, y in column_blocks:
        normalized = normalize(text)
        # Skip price/total lines and leftover metadata.
        if _has_keyword(normalized, IGNORE_KEYWORDS):
            continue
        if _has_keyword(normalized, META_KEYWORDS):
            continue
        if last_y is None or abs(y - last_y) > y_threshold:
            # Large vertical gap: the previous cell is complete.
            if current:
                merged.append(current.strip())
            current = text
        else:
            current += " " + text
        last_y = y
    if current:
        merged.append(current.strip())
    return merged


def extract_second_column(image):
    """Extract the text cells of the "designation" column from a table image.

    The image is passed to the module-level ``ocr`` engine. The column is
    located through the first OCR block whose normalised text is in
    ``COL_TITLES``; blocks below that title and horizontally close to it are
    sorted top to bottom, merged into cells, filtered, and numbered.

    Args:
        image: The uploaded picture (a PIL image when called from the Gradio
            interface) or ``None``.

    Returns:
        A string with one numbered cell per line, or a French status message
        when a step cannot be completed.
    """
    if image is None:
        return "Aucune image fournie."

    # Normalise PIL inputs to RGB so alpha, greyscale, or palette uploads do
    # not reach the OCR engine as non 3-channel arrays.
    if hasattr(image, "convert"):
        image = image.convert("RGB")
    img = np.array(image)

    result = ocr.predict(img)
    if not result:
        return "OCR : aucun texte détecté."

    blocks = _text_blocks(result[0])
    if len(blocks) < 5:
        return "Pas assez de texte exploitable."

    # 1. Locate the target column through its header title.
    col_x = None
    title_y = None
    for text, x, y in blocks:
        if normalize(text) in COL_TITLES:
            col_x = x
            title_y = y
            break
    if col_x is None:
        return "Titre de la colonne cible non détecté."

    # 2. Keep the blocks of that column located BELOW the title.
    X_THRESHOLD = 45
    column_blocks = [
        (t, x, y) for t, x, y in blocks
        if abs(x - col_x) < X_THRESHOLD and y > title_y
    ]
    if not column_blocks:
        return "Colonne détectée mais vide."

    # 3. Vertical sort (top to bottom).
    column_blocks.sort(key=lambda e: e[2])

    # 4. Controlled merge of OCR lines into cells.
    Y_THRESHOLD = 22
    merged = _merge_lines(column_blocks, Y_THRESHOLD)

    # 5. Final clean-up: drop very short cells and mostly-numeric cells.
    final = [
        line for line in merged
        if len(normalize(line)) >= 4
        and sum(c.isdigit() for c in line) <= len(line) / 2
    ]
    if not final:
        return "Aucune cellule texte valide trouvée."

    # 6. Numbered result.
    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
# -------------------------------------------------
# Gradio interface (Hugging Face)
# -------------------------------------------------
# One image input, one text output; all user-facing strings stay in French.
demo = gr.Interface(
    fn=extract_second_column,
    inputs=gr.Image(type="pil", label="Image du tableau"),
    outputs=gr.Textbox(label="Contenu de la colonne 2"),
    title="Extraction fiable de la colonne 2",
    description=(
        "Extraction robuste de la deuxième colonne des tableaux scannés "
        "(Désignation, DESIGNATIONS, Description, Description des services)."
    ),
)
# Listen on every network interface, on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)