Spaces:
Sleeping
Sleeping
| # app.py | |
import json
import re
import unicodedata

import gradio as gr
import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# Load the TrOCR processor and encoder-decoder model once at import time so
# that every request reuses the same instances.
# NOTE(review): "stage1" looks like the pre-trained-only checkpoint — confirm
# whether a fine-tuned variant (printed / handwritten) was intended.
_TROCR_CHECKPOINT = "microsoft/trocr-base-stage1"
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
model.eval()  # switch to inference mode; no training happens in this app
def ocr_trocr(pil_image):
    """Run the TrOCR model on an image and return the decoded text.

    Args:
        pil_image: Image object exposing ``convert`` (a PIL image). It is
            converted to RGB before being handed to the processor.

    Returns:
        The first decoded sequence, with special tokens stripped.
    """
    rgb = pil_image.convert("RGB")
    encoded = processor(images=rgb, return_tensors="pt")
    pixels = encoded.pixel_values
    # Generation is inference-only, so gradient tracking is switched off.
    with torch.no_grad():
        token_ids = model.generate(pixels)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]
def est_carte_identite_guineenne(texte):
    """Heuristically decide whether OCR text comes from a Guinean ID card.

    The text is upper-cased and stripped of diacritics, then searched for a
    fixed list of keywords; at least three of them must be present.

    Matching is by plain substring, so overlapping keywords each count
    (for example the word "PRENOM" also satisfies the keyword "NOM").

    Args:
        texte: Raw OCR text. ``None`` or an empty string yields ``False``.

    Returns:
        ``True`` when three or more keywords are found, ``False`` otherwise.
    """
    if not texte:
        return False

    # The keywords below are unaccented, but French text on the card is not
    # ("RÉPUBLIQUE", "GUINÉE", "IDENTITÉ", ...). Decompose accented letters
    # and drop the combining marks so both spellings match.
    decompose = unicodedata.normalize(
        "NFKD", texte.upper().replace("’", "'")
    )
    normalise = "".join(
        c for c in decompose if not unicodedata.combining(c)
    )

    mots_cles = (
        "CARTE", "IDENTITE", "GUINEE", "GUINEENNE", "REPUBLIQUE",
        "CEDEAO", "GIN", "DATE DE NAISSANCE", "NUMERO", "MSPC",
        "NOM", "PRENOM",
    )
    return sum(mot in normalise for mot in mots_cles) >= 3
# Field name -> compiled pattern. Every pattern exposes the extracted value
# as capture group 1. Compiled once at import time instead of on every call.
_PATTERNS_CHAMPS = {
    # The look-behind stops "NOM" from matching inside "PRENOM".
    "nom": re.compile(r"(?<![A-Z])NOM\s*[:\-]?\s*([A-Z\-]+)"),
    "prenom": re.compile(r"PRENOM\s*[:\-]?\s*([A-Z\-]+)"),
    "sexe": re.compile(r"SEXE\s*[:\-]?\s*([MF])"),
    "taille": re.compile(r"TAILLE\s*[:\-]?\s*([0-9,.]+\s?M)"),
    "nationalite": re.compile(r"NATIONALITE\s*[:\-]?\s*([A-Z]+)"),
    # Accepts both French and English three-letter month abbreviations.
    "date_naissance": re.compile(
        r"(\d{2}\s(?:JAN|FEB|FEV|MAR|APR|AVR|MAI|MAY|JUN|JUL|AOU|AUG"
        r"|SEP|OCT|NOV|DEC)\s\d{4})"
    ),
    # Digit look-arounds require the run to be exactly 16 / 15 digits long,
    # so a 16-digit id is not also reported as a 15-digit NIN.
    "numero_id": re.compile(r"(?<!\d)(\d{16})(?!\d)"),
    "nin": re.compile(r"(?<!\d)(\d{15})(?!\d)"),
    "date_emission": re.compile(
        r"DATE D['’]?EMISSION\s*[:\-]?\s*(\d{2}\s\w+\s\d{4})"
    ),
    "date_expiration": re.compile(
        r"DATE D['’]?EXPIRATION\s*[:\-]?\s*(\d{2}\s\w+\s\d{4})"
    ),
    # Wrapped in a group so that group(1) exists like for the other fields.
    "lieu": re.compile(
        r"(CONAKRY|KANKAN|NZEREKORE|LABE|KINDIA|BOKE|FARANAH)"
    ),
}


def extraire_donnees(texte):
    """Extract structured fields from the OCR text of an ID card.

    The text is upper-cased and stripped of diacritics before matching, so
    returned values are upper-case and unaccented.

    Args:
        texte: Raw OCR text. ``None`` or an empty string yields ``{}``.

    Returns:
        A dict mapping each field name whose pattern matched (``"nom"``,
        ``"prenom"``, ``"sexe"``, ``"taille"``, ``"nationalite"``,
        ``"date_naissance"``, ``"numero_id"``, ``"nin"``,
        ``"date_emission"``, ``"date_expiration"``, ``"lieu"``) to the first
        matching value. Fields without a match are omitted.
    """
    if not texte:
        return {}

    # Labels in the patterns are unaccented; fold "É" -> "E" etc. so that
    # "PRÉNOM" or "NATIONALITÉ" in the OCR output are still recognised.
    decompose = unicodedata.normalize("NFKD", texte.upper())
    normalise = "".join(
        c for c in decompose if not unicodedata.combining(c)
    )

    data = {}
    for champ, motif in _PATTERNS_CHAMPS.items():
        trouve = motif.search(normalise)
        if trouve:
            data[champ] = trouve.group(1)
    return data
def analyse_carte(recto_img, verso_img):
    """OCR both sides of a card, validate it and extract its fields.

    Args:
        recto_img: Image of the front side, or ``None`` if not provided.
        verso_img: Image of the back side, or ``None`` if not provided.

    Returns:
        A ``(message, champs)`` tuple. On success ``message`` is the OCR
        text of both sides joined by a newline and ``champs`` is the dict of
        extracted fields. If an image is missing, the text does not look
        like a Guinean ID card, or processing fails, ``message`` explains
        why and ``champs`` is an empty dict.
    """
    # A side that was not uploaded arrives as None; report that clearly
    # instead of surfacing an attribute error from the OCR step.
    if recto_img is None or verso_img is None:
        return "Veuillez fournir les deux faces (recto et verso) de la carte.", {}

    try:
        texte_recto = ocr_trocr(recto_img)
        texte_verso = ocr_trocr(verso_img)
        texte_total = texte_recto + "\n" + texte_verso
        if not est_carte_identite_guineenne(texte_total):
            return " Ce document ne semble pas être une carte d'identité guinéenne.", {}
        return texte_total, extraire_donnees(texte_total)
    except Exception as e:
        # UI boundary: any failure is turned into a message for the user
        # rather than propagating to the caller.
        return f"Erreur de traitement : {e}", {}
# Input widgets: one PIL image per side of the card.
_entrees = [
    gr.Image(type="pil", label="Image Recto"),
    gr.Image(type="pil", label="Image Verso"),
]

# Output widgets: the raw OCR text and the extracted fields as JSON.
_sorties = [
    gr.Textbox(label="Texte OCR extrait"),
    gr.JSON(label="Champs structurés extraits"),
]

# Wire the analysis function to the widgets above.
interface = gr.Interface(
    fn=analyse_carte,
    inputs=_entrees,
    outputs=_sorties,
    title="OCRIA - Lecture intelligente de carte d'identité guinéenne",
    description="Scannez les deux faces d'une carte d'identité guinéenne. Le système vérifie et extrait automatiquement les informations clés.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()