"""Rename and sort scanned notification PDFs (EXC version for notifications).

- Renaming scheme: AAAAMMJJ-NOM_PRENOM-DI1-DI2
- Duplicates: numeric suffix _X
- Complete version
"""

# Notebook setup (run once before using this script):
#   !pip install gradio
#   !apt-get -qq install poppler-utils tesseract-ocr > /dev/null
#   !pip install -q pdf2image pytesseract

import csv
import os
import re
import shutil
import zipfile
from datetime import datetime

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# Labels of the two classification modes; shared by the UI and process_pdfs
# so the comparison can never drift from the radio-button text.
_MODE_PAR_DATE = "Par date de séance (AAAAMMJJ)"
_MODE_PAR_NOM = "Par nom/référence (NOM_PRENOM-DI1-DI2)"

# Salutation followed by 2 to 4 name tokens.
_TITLE_PATTERN = re.compile(
    r"(?:Monsieur|Madame|Morveuwer|De heer)\s+((?:[\w\-éèêëàâäîïôöùûüç']+\s+){1,3}[\w\-éèêëàâäîïôöùûüç']+)",
    re.IGNORECASE,
)
# "SEANCE du DD/MM/YYYY" (or with dashes).
_SEANCE_PATTERN = re.compile(
    r"SEANCE\s+du\s+(\d{2})[\/\-](\d{2})[\/\-](\d{4})", re.IGNORECASE
)
# "n.réf : das/.../YYYY/NNN" -> (YYYY, NNN).
_REF_PATTERN = re.compile(
    r"n\.réf\s*[:\-]?\s*das\/(?:[\w]+\/)*(\d{4})\/(\d+)", re.IGNORECASE
)
# Organisational words that the name regex may swallow after the real name.
_ORG_WORDS_PATTERN = re.compile(
    r"(_?DEPARTEMENT|_?ACTION|_?DIRECTION|_?SERVICE|_?UNITE|_?DIVISION)+",
    re.IGNORECASE,
)


def _import_uploads(files, dest_folder):
    """Copy uploaded PDFs into *dest_folder*, extracting any ZIP archives.

    *files* may be ``None``, a single upload or a list of uploads; each
    upload is either a path string or an object exposing the path as
    ``.name``.
    """
    if files is None:
        return
    if not isinstance(files, (list, tuple)):
        files = [files]
    for item in files:
        # Fall back to the item itself when it is already a plain path.
        path = getattr(item, "name", item)
        if zipfile.is_zipfile(path):
            with zipfile.ZipFile(path, "r") as archive:
                archive.extractall(dest_folder)
        else:
            shutil.copy(path, dest_folder)


def _sanitize_name(raw_name):
    """Turn an OCR'd name into a filename-safe token (may be empty)."""
    cleaned = re.sub(r"[^\w]", "_", raw_name.strip())
    cleaned = re.sub(r"_+", "_", cleaned)
    cleaned = _ORG_WORDS_PATTERN.sub("", cleaned)
    return cleaned.strip("_")


def _reserve_filename(folder_path, base_filename, used_filenames):
    """Return a ``.pdf`` name for *base_filename* that is free in *folder_path*.

    The first occurrence gets no suffix, later ones get ``_1``, ``_2``, ...
    The counter in *used_filenames* is advanced past every name that is
    already taken, either by an earlier file of this run or on disk.
    """
    counter = used_filenames.get(base_filename, 0)
    while True:
        suffix = "" if counter == 0 else f"_{counter}"
        candidate = f"{base_filename}{suffix}.pdf"
        counter += 1
        if not os.path.exists(os.path.join(folder_path, candidate)):
            used_filenames[base_filename] = counter
            return candidate


def process_pdfs(files, classement_mode):
    """OCR, rename and classify the uploaded notification PDFs.

    Only PDFs located directly in the working folder are processed (PDFs
    inside sub-folders of an extracted ZIP are left untouched). For each
    one, the first page is rendered and OCR'd, then the name, session date
    and reference are extracted with regular expressions. The file is
    renamed ``AAAAMMJJ-NOM_PRENOM-DI1-DI2.pdf`` and moved into a folder
    chosen by *classement_mode*; files lacking the classification key, or
    failing with an exception, go to the ``erreurs`` folder.

    Args:
        files: Upload(s) from the Gradio file input (``None``, one item or
            a list; ZIP archives are extracted).
        classement_mode: Radio-button label; the date label classifies by
            session date, any other value classifies by name.

    Returns:
        Tuple ``(zip_path, csv_log_path, markdown_report)``.
    """
    pdf_folder = "pdf_folder"
    output_log = "rename_log.csv"
    errors_folder = os.path.join(pdf_folder, "erreurs")

    # Start from a clean working directory on every run.
    if os.path.exists(pdf_folder):
        shutil.rmtree(pdf_folder)
    os.makedirs(errors_folder, exist_ok=True)

    _import_uploads(files, pdf_folder)

    log_rows = [
        ["original_filename", "new_filename", "date_folder",
         "nom_prenom", "ref", "date_séance"]
    ]
    processed_files = []
    error_files = []
    used_filenames = {}

    # os.listdir returns a snapshot, so folders created below are not visited.
    for filename in os.listdir(pdf_folder):
        filepath = os.path.join(pdf_folder, filename)
        if not filename.lower().endswith(".pdf") or not os.path.isfile(filepath):
            continue

        try:
            # NOTE: the whole first page is OCR'd, not only its upper half.
            images = convert_from_path(filepath, first_page=1, last_page=1)
            text = pytesseract.image_to_string(images[0], lang='fra+eng')

            name_match = _TITLE_PATTERN.search(text)
            cleaned_name = _sanitize_name(name_match.group(1)) if name_match else ""
            # A match that sanitizes to nothing is treated as "no name".
            has_name = bool(cleaned_name)
            safe_name = cleaned_name if has_name else "NO_NAME"

            date_match = _SEANCE_PATTERN.search(text)
            if date_match:
                day, month, year = date_match.groups()
                date_str = f"{year}{month}{day}"
                date_formatted = f"{day}/{month}/{year}"
            else:
                date_str = "NO_DATE"
                date_formatted = "NA"

            ref_match = _REF_PATTERN.search(text)
            di1 = ref_match.group(1) if ref_match else "0000"
            di2 = ref_match.group(2) if ref_match else "0000"

            # Classification choice: only the folder actually used is created.
            if classement_mode == _MODE_PAR_DATE:
                folder_path = os.path.join(pdf_folder, date_str) if date_match else errors_folder
            else:
                folder_path = os.path.join(pdf_folder, safe_name) if has_name else errors_folder
            os.makedirs(folder_path, exist_ok=True)

            base_filename = f"{date_str}-{safe_name}-{di1}-{di2}"
            new_filename = _reserve_filename(folder_path, base_filename, used_filenames)
            os.rename(filepath, os.path.join(folder_path, new_filename))

            # Log the extracted values directly instead of re-parsing the
            # filename, so duplicate suffixes and NO_DATE cannot garble them.
            log_rows.append([
                filename,
                new_filename,
                date_str,
                safe_name.replace("_", " "),
                f"{di1}/{di2}",
                date_formatted,
            ])
            processed_files.append(f"✅ {filename} → {new_filename}")

        except Exception as e:
            # Per-file boundary: one bad PDF must not abort the whole batch.
            message = str(e)
            if os.path.exists(filepath):
                try:
                    shutil.move(filepath, os.path.join(errors_folder, filename))
                except OSError as move_error:
                    message = f"{message} | {move_error}"
            log_rows.append([
                filename,
                f"ERROR:{message.replace(',', ';')}",
                "NO_DATE",
                "NA",
                "NA",
                "NA",
            ])
            error_files.append(f"❌ {filename} (Erreur: {message})")

    # csv.writer quotes fields containing commas, quotes or newlines.
    with open(output_log, "w", encoding="utf-8", newline="") as f:
        csv.writer(f, lineterminator="\n").writerows(log_rows)

    shutil.make_archive("renamed_pdfs", 'zip', pdf_folder)

    if processed_files or error_files:
        last_files = "\n".join(processed_files[-5:] + error_files[-3:])
    else:
        last_files = "Aucun fichier traité"

    report = "\n".join([
        "",
        "**Traitement terminé !**",
        f"- Fichiers traités : {len(log_rows) - 1}",
        f"- Avec succès : {len(processed_files)}",
        f"- En erreur : {len(error_files)}",
        "- Derniers fichiers :",
        f"{last_files}",
        "",
    ])

    return "renamed_pdfs.zip", output_log, report


# Show the documentation directly inside the interface.
def afficher_doc():
    """Return the Markdown help text displayed at the top of the interface."""
    # Kept at column 0: Markdown would render indented lines as a code block.
    return """
# 📄 Documentation - Traitement des Notifications du Comité de l’Action Sociale

Cet outil vous permet de :

- 🧠 **Extraire automatiquement** le **nom**, la **référence** et la **date de séance** des notifications PDF (scans)
  - ✅ Valable uniquement pour les documents contenant : `Monsieur`, `Madame`, `Morveuwer`, `De heer`
  - 📅 Le classement repose sur la mention **"SEANCE du"**
  - 🆔 Extraction de la **référence DI (interne)**
  - 📄 Analyse uniquement de la **1ère page (moitié supérieure)** du document
- 🗂️ **Renommer les fichiers** selon le format : `AAAAMMJJ-NOM_PRENOM-DI1-DI2.pdf`
- 📆 **Classer automatiquement** les fichiers dans des dossiers selon la **date de séance** (format : `AAAAMMJJ`)
- 🔀 **Gérer les doublons** en ajoutant un suffixe `_x` si un nom existe déjà
- 📾 **Générer un fichier CSV de log** pour le suivi des traitements

👤 *Conçu pour les collaborateurs du CPAS Bruxelles*
📬 Contact : [omar.bajouk@cpasbxl.brussels](mailto:omar.bajouk@cpasbxl.brussels)
"""


# Gradio interface
with gr.Blocks(title="Renommer les notifications et trier par date du séance") as demo:
    gr.Markdown(afficher_doc())

    with gr.Row():
        input_files = gr.File(
            label="1. ZIP ou plusieurs PDFs",
            file_types=[".zip", ".pdf"],
            file_count="multiple",
        )
        class_option = gr.Radio(
            choices=[_MODE_PAR_DATE, _MODE_PAR_NOM],
            label="2. Choisir le mode de classement",
            value=_MODE_PAR_DATE,
        )

    btn_process = gr.Button("🚀 Traiter les fichiers")

    with gr.Row():
        output_zip = gr.File(label="2. PDFs Renommés (ZIP)")
        output_log = gr.File(label="3. Fichier Log (CSV)")

    output_report = gr.Markdown()

    btn_process.click(
        fn=process_pdfs,
        inputs=[input_files, class_option],
        outputs=[output_zip, output_log, output_report],
    )

demo.launch(share=True)