"""
# VERSION EXC POUR NOTIFICATIONS
# RENOMMAGE : AAAAMMJJ-NOM_PRENOM-DI1-DI2
# DOUBLONS : SUFFIXES _X
# VERSION COMPLETE
"""
"""
!pip install gradio
!apt-get -qq install poppler-utils tesseract-ocr > /dev/null
!pip install -q pdf2image pytesseract
"""
import csv
import os
import re
import shutil
import zipfile
from datetime import datetime

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
# Salutation followed by two to four name tokens; group 1 captures the name.
# NOTE(review): "Morveuwer" looks like a misspelling of the Dutch "Mevrouw" —
# confirm against real documents before changing the pattern.
_TITLE_PATTERN = re.compile(
    r"(?:Monsieur|Madame|Morveuwer|De heer)\s+((?:[\w\-éèêëàâäîïôöùûüç']+\s+){1,3}[\w\-éèêëàâäîïôöùûüç']+)",
    re.IGNORECASE,
)
# "SEANCE du DD/MM/YYYY" (or with dashes); groups are day, month, year.
_SEANCE_PATTERN = re.compile(r"SEANCE\s+du\s+(\d{2})[\/\-](\d{2})[\/\-](\d{4})", re.IGNORECASE)
# "n.réf: das/.../YYYY/NNN"; groups are the two trailing numeric parts (DI1, DI2).
_REF_PATTERN = re.compile(r"n\.réf\s*[:\-]?\s*das\/(?:[\w]+\/)*(\d{4})\/(\d+)", re.IGNORECASE)
# Organisational words that the name pattern tends to swallow after the surname.
_ORG_WORDS_PATTERN = re.compile(
    r"(_?DEPARTEMENT|_?ACTION|_?DIRECTION|_?SERVICE|_?UNITE|_?DIVISION)+",
    re.IGNORECASE,
)
_LOG_HEADER = ["original_filename", "new_filename", "date_folder", "nom_prenom", "ref", "date_séance"]
_MODE_BY_DATE = "Par date de séance (AAAAMMJJ)"


def _stage_uploads(files, pdf_folder):
    """Copy the uploads into *pdf_folder*, unpacking ZIP archives on the way."""
    if files is None:
        return
    uploads = files if isinstance(files, (list, tuple)) else [files]
    for upload in uploads:
        # Uploads may be objects exposing ``.name`` or plain path strings.
        source = getattr(upload, "name", upload)
        if zipfile.is_zipfile(source):
            with zipfile.ZipFile(source, "r") as archive:
                archive.extractall(pdf_folder)
        else:
            shutil.copy(source, pdf_folder)


def _extract_fields(text):
    """Extract the name, session date and DI reference from OCR *text*."""
    fields = {
        "safe_name": "NO_NAME",
        "has_name": False,
        "date_str": "NO_DATE",
        "date_display": "NA",
        "has_date": False,
        "di1": "0000",
        "di2": "0000",
    }

    name_match = _TITLE_PATTERN.search(text)
    if name_match:
        cleaned = re.sub(r"[^\w]", "_", name_match.group(1).strip())
        cleaned = re.sub(r"_+", "_", cleaned)
        cleaned = _ORG_WORDS_PATTERN.sub("", cleaned).strip("_")
        # A name made only of organisational words cleans down to nothing;
        # treat it as "no name" rather than producing an empty path part.
        if cleaned:
            fields["safe_name"] = cleaned
            fields["has_name"] = True

    date_match = _SEANCE_PATTERN.search(text)
    if date_match:
        day, month, year = date_match.groups()
        fields["date_str"] = f"{year}{month}{day}"
        fields["date_display"] = f"{day}/{month}/{year}"
        fields["has_date"] = True

    ref_match = _REF_PATTERN.search(text)
    if ref_match:
        fields["di1"], fields["di2"] = ref_match.group(1), ref_match.group(2)

    return fields


def _unique_destination(folder, base_filename, used_filenames):
    """Return ``(filename, path)`` in *folder* that does not exist yet, adding ``_N`` if needed."""
    counter = used_filenames.get(base_filename, 0)
    while True:
        candidate = f"{base_filename}.pdf" if counter == 0 else f"{base_filename}_{counter}.pdf"
        counter += 1
        candidate_path = os.path.join(folder, candidate)
        if not os.path.exists(candidate_path):
            used_filenames[base_filename] = counter
            return candidate, candidate_path


def process_pdfs(files, classement_mode):
    """Rename and sort uploaded notification PDFs using OCR of their first page.

    Args:
        files: One upload, a list of uploads, or ``None``. Each upload is either
            a path string or an object whose ``.name`` is a path. ZIP archives
            are extracted; anything else is copied as-is.
        classement_mode: ``"Par date de séance (AAAAMMJJ)"`` sorts files into
            per-date folders; any other value sorts them into per-name folders.

    Returns:
        A tuple ``(zip_path, log_path, report)``: the archive of the sorted
        tree, the CSV log, and a Markdown summary.

    Files are renamed ``AAAAMMJJ-NOM_PRENOM-DI1-DI2.pdf``. A file lacking the
    sort key of the chosen mode, or failing OCR, is placed in the ``erreurs``
    folder. Only PDFs at the top level of the working folder are processed;
    PDFs inside sub-directories of an extracted archive are left untouched.
    """
    pdf_folder = "pdf_folder"
    output_log = "rename_log.csv"
    errors_folder = os.path.join(pdf_folder, "erreurs")

    # Start every run from an empty working tree so old results never leak
    # into the new archive.
    if os.path.exists(pdf_folder):
        shutil.rmtree(pdf_folder)
    os.makedirs(errors_folder, exist_ok=True)  # also creates pdf_folder

    _stage_uploads(files, pdf_folder)

    sort_by_date = classement_mode == _MODE_BY_DATE
    log_rows = []
    processed_files = []
    error_files = []
    used_filenames = {}

    # Sorted so that duplicate suffixes are assigned deterministically.
    for filename in sorted(os.listdir(pdf_folder)):
        filepath = os.path.join(pdf_folder, filename)
        if not filename.lower().endswith(".pdf") or not os.path.isfile(filepath):
            continue
        try:
            images = convert_from_path(filepath, first_page=1, last_page=1)
            text = pytesseract.image_to_string(images[0], lang='fra+eng')
            fields = _extract_fields(text)

            # Only the folder for the selected mode is created.
            if sort_by_date:
                sortable, subfolder = fields["has_date"], fields["date_str"]
            else:
                sortable, subfolder = fields["has_name"], fields["safe_name"]
            folder_path = os.path.join(pdf_folder, subfolder) if sortable else errors_folder
            os.makedirs(folder_path, exist_ok=True)

            base_filename = (
                f"{fields['date_str']}-{fields['safe_name']}-{fields['di1']}-{fields['di2']}"
            )
            new_filename, new_path = _unique_destination(folder_path, base_filename, used_filenames)
            os.rename(filepath, new_path)

            # Log the extracted values directly instead of re-parsing the
            # filename, which broke on "NO_DATE" and on duplicate suffixes.
            log_rows.append([
                filename,
                new_filename,
                fields["date_str"],
                fields["safe_name"].replace("_", " "),
                f"{fields['di1']}/{fields['di2']}",
                fields["date_display"],
            ])
            processed_files.append(f"✅ {filename} → {new_filename}")
        except Exception as exc:  # per-file boundary: one bad PDF must not stop the batch
            # The file may already have been moved if the failure came late.
            if os.path.exists(filepath):
                shutil.move(filepath, os.path.join(errors_folder, filename))
            log_rows.append([filename, f"ERROR:{exc}", "NO_DATE", "NA", "NA", "NA"])
            error_files.append(f"❌ {filename} (Erreur: {str(exc)})")

    # The csv module quotes commas, quotes and newlines in names and messages.
    with open(output_log, "w", encoding="utf-8", newline="") as log_file:
        writer = csv.writer(log_file, lineterminator="\n")
        writer.writerow(_LOG_HEADER)
        writer.writerows(log_rows)

    shutil.make_archive("renamed_pdfs", 'zip', pdf_folder)

    if processed_files or error_files:
        last_files = "\n".join(processed_files[-5:] + error_files[-3:])
    else:
        last_files = "Aucun fichier traité"

    report = (
        "\n**Traitement terminé !**\n"
        f"- Fichiers traités : {len(log_rows)}\n"
        f"- Avec succès : {len(processed_files)}\n"
        f"- En erreur : {len(error_files)}\n"
        "- Derniers fichiers :\n"
        f"{last_files}\n"
    )
    return "renamed_pdfs.zip", output_log, report
# Returns the user documentation shown at the top of the interface.
def afficher_doc():
    """Return the Markdown help text describing what the tool does."""
    doc_markdown = """
# 📄 Documentation - Traitement des Notifications du Comité de l’Action Sociale
Cet outil vous permet de :
- 🧠 **Extraire automatiquement** le **nom**, la **référence** et la **date de séance** des notifications PDF (scans)
- ✅ Valable uniquement pour les documents contenant : `Monsieur`, `Madame`, `Morveuwer`, `De heer`
- 📅 Le classement repose sur la mention **"SEANCE du"**
- 🆔 Extraction de la **référence DI (interne)**
- 📄 Analyse uniquement de la **1ère page (moitié supérieure)** du document
- 🗂️ **Renommer les fichiers** selon le format : `AAAAMMJJ-NOM_PRENOM-DI1-DI2.pdf`
- 📆 **Classer automatiquement** les fichiers dans des dossiers selon la **date de séance** (format : `AAAAMMJJ`)
- 🔀 **Gérer les doublons** en ajoutant un suffixe `_x` si un nom existe déjà
- 📾 **Générer un fichier CSV de log** pour le suivi des traitements
👤 *Conçu pour les collaborateurs du CPAS Bruxelles*
📬 Contact : [omar.bajouk@cpasbxl.brussels](mailto:omar.bajouk@cpasbxl.brussels)
"""
    return doc_markdown
# Gradio interface: upload widgets, sorting-mode selector and result outputs.
# NOTE(review): the source's indentation was lost; the nesting of the Row
# contents below is reconstructed — confirm the intended layout.
with gr.Blocks(title="Renommer les notifications et trier par date du séance") as demo:
    gr.Markdown(afficher_doc())

    with gr.Row():
        input_files = gr.File(
            label="1. ZIP ou plusieurs PDFs",
            file_types=[".zip", ".pdf"],
            file_count="multiple",
        )
        class_option = gr.Radio(
            choices=["Par date de séance (AAAAMMJJ)", "Par nom/référence (NOM_PRENOM-DI1-DI2)"],
            label="2. Choisir le mode de classement",
            value="Par date de séance (AAAAMMJJ)",
        )

    btn_process = gr.Button("🚀 Traiter les fichiers")

    with gr.Row():
        # Step numbers continue after the radio ("2."), which the original labels reused.
        output_zip = gr.File(label="3. PDFs Renommés (ZIP)")
        output_log = gr.File(label="4. Fichier Log (CSV)")

    output_report = gr.Markdown()

    # The click handler must be registered inside the Blocks context.
    btn_process.click(
        fn=process_pdfs,
        inputs=[input_files, class_option],
        outputs=[output_zip, output_log, output_report],
    )

# Launch only when run as a script, so importing this module does not start a server.
# NOTE(review): share=True requests a public link; confirm that is intended
# given the documents contain personal data.
if __name__ == "__main__":
    demo.launch(share=True)
|