Spaces:

lsottani
/

Prep_my_data

Running

File size: 5,252 Bytes

#!/usr/bin/env python

import os
import re
import tempfile
from pathlib import Path

import pdfplumber
import docx
import gradio as gr

# Typographic characters and the plain-text form each one is rewritten to.
# "§" and "°" map to themselves: listing them here is what lets them survive
# the character filter below.
_TYPOGRAPHIC_REPLACEMENTS = {
    "’": "'", "‘": "'", "“": '"', "”": '"',
    "«": '"', "»": '"', "–": "-", "—": "-",
    "…": "...", "œ": "oe", "Œ": "OE",
    "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
    "™": "TM", "§": "§", "°": "°", "±": "+/-",
    "×": "x", "÷": "/",
}
_TYPOGRAPHIC_TABLE = str.maketrans(_TYPOGRAPHIC_REPLACEMENTS)

# Characters kept verbatim: ASCII letters, accented Latin letters, digits,
# whitespace and basic punctuation.
_KEPT_CHARS = r"a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)"

# Anything that is neither kept verbatim nor a key of the replacement table.
_DISALLOWED_RE = re.compile(
    "[^" + _KEPT_CHARS + re.escape("".join(_TYPOGRAPHIC_REPLACEMENTS)) + "]"
)
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text_for_rag(text: str) -> str:
    """Normalize and clean text for use in a RAG pipeline.

    Three steps are applied in order:

    1. Remove every character that is neither in the kept set (letters,
       digits, whitespace, basic punctuation) nor a typographic character
       with a known replacement.
    2. Rewrite typographic characters to their plain form (curly quotes to
       straight quotes, dashes to "-", ligatures to letter pairs, ...).
    3. Collapse each whitespace run (including non-breaking spaces and
       newlines) to a single space and strip both ends.

    Filtering happens *before* the rewrite so that replacement text is not
    damaged by the filter: "±" really becomes "+/-" (not "-"), "÷" becomes
    "/", and "§" / "°" are preserved. A literal "+" or "/" in the input is
    still removed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-spaced text; an empty string if nothing remains.
    """
    text = _DISALLOWED_RE.sub("", text)
    text = text.translate(_TYPOGRAPHIC_TABLE)
    return _WHITESPACE_RE.sub(" ", text).strip()


def extract_and_clean_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF and return it cleaned.

    Pages for which pdfplumber yields no text (``None`` or an empty string)
    are skipped; the remaining page texts are joined with a single space and
    passed through ``clean_text_for_rag``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text of the whole document.
    """
    print(f"[+] Extraction du PDF : {pdf_path}")
    with pdfplumber.open(pdf_path) as document:
        # Extraction must happen while the document is still open.
        page_texts = [page.extract_text() for page in document.pages]
    non_empty = [content for content in page_texts if content]
    return clean_text_for_rag(" ".join(non_empty))


def extract_and_clean_docx(docx_path: str) -> str:
    """Read the paragraphs of a DOCX file and return their cleaned text.

    Each paragraph is stripped; paragraphs that are empty after stripping are
    dropped. The rest are joined with a single space and passed through
    ``clean_text_for_rag``.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The cleaned text of the document's paragraphs.
    """
    print(f"[+] Extraction du DOCX : {docx_path}")
    document = docx.Document(docx_path)
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return clean_text_for_rag(" ".join(chunk for chunk in stripped if chunk))

def extract_and_clean_txt(txt_path: str) -> str:
    """Read a plain-text file (txt, md, ...) and clean it line by line.

    The file is decoded as UTF-8. If that fails, it is re-read as cp1252 with
    undecodable bytes replaced, so that legacy Windows/Latin-1 exports are
    processed instead of aborting with ``UnicodeDecodeError``.

    Every line goes through ``clean_text_for_rag``. Lines that are empty once
    cleaned (blank lines, or lines made only of stripped symbols such as
    ``***``) are dropped so the output contains no empty lines.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The cleaned lines joined with ``"\\n"``.
    """
    print(f"[+] Lecture du fichier texte : {txt_path}")
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to a lenient single-byte decoding.
        # Replacement characters introduced here are removed by the cleaner.
        with open(txt_path, "r", encoding="cp1252", errors="replace") as f:
            lines = f.readlines()

    # The cleaner strips surrounding whitespace itself, so filtering on its
    # result covers both blank lines and symbol-only lines.
    cleaned = (clean_text_for_rag(line) for line in lines)
    return "\n".join(line for line in cleaned if line)

def process_file(input_file, output_name):
    """Extract, clean and export an uploaded document as a Markdown file.

    The extractor is chosen from the upload's extension: ``.pdf`` and
    ``.docx`` have dedicated extractors, anything else is read as plain text.

    Args:
        input_file: The upload, in one of the shapes handled below: ``None``,
            a filesystem path (``str``), a file-like object exposing
            ``read()`` and ``name``, or a sequence whose first item is one of
            those.
        output_name: Desired name of the result file. Only its final path
            component is used, ``.md`` is appended when missing, and a blank
            name falls back to ``output.md``.

    Returns:
        Path of the generated Markdown file (Gradio offers it for download),
        or ``None`` when there is no upload to process.
    """
    if input_file is None:
        return None

    # A sequence of uploads: only the first item is processed.
    if not hasattr(input_file, "read") and not isinstance(input_file, str):
        if not input_file:
            return None
        input_file = input_file[0]

    if isinstance(input_file, str):
        filename = input_file
        with open(input_file, "rb") as f:
            data = f.read()
    else:
        data = input_file.read()
        filename = input_file.name

    suffix = os.path.splitext(filename)[1]
    ext = suffix.lower()
    if ext == ".pdf":
        extract = extract_and_clean_pdf
    elif ext == ".docx":
        extract = extract_and_clean_docx
    else:
        extract = extract_and_clean_txt

    # Copy the upload to a uniquely named scratch file in the temp directory
    # so concurrent requests cannot overwrite each other's input, and remove
    # it once the text has been extracted.
    fd, tmp_path = tempfile.mkstemp(prefix="upload_", suffix=suffix)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        cleaned_text = extract(tmp_path)
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup: a leftover scratch file must not mask the
            # extraction result or the original error.
            pass

    # The output name comes from the user: keep only the last path component
    # so it cannot point outside the output directory (absolute path, "../").
    safe_name = os.path.basename((output_name or "").strip().replace("\\", "/"))
    if not safe_name:
        safe_name = "output.md"
    if not safe_name.lower().endswith(".md"):
        safe_name += ".md"

    # A fresh directory per call (still under the system temp directory)
    # keeps the chosen file name while preventing two requests that use the
    # same name from overwriting each other's result.
    out_dir = tempfile.mkdtemp(prefix="cleaned_")
    out_path = os.path.join(out_dir, safe_name)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    return out_path

# Gradio UI: an upload and output-name column on the left, the cleaned file
# offered for download on the right.
with gr.Blocks(title="Nettoyage de texte pour RAG") as demo:
    gr.Markdown("# 📄 Nettoyage d'un fichier pour optimisation de vos pipelines RAG")
    gr.Markdown(
        "Déposez simplement votre fichier : nous nous chargeons d’extraire son contenu textuel, de le nettoyer "
        "puis de vous le restituer en format markdown **sous le nom que vous choisissez.**"
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Accepted upload formats; process_file picks the extractor from
            # the file extension.
            input_file = gr.File(
                label="Déposez votre fichier ici",
                file_types=[".pdf", ".txt", ".md", ".docx"],
                file_count="single",
            )
            output_name = gr.Textbox(
                value="output.md",
                label="Nom du fichier de sortie (en .md)",
                placeholder="exemple.md",
                interactive=True,
            )
            submit_btn = gr.Button("Traiter le fichier", variant="primary")
        with gr.Column(scale=1):
            # Extensions in file_types are written with a leading dot.
            output_file = gr.File(
                label="Fichier nettoyé (.md)",
                file_types=[".md"],
            )

    # Run the extraction/cleaning when the button is clicked and expose the
    # resulting file path in the output component.
    submit_btn.click(
        fn=process_file,
        inputs=[input_file, output_name],
        outputs=output_file,
    )

    gr.Markdown(
        """
        ---
        **Prétraitements effectués :**
        - Suppression des symboles non imprimables et des caractères parasites  
        - Conservation des lettres (y compris accentuées), chiffres, espaces et ponctuation simple 
        - Normalisation des espaces pour un texte harmonieux  
        - Export automatique au format **`.md`**  
        
        """
    )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()