Spaces:

lsottani
/

RAG_file_preprocessing

Runtime error

App Files Files Community

lsottani committed 4 days ago

Commit

f71d7f0

verified ·

1 Parent(s): e871e70

Upload file_cleaning_ui.py

Browse files

Files changed (1) hide show

file_cleaning_ui.py +142 -0

file_cleaning_ui.py ADDED Viewed

	@@ -0,0 +1,142 @@

+#!/usr/bin/env python
+import os
+import re
+import tempfile
+from pathlib import Path
+import pdfplumber
+import docx
+import gradio as gr
# Typographic characters and their plain-text replacements. The table is built
# once at import time instead of being re-created for every matched character.
_TYPOGRAPHIC_TABLE = str.maketrans({
    "’": "'", "‘": "'", "“": '"', "”": '"',
    "«": '"', "»": '"', "–": "-", "—": "-",
    "…": "...", "œ": "oe", "Œ": "OE",
    "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
    "™": "TM", "±": "+/-", "×": "x", "÷": "/",
})

# Everything NOT listed here is deleted. "+" and "/" are part of the whitelist
# because the table above produces them ("±" -> "+/-", "÷" -> "/"); without
# them "5 ± 2" would be silently turned into "5 - 2".
_DISALLOWED_CHARS = re.compile(
    r'[^a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)\+\/]'
)

# Any run of whitespace; in a str pattern ``\s`` also matches the no-break
# spaces U+00A0 and U+202F, so they end up as ordinary single spaces.
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text_for_rag(text: str) -> str:
    """Normalise and clean a piece of text for use in a RAG pipeline.

    The cleaning happens in three passes:

    1. Typographic characters (curly quotes, guillemets, dashes, ellipsis,
       ligatures, (c)/(R)/TM signs, plus-minus, multiplication and division
       signs) are replaced by plain-text equivalents.
    2. Every character outside the whitelist is removed. The whitelist keeps
       ASCII letters and digits, the accented Latin ranges ``À-ÿ`` and
       ``æ-œ``, whitespace, and the punctuation
       ``. , : ; ! ? - _ ' " \\ ( ) + /``.
    3. Runs of whitespace (including newlines and no-break spaces) are
       collapsed to a single space and the result is stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-line text. An empty string is returned when
        nothing survives the cleaning.
    """
    normalised = text.translate(_TYPOGRAPHIC_TABLE)
    filtered = _DISALLOWED_CHARS.sub('', normalised)
    return _WHITESPACE_RUN.sub(' ', filtered).strip()
def extract_and_clean_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF and return it cleaned.

    Pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are skipped. The remaining page texts are joined with single
    spaces and passed through ``clean_text_for_rag``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text of the whole document.
    """
    print(f"[+] Extraction du PDF : {pdf_path}")
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    # filter(None, ...) drops the falsy entries, i.e. pages without text.
    return clean_text_for_rag(" ".join(filter(None, page_texts)))
def extract_and_clean_docx(docx_path: str) -> str:
    """Read the paragraphs of a DOCX file and return their cleaned text.

    Each paragraph's text is stripped; paragraphs that are empty after
    stripping are ignored. The rest are joined with single spaces and passed
    through ``clean_text_for_rag``. Only ``doc.paragraphs`` is read.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The cleaned text of the document's paragraphs.
    """
    print(f"[+] Extraction du DOCX : {docx_path}")
    document = docx.Document(docx_path)
    stripped_texts = (paragraph.text.strip() for paragraph in document.paragraphs)
    return clean_text_for_rag(" ".join(chunk for chunk in stripped_texts if chunk))
def extract_and_clean_txt(txt_path: str) -> str:
    """Read a plain-text file (txt, md, ...) and clean it line by line.

    Every line is passed through ``clean_text_for_rag`` and the non-empty
    results are joined with newlines, so the line structure of the input is
    preserved while blank lines are dropped.

    Args:
        txt_path: Path of the text file to read. It is decoded as UTF-8.

    Returns:
        The cleaned lines joined by ``"\\n"``.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    print(f"[+] Lecture du fichier texte : {txt_path}")
    with open(txt_path, "r", encoding="utf-8") as f:
        # Iterate the file object directly rather than loading all raw lines
        # with readlines(). clean_text_for_rag already strips its result, so
        # no separate strip() is needed before calling it.
        cleaned_lines = [clean_text_for_rag(line) for line in f]
    # Filter AFTER cleaning: a line made only of characters that the cleaner
    # removes (e.g. "====" or "***") becomes empty and must not leave a blank
    # line in the output.
    return "\n".join(line for line in cleaned_lines if line)
def process_file(input_file: gr.File, output_name: str) -> str:
    """Extract, clean and export an uploaded file as Markdown.

    Steps:
    - Detect the file type from its extension (``.pdf``, ``.docx``; anything
      else is read as plain text).
    - Run the matching extraction + cleaning function.
    - Write the result to a file named after ``output_name`` inside a fresh
      temporary directory.
    - Return the path of that file (the click handler routes it to the
      output ``gr.File`` component).

    Args:
        input_file: The value received from the input ``gr.File`` component:
            either an object exposing the temporary path as ``.name`` or a
            plain path string (assumed to depend on the Gradio version --
            TODO confirm).
        output_name: File name requested by the user. Only its final path
            component is used, ``.md`` is appended when missing, and
            ``output.md`` is used when nothing usable remains.

    Returns:
        Path of the generated ``.md`` file.

    Raises:
        gr.Error: If no file was provided.
    """
    if input_file is None:
        # Without this guard a click with no upload fails on ``None.name``.
        raise gr.Error("Veuillez déposer un fichier avant de lancer le traitement.")

    input_path = getattr(input_file, "name", input_file)
    _, ext = os.path.splitext(input_path.lower())

    # Extension -> extractor; unknown extensions fall back to plain text.
    extractors = {
        ".pdf": extract_and_clean_pdf,
        ".docx": extract_and_clean_docx,
    }
    cleaned_text = extractors.get(ext, extract_and_clean_txt)(input_path)

    # The name comes straight from a textbox: keep only the last path
    # component so values such as "../x.md" or "/abs/x.md" cannot make the
    # write below land outside the temporary directory.
    safe_name = os.path.basename((output_name or "").strip())
    if not safe_name.lower().endswith(".md"):
        safe_name = f"{safe_name}.md"
    if safe_name.lower() == ".md":
        # Empty name (or one ending in a path separator): use a default.
        safe_name = "output.md"

    # The directory is intentionally not removed here: the returned file must
    # still exist when it is served for download.
    temp_dir = tempfile.mkdtemp()
    out_path = os.path.join(temp_dir, safe_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)
    return out_path
# Gradio UI: an upload widget and an output-name textbox on the left, the
# cleaned file offered for download on the right.
with gr.Blocks(title="Nettoyage de texte pour RAG") as demo:
    gr.Markdown("# 📄 Nettoyage d'un fichier pour utilisation RAG")
    gr.Markdown(
        "Déposez simplement votre fichier : nous nous chargeons d’extraire son contenu textuel, de le nettoyer "
        "puis de vous le restituer en format markdown **sous le nom que vous choisissez.**"
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(
                label="Déposez votre fichier ici",
                # Extensions must carry their leading dot; a bare token such
                # as "pdf" is not a file extension for Gradio.
                file_types=[".pdf", ".txt", ".md", ".docx"],
            )
            output_name = gr.Textbox(
                value="output.md",
                label="Nom du fichier de sortie (en .md)",
                placeholder="exemple.md",
                interactive=True,
            )
            submit_btn = gr.Button("Traiter le fichier", variant="primary")
        with gr.Column(scale=1):
            output_file = gr.File(
                label="Fichier nettoyé (.md)",
                file_types=[".md"],
            )
    # Clicking the button runs process_file on the upload and the chosen name,
    # and puts the returned path into the download component.
    submit_btn.click(
        fn=process_file,
        inputs=[input_file, output_name],
        outputs=output_file,
    )
    gr.Markdown(
        """
        ---
        **Prétraitements effectués :**
        - Suppression des symboles non imprimables et des caractères parasites
        - Conservation des lettres (y compris accentuées), chiffres, espaces et ponctuation simple
        - Normalisation des espaces pour un texte harmonieux
        - Export automatique au format **`.md`**
        """
    )
# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)