Spaces:

lsottani
/

RAG_file_preprocessing

Runtime error

App Files Community

Laurine Sottani committed 2 days ago

Commit

3550f03

1 Parent(s): cf99660

fix file support

Browse files

Files changed (2) hide show

file_cleaning_ui.py +15 -13
requirements.txt +1 -0

file_cleaning_ui.py CHANGED Viewed

@@ -6,6 +6,7 @@ import tempfile
 from pathlib import Path
 import pdfplumber
 import gradio as gr
 def clean_text_for_rag(text: str) -> str:
@@ -41,6 +42,17 @@ def extract_and_clean_pdf(pdf_path: str) -> str:
     return clean_text_for_rag(" ".join(all_pages))
 def extract_and_clean_txt(txt_path: str) -> str:
     """Lit un fichier texte (txt, md, …) et le nettoie."""
     print(f"[+] Lecture du fichier texte : {txt_path}")
@@ -53,18 +65,6 @@ def extract_and_clean_txt(txt_path: str) -> str:
     ]
     return "\n".join(cleaned)
-def extract_and_clean_docx(docx_path: str) -> str:
-    """Lit un fichier type Word et le nettoie."""
-    print(f"[+] Lecture du fichier docx : {docx_path}")
-    with open(docx_path, "r", encoding="utf-8") as f:
-        lines = f.readlines()
-    cleaned = [
-        clean_text_for_rag(line.strip())
-        for line in lines
-        if line.strip()
-    ]
-    return "\n".join(cleaned)
 def process_file(input_file: gr.File, output_name: str) -> str:
     """
     - Detecte le type (PDF ou texte)
@@ -77,6 +77,8 @@ def process_file(input_file: gr.File, output_name: str) -> str:
     if ext == ".pdf":
         cleaned_text = extract_and_clean_pdf(input_path)
     else:
         cleaned_text = extract_and_clean_txt(input_path)
@@ -103,7 +105,7 @@ with gr.Blocks(title="Nettoyage de texte pour RAG") as demo:
         with gr.Column(scale=1):
             input_file = gr.File(
                 label="Déposez votre fichier ici",
-                file_types=["pdf", "txt", "md", "file"],
             )
             output_name = gr.Textbox(
                 value="output.md",

 from pathlib import Path
 import pdfplumber
+import docx
 import gradio as gr
 def clean_text_for_rag(text: str) -> str:
     return clean_text_for_rag(" ".join(all_pages))
+def extract_and_clean_docx(docx_path: str) -> str:
+    """Lit un fichier DOCX et le nettoie."""
+    print(f"[+] Extraction du DOCX : {docx_path}")
+    doc = docx.Document(docx_path)
+    paragraphs = []
+    for para in doc.paragraphs:
+        text = para.text.strip()
+        if text:
+            paragraphs.append(text)
+    return clean_text_for_rag(" ".join(paragraphs))
 def extract_and_clean_txt(txt_path: str) -> str:
     """Lit un fichier texte (txt, md, …) et le nettoie."""
     print(f"[+] Lecture du fichier texte : {txt_path}")
     ]
     return "\n".join(cleaned)
 def process_file(input_file: gr.File, output_name: str) -> str:
     """
     - Detecte le type (PDF ou texte)
     if ext == ".pdf":
         cleaned_text = extract_and_clean_pdf(input_path)
+    elif ext == ".docx":
+        cleaned_text = extract_and_clean_docx(input_path)
     else:
         cleaned_text = extract_and_clean_txt(input_path)
         with gr.Column(scale=1):
             input_file = gr.File(
                 label="Déposez votre fichier ici",
+                file_types=["pdf", "txt", "md", "docx"],
             )
             output_name = gr.Textbox(
                 value="output.md",

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	pdfplumber


1	pdfplumber
2	+ python-docx