Laurine Sottani committed on
Commit
cf99660
·
1 Parent(s): fa0c299

add docx support

Browse files
Files changed (1) hide show
  1. file_cleaning_ui.py +12 -0
file_cleaning_ui.py CHANGED
@@ -53,6 +53,18 @@ def extract_and_clean_txt(txt_path: str) -> str:
53
  ]
54
  return "\n".join(cleaned)
55
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def process_file(input_file: gr.File, output_name: str) -> str:
57
  """
58
  - Detecte le type (PDF ou texte)
 
53
  ]
54
  return "\n".join(cleaned)
55
 
56
def extract_and_clean_docx(docx_path: str) -> str:
    """Read a Word (.docx) file and return its cleaned text.

    A .docx file is a ZIP archive whose main body is stored as XML in
    ``word/document.xml``; it cannot be decoded as UTF-8 text directly.
    The text of every paragraph is extracted, stripped, passed through
    ``clean_text_for_rag`` and the non-empty results are joined with
    newlines.

    If ``docx_path`` is not a ZIP archive, the file is read as plain
    UTF-8 text line by line, which is what this function did before it
    understood the real .docx container.

    Args:
        docx_path: Path to the .docx file to read.

    Returns:
        The cleaned paragraphs (or lines), one per line.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        xml.etree.ElementTree.ParseError: If the document XML is malformed.
    """
    # Local imports keep this block self-contained without touching the
    # file-level import section.
    import xml.etree.ElementTree as ET
    import zipfile

    print(f"[+] Lecture du fichier docx : {docx_path}")

    if not zipfile.is_zipfile(docx_path):
        # Backward-compatible fallback: treat the input as plain text.
        with open(docx_path, "r", encoding="utf-8") as f:
            raw_chunks = [line.strip() for line in f]
    else:
        # WordprocessingML namespace used by paragraph (w:p) and text (w:t) nodes.
        w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
        with zipfile.ZipFile(docx_path) as archive:
            # NOTE(review): ElementTree is not hardened against maliciously
            # crafted XML; consider a hardened parser if uploads are untrusted.
            root = ET.fromstring(archive.read("word/document.xml"))
        # A paragraph's text is split across several runs; concatenate them.
        raw_chunks = [
            "".join(node.text or "" for node in paragraph.iter(f"{w_ns}t")).strip()
            for paragraph in root.iter(f"{w_ns}p")
        ]

    cleaned = [clean_text_for_rag(chunk) for chunk in raw_chunks if chunk]
    return "\n".join(cleaned)
67
+
68
  def process_file(input_file: gr.File, output_name: str) -> str:
69
  """
70
  - Detecte le type (PDF ou texte)