"""Gradio app that returns the raw text content of an uploaded document."""

import os
from tempfile import NamedTemporaryFile

import gradio as gr
import pandas as pd
import pdfplumber
from docx import Document

_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
_SUPPORTED_SUFFIXES = (".docx", ".pdf", ".csv", ".xlsx")


def _read_document(path, suffix):
    """Return ``(text, file_type)`` for the document stored at *path*.

    *suffix* must be one of ``_SUPPORTED_SUFFIXES`` (lower-case, with the
    leading dot). Any parser error propagates to the caller.
    """
    if suffix == ".docx":
        paragraphs = Document(path).paragraphs
        return "\n".join(p.text for p in paragraphs), _DOCX_MIME

    if suffix == ".pdf":
        with pdfplumber.open(path) as pdf:
            # A page without a text layer (e.g. a scanned image) may give
            # None or "" from extract_text(); treat both as an empty page
            # instead of failing on `None + "\n"`.
            pages = [(page.extract_text() or "") + "\n" for page in pdf.pages]
        return "".join(pages), "application/pdf"

    # Remaining supported suffixes are the tabular formats.
    df = pd.read_csv(path) if suffix == ".csv" else pd.read_excel(path)
    return df.to_string(index=False), "spreadsheet"


def extract_text(file):
    """Extract the raw text of an uploaded .docx, .pdf, .csv or .xlsx file.

    Args:
        file: The value supplied by the ``gr.File`` input. Accepted forms are
            a path (``str`` / ``os.PathLike``) or an object exposing ``.name``
            and, optionally, ``.read()``. ``None`` means nothing was uploaded.

    Returns:
        On success, a dict with the keys ``"nom_fichier"`` (base name of the
        upload), ``"type"`` (MIME type, or ``"spreadsheet"`` for CSV/XLSX) and
        ``"texte"`` (extracted text). On failure, a plain error-message string.

    NOTE(review): the failure paths return a bare string while the output
    component is JSON -- confirm the installed Gradio version renders a
    non-JSON string as intended.
    """
    if file is None:
        return "Aucun fichier fourni."

    # Depending on how gr.File is configured, the upload arrives either as a
    # path string or as a file-like wrapper whose `.name` is the path.
    if isinstance(file, (str, os.PathLike)):
        source_name = os.fspath(file)
    else:
        source_name = file.name

    suffix = os.path.splitext(source_name)[1].lower()
    if suffix not in _SUPPORTED_SUFFIXES:
        return "Type de fichier non supporté."

    tmp_path = None
    try:
        if hasattr(file, "read") and not os.path.isfile(source_name):
            # Only an in-memory / non-disk stream needs a temporary copy.
            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                # Record the path before writing so `finally` can clean up
                # even if the write itself fails.
                tmp_path = tmp.name
                tmp.write(file.read())
            path = tmp_path
        else:
            # The upload already lives on disk: parse it in place.
            path = source_name
        texte, filetype = _read_document(path, suffix)
    except Exception as e:  # UI boundary: report the failure, don't crash
        return f"Erreur pendant l'extraction : {str(e)}"
    finally:
        if tmp_path is not None:
            os.remove(tmp_path)

    return {
        "nom_fichier": os.path.basename(source_name),
        "type": filetype,
        "texte": texte,
    }


# Gradio interface
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)"),
    outputs="json",
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
)

if __name__ == "__main__":
    # Start the web server only when run as a script, so importing this
    # module (tests, reuse of extract_text) has no side effect.
    demo.launch()