# KJ24's picture
# Update app.py
# cda8c80 verified
import gradio as gr
import os
import pandas as pd
from docx import Document
import pdfplumber
from tempfile import NamedTemporaryFile
def _extract_by_suffix(path, suffix):
    """Return ``(filetype, text)`` for the document at *path*.

    *suffix* is the lower-cased file extension (including the dot) and
    selects the parser. Returns ``None`` when the extension is not one of
    ``.docx``, ``.pdf``, ``.csv`` or ``.xlsx``.
    """
    if suffix == ".docx":
        doc = Document(path)
        return (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "\n".join(p.text for p in doc.paragraphs),
        )
    if suffix == ".pdf":
        with pdfplumber.open(path) as pdf:
            # extract_text() may return None for a page with no extractable
            # text (e.g. a scanned image); treat that as an empty page rather
            # than failing the whole document. Each page keeps its trailing
            # newline, as before.
            texte = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )
        return "application/pdf", texte
    if suffix == ".csv":
        return "spreadsheet", pd.read_csv(path).to_string(index=False)
    if suffix == ".xlsx":
        return "spreadsheet", pd.read_excel(path).to_string(index=False)
    return None


def extract_text(file):
    """Extract the raw text of an uploaded document.

    Args:
        file: Either a file-like object exposing ``.name`` (used for the
            extension and the reported file name) and ``.read()`` returning
            bytes, or a filesystem path (``str`` / ``os.PathLike``). ``None``
            (nothing uploaded) is also accepted.

    Returns:
        On success, a dict with keys ``"nom_fichier"`` (base name of the
        file), ``"type"`` (a MIME type, or ``"spreadsheet"`` for CSV/XLSX)
        and ``"texte"`` (the extracted text).
        Otherwise a human-readable message string: when no file was given,
        when the extension is unsupported, or when the parser raised.
    """
    if file is None:
        return "Aucun fichier fourni."

    # A path can be parsed in place; a file-like object is first copied to a
    # temporary file because the parsers are given a path.
    is_path = isinstance(file, (str, os.PathLike))
    name = os.fspath(file) if is_path else file.name
    suffix = os.path.splitext(name)[1].lower()

    tmp_path = None
    try:
        if is_path:
            path = name
        else:
            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                # Record the path before writing so the finally clause can
                # clean up even if read()/write() fails.
                tmp_path = tmp.name
                tmp.write(file.read())
            path = tmp_path

        # Only parser failures are turned into a message; errors while
        # copying the upload still propagate to the caller.
        try:
            extracted = _extract_by_suffix(path, suffix)
        except Exception as e:
            return f"Erreur pendant l'extraction : {e}"
    finally:
        # Only remove the copy we created, never a caller-supplied path.
        if tmp_path is not None:
            os.remove(tmp_path)

    if extracted is None:
        return "Type de fichier non supporté."

    filetype, texte = extracted
    return {
        "nom_fichier": os.path.basename(name),
        "type": filetype,
        "texte": texte,
    }
# Gradio interface: a single file-upload input wired to extract_text, with
# the result rendered as JSON.
file_input = gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)")
demo = gr.Interface(
    fn=extract_text,
    inputs=file_input,
    outputs="json",
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
)
# Launch the app as soon as this module is executed.
demo.launch()