Spaces:
Sleeping
Sleeping
File size: 1,776 Bytes
cda8c80 2d8097b e97581f 2d8097b cda8c80 2d8097b cda8c80 2d8097b cda8c80 2d8097b cda8c80 2d8097b cda8c80 2d8097b cda8c80 2d8097b cda8c80 2d8097b cda8c80 2d8097b cda8c80 2d8097b 8dd4433 cda8c80 e97581f cda8c80 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import gradio as gr
import os
import pandas as pd
from docx import Document
import pdfplumber
from tempfile import NamedTemporaryFile
def extract_text(file):
    """Extract the raw text content of an uploaded document.

    Supported extensions (case-insensitive): ``.docx``, ``.pdf``, ``.csv``
    and ``.xlsx``.

    Args:
        file: The uploaded file. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing ``.name`` (used for the
            extension and the reported file name) and ``.read()`` (returning
            the file's bytes). ``None`` is tolerated.
            NOTE(review): which of these forms Gradio passes depends on the
            installed version and the ``gr.File`` ``type`` setting -- confirm.

    Returns:
        On success, a dict with the keys ``"nom_fichier"`` (base name of the
        upload), ``"type"`` (a MIME type for .docx/.pdf, ``"spreadsheet"``
        for .csv/.xlsx) and ``"texte"`` (the extracted text).
        Otherwise a French, user-facing message string: no file supplied,
        unsupported extension, or the error raised during extraction.
    """
    if file is None:
        return "Aucun fichier fourni."

    is_path = isinstance(file, (str, os.PathLike))
    source_name = os.fspath(file) if is_path else file.name
    suffix = os.path.splitext(source_name)[1].lower()

    # Reject unknown types up front so nothing is copied to disk for them.
    if suffix not in (".docx", ".pdf", ".csv", ".xlsx"):
        return "Type de fichier non supporté."

    if is_path:
        with open(source_name, "rb") as src:
            payload = src.read()
    else:
        payload = file.read()

    tmp_path = None
    try:
        # Work on a private temporary copy; the path is recorded before
        # writing so the cleanup below also runs if the write itself fails.
        with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(payload)

        if suffix == ".docx":
            doc = Document(tmp_path)
            texte = "\n".join(p.text for p in doc.paragraphs)
            filetype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        elif suffix == ".pdf":
            with pdfplumber.open(tmp_path) as pdf:
                # extract_text() may yield None for a page without a text
                # layer (e.g. a scanned image); treat that as an empty page.
                texte = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
            filetype = "application/pdf"
        else:
            if suffix == ".csv":
                df = pd.read_csv(tmp_path)
            else:
                df = pd.read_excel(tmp_path)
            texte = df.to_string(index=False)
            filetype = "spreadsheet"
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return f"Erreur pendant l'extraction : {e}"
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

    return {
        "nom_fichier": os.path.basename(source_name),
        "type": filetype,
        "texte": texte,
    }
# Gradio interface: a single file-upload input wired to extract_text, with
# the handler's return value rendered as JSON.
_upload_input = gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)")

demo = gr.Interface(
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
    fn=extract_text,
    inputs=_upload_input,
    outputs="json",
)

# Start the web app as soon as this module is executed.
demo.launch()
|