File size: 1,776 Bytes
cda8c80
2d8097b
e97581f
2d8097b
 
cda8c80
2d8097b
 
cda8c80
 
 
 
 
 
2d8097b
cda8c80
2d8097b
cda8c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d8097b
cda8c80
 
2d8097b
cda8c80
2d8097b
cda8c80
2d8097b
cda8c80
2d8097b
cda8c80
 
2d8097b
 
8dd4433
cda8c80
 
 
 
 
 
 
 
e97581f
cda8c80
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
import os
import pandas as pd
from docx import Document
import pdfplumber
from tempfile import NamedTemporaryFile


def extract_text(file):
    """Extract the raw text content of an uploaded document.

    Supported extensions are ``.docx``, ``.pdf``, ``.csv`` and ``.xlsx``;
    the extension is matched case-insensitively.

    Args:
        file: The uploaded file. Either a file-like object exposing
            ``.name`` and ``.read()``, or a filesystem path (``str`` or
            ``os.PathLike``). Which form arrives presumably depends on the
            installed Gradio version -- verify against the ``gr.File``
            documentation. ``None`` means nothing was uploaded.

    Returns:
        On success, a dict with the keys ``"nom_fichier"`` (base name of
        the upload), ``"type"`` (a MIME type, or ``"spreadsheet"`` for
        CSV/XLSX) and ``"texte"`` (the extracted text).
        On failure, a human-readable French error message (``str``):
        missing upload, unsupported extension, or any exception raised
        while copying or parsing the file.
    """
    if file is None:
        return "Aucun fichier fourni."

    # Resolve the on-disk name of the upload. Path-like inputs are handled
    # first because ``pathlib.Path.name`` is only the base name, not a path.
    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
    else:
        source_path = str(getattr(file, "name", ""))
    suffix = os.path.splitext(source_path)[1].lower()

    # Reject unsupported types before doing any I/O.
    if suffix not in (".docx", ".pdf", ".csv", ".xlsx"):
        return "Type de fichier non supporté."

    tmp_path = None
    try:
        if hasattr(file, "read"):
            # File-like upload: copy it to a temporary file so the parsers
            # below can open it by path. ``tmp_path`` is recorded before
            # writing so the ``finally`` clause can clean up even if the
            # copy itself fails.
            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp_path = tmp.name
                tmp.write(file.read())
            path = tmp_path
        else:
            # Path-style upload: the data is already on disk, read in place.
            path = source_path

        # Extraction depends on the file type.
        if suffix == ".docx":
            doc = Document(path)
            texte = "\n".join(p.text for p in doc.paragraphs)
            filetype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        elif suffix == ".pdf":
            with pdfplumber.open(path) as pdf:
                # ``extract_text()`` may return None for a page without a
                # text layer (e.g. a scanned image); treat it as empty
                # instead of failing the whole document.
                texte = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
            filetype = "application/pdf"

        else:
            # Only ".csv" and ".xlsx" can reach this branch.
            if suffix == ".csv":
                df = pd.read_csv(path)
            else:
                df = pd.read_excel(path)
            texte = df.to_string(index=False)
            filetype = "spreadsheet"

    except Exception as e:
        # Top-level handler boundary: report the problem to the UI instead
        # of crashing the request.
        return f"Erreur pendant l'extraction : {str(e)}"
    finally:
        # Remove only the copy we created, never the caller's own file.
        if tmp_path is not None:
            os.remove(tmp_path)

    # Shape of the successful result.
    return {
        "nom_fichier": os.path.basename(source_path),
        "type": filetype,
        "texte": texte
    }

# Gradio interface: a single file-upload input wired to extract_text,
# with the result rendered as JSON.
_upload_input = gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)")

demo = gr.Interface(
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
    fn=extract_text,
    inputs=_upload_input,
    outputs="json",
)

# Start the web app as soon as the module is executed.
demo.launch()