Spaces:

KJ24
/

Transcript_file_docx_doc_etc

Sleeping

File size: 1,776 Bytes

cda8c80
2d8097b
e97581f
2d8097b
 
cda8c80
2d8097b
 
cda8c80
 
 
 
 
 
2d8097b
cda8c80
2d8097b
cda8c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d8097b
cda8c80
 
2d8097b
cda8c80
2d8097b
cda8c80
2d8097b
cda8c80
2d8097b
cda8c80
 
2d8097b
 
8dd4433
cda8c80
 
 
 
 
 
 
 
e97581f
cda8c80

import gradio as gr
import os
import pandas as pd
from docx import Document
import pdfplumber
from tempfile import NamedTemporaryFile


def _read_upload(file, path):
    """Return the raw bytes of an upload given as a file-like object or a path."""
    reader = getattr(file, "read", None)
    if callable(reader):
        return reader()
    # No usable read(): the upload was handed over as a path on disk.
    with open(path, "rb") as handle:
        return handle.read()


def extract_text(file):
    """Extract the raw text of an uploaded .docx, .pdf, .csv or .xlsx file.

    Args:
        file: The upload. Either a file-like object exposing ``name`` (and
            optionally ``read()``), or a string holding the path of the
            uploaded file. ``None`` (nothing uploaded) is tolerated.

    Returns:
        On success, a dict with the keys ``"nom_fichier"`` (base name of the
        upload), ``"type"`` (a MIME type, or ``"spreadsheet"`` for CSV/XLSX)
        and ``"texte"`` (the extracted text). Otherwise a plain string
        describing the problem: no file, unsupported extension, or an error
        raised while parsing.

    NOTE(review): failures are returned as bare strings while successes are
    dicts; confirm that the "json" output component renders a bare string
    the way the UI expects.
    """
    if file is None:
        return "Aucun fichier fourni."

    # Accept both a path string and an object carrying the path in ``name``.
    path = file if isinstance(file, str) else file.name
    suffix = os.path.splitext(path)[1].lower()

    # Reject unknown extensions before copying anything to disk.
    if suffix not in (".docx", ".pdf", ".csv", ".xlsx"):
        return "Type de fichier non supporté."

    # Work on a private temporary copy; only this copy is deleted afterwards,
    # the caller's file is never removed.
    payload = _read_upload(file, path)
    with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name

    # Extraction depends on the file type.
    try:
        if suffix == ".docx":
            doc = Document(tmp_path)
            texte = "\n".join(p.text for p in doc.paragraphs)
            filetype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        elif suffix == ".pdf":
            with pdfplumber.open(tmp_path) as pdf:
                # A page without extractable text may yield None; treat it as
                # empty so one blank/scanned page does not fail the whole file.
                texte = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
            filetype = "application/pdf"

        else:
            # Only .csv and .xlsx can reach this branch.
            if suffix == ".csv":
                df = pd.read_csv(tmp_path)
            else:
                df = pd.read_excel(tmp_path)
            texte = df.to_string(index=False)
            filetype = "spreadsheet"

    except Exception as e:
        # Top-level boundary for the UI: report the parser error as text.
        return f"Erreur pendant l'extraction : {e}"
    finally:
        os.remove(tmp_path)

    # Shape of the successful result.
    return {
        "nom_fichier": os.path.basename(path),
        "type": filetype,
        "texte": texte,
    }

# Gradio interface: a single file upload whose result is shown as JSON.
_file_input = gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)")

demo = gr.Interface(
    extract_text,
    _file_input,
    "json",
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
)

# Start the web app as soon as this module is executed.
demo.launch()