import gradio as gr
import os
import pandas as pd
from docx import Document
import pdfplumber
from tempfile import NamedTemporaryFile


_SUPPORTED_SUFFIXES = (".docx", ".pdf", ".csv", ".xlsx")


def _parse_document(path, suffix):
    """Return ``(text, filetype)`` for the document at *path*.

    *suffix* must be one of ``_SUPPORTED_SUFFIXES``; it selects the parser.
    Any parser exception propagates to the caller.
    """
    if suffix == ".docx":
        doc = Document(path)
        text = "\n".join(p.text for p in doc.paragraphs)
        return text, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    if suffix == ".pdf":
        with pdfplumber.open(path) as pdf:
            # extract_text() may return None for a page without a text layer
            # (e.g. a scanned image); treat that as an empty page rather than
            # failing the whole document on `None + "\n"`.
            text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
        return text, "application/pdf"

    # Remaining supported suffixes are the tabular ones.
    df = pd.read_csv(path) if suffix == ".csv" else pd.read_excel(path)
    return df.to_string(index=False), "spreadsheet"


def extract_text(file):
    """Extract the raw text of an uploaded .docx, .pdf, .csv or .xlsx file.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing ``.name`` (the file's path/name). If the object
            also has a callable ``.read()``, its content is copied to a
            temporary file before parsing; otherwise the path is parsed
            directly. ``None`` (nothing uploaded) is accepted.

    Returns:
        On success, a dict with keys ``"nom_fichier"`` (base name of the
        input), ``"type"`` (a MIME type, or ``"spreadsheet"`` for csv/xlsx)
        and ``"texte"`` (the extracted text). On failure, a ``str`` holding
        a user-facing error message: no file given, unsupported extension,
        or an exception raised while reading/parsing.
    """
    if file is None:
        return "Aucun fichier fourni."

    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
        read = None
    else:
        source_path = file.name
        read = getattr(file, "read", None)

    # Reject unsupported types before touching the disk.
    suffix = os.path.splitext(source_path)[1].lower()
    if suffix not in _SUPPORTED_SUFFIXES:
        return "Type de fichier non supporté."

    tmp_path = None
    try:
        if callable(read):
            # Persist the stream so the path-based parsers can open it.
            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                # Record the path first so the finally-clause can clean up
                # even if reading or writing fails.
                tmp_path = tmp.name
                tmp.write(read())
            parse_path = tmp_path
        else:
            parse_path = source_path

        texte, filetype = _parse_document(parse_path, suffix)
    except Exception as e:
        return f"Erreur pendant l'extraction : {e}"
    finally:
        # Only remove the copy made here, never the caller's own file.
        if tmp_path is not None:
            os.remove(tmp_path)

    # Shape of the successful result
    return {
        "nom_fichier": os.path.basename(source_path),
        "type": filetype,
        "texte": texte,
    }

# Gradio interface: one file-upload input, with the value returned by
# extract_text rendered through the "json" output component.
upload_input = gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)")

demo = gr.Interface(
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
    fn=extract_text,
    inputs=upload_input,
    outputs="json",
)

# Launch the app as soon as this module is executed.
demo.launch()