Spaces:

KJ24
/

Transcript_file_docx_doc_etc

Sleeping

App Files Community

KJ24 committed on May 15, 2025

Commit

cda8c80

verified ·

1 Parent(s): e97581f

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -48

app.py CHANGED Viewed

@@ -1,65 +1,62 @@
-from fastapi import FastAPI, UploadFile, File
-from fastapi.responses import JSONResponse
 import os
-from tempfile import NamedTemporaryFile
 import pandas as pd
 from docx import Document
 import pdfplumber
-app = FastAPI()
-def extract_text_from_docx(file_path):
-    doc = Document(file_path)
-    return "\n".join([p.text for p in doc.paragraphs])
-def extract_text_from_pdf(file_path):
-    text = ""
-    with pdfplumber.open(file_path) as pdf:
-        for page in pdf.pages:
-            text += page.extract_text() + "\n"
-    return text
-def extract_text_from_sheet(file_path):
-    if file_path.endswith(".csv"):
-        df = pd.read_csv(file_path)
-    else:
-        df = pd.read_excel(file_path)
-    return df.to_string(index=False)
-@app.post("/upload")
-async def upload(file: UploadFile = File(...)):
-    extension = os.path.splitext(file.filename)[1].lower()
-    mime_type = file.content_type
-    with NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
-        temp_file.write(await file.read())
-        temp_path = temp_file.name
     try:
-        if extension == ".docx":
-            texte = extract_text_from_docx(temp_path)
-        elif extension == ".pdf":
-            texte = extract_text_from_pdf(temp_path)
-        elif extension in [".csv", ".xlsx"]:
-            texte = extract_text_from_sheet(temp_path)
         else:
-            return JSONResponse(status_code=400, content={"erreur": "Type de fichier non supporté"})
     except Exception as e:
-        return JSONResponse(status_code=500, content={"erreur": str(e)})
     finally:
-        os.remove(temp_path)
     return {
-        "nom_fichier": file.filename,
-        "type": mime_type,
         "texte": texte
     }
-@app.get("/")
-def ping():
-    return {"message": "L'API fonctionne ✅"}

+import gradio as gr
 import os
 import pandas as pd
 from docx import Document
 import pdfplumber
+from tempfile import NamedTemporaryFile
def extract_text(file):
    """Extract the raw text of an uploaded .docx, .pdf, .csv or .xlsx file.

    Args:
        file: The upload handed over by the Gradio ``File`` input. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object exposing
            ``.name`` (path or original filename) and, for uploads that are
            not on disk, ``.read()``.

    Returns:
        On success, a dict with the keys ``"nom_fichier"`` (base name of the
        upload), ``"type"`` (MIME type, or ``"spreadsheet"`` for CSV/XLSX)
        and ``"texte"`` (the extracted text). On failure, a French error
        message as a plain string; extraction errors are reported this way
        rather than raised.
    """
    if file is None:
        # Nothing was uploaded: answer with a message instead of failing on
        # the attribute access below.
        return "Aucun fichier fourni."

    # Accept both a bare path and a file-like wrapper carrying `.name`.
    if isinstance(file, (str, os.PathLike)):
        original_path = os.fspath(file)
    else:
        original_path = file.name
    suffix = os.path.splitext(original_path)[1].lower()

    # Reject unsupported types before doing any disk work.
    if suffix not in (".docx", ".pdf", ".csv", ".xlsx"):
        return "Type de fichier non supporté."

    tmp_path = None
    try:
        source_path = original_path
        if not os.path.exists(source_path) and hasattr(file, "read"):
            # The upload only exists in memory: persist it so the parsers,
            # which are given a path, can open it.
            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                # Record the path first so `finally` cleans up even if the
                # write fails.
                tmp_path = tmp.name
                tmp.write(file.read())
            source_path = tmp_path

        if suffix == ".docx":
            doc = Document(source_path)
            texte = "\n".join(p.text for p in doc.paragraphs)
            filetype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        elif suffix == ".pdf":
            with pdfplumber.open(source_path) as pdf:
                # A page without extractable text may yield None; treat it
                # as empty so the concatenation cannot raise TypeError.
                texte = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
            filetype = "application/pdf"
        else:
            if suffix == ".csv":
                df = pd.read_csv(source_path)
            else:
                df = pd.read_excel(source_path)
            texte = df.to_string(index=False)
            filetype = "spreadsheet"
    except Exception as e:
        return f"Erreur pendant l'extraction : {str(e)}"
    finally:
        # Only remove the copy created here, never the caller's file.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

    return {
        "nom_fichier": os.path.basename(original_path),
        "type": filetype,
        "texte": texte,
    }
+# Gradio interface: one file-upload input wired to extract_text, with the result shown through the "json" output
+demo = gr.Interface(
+    fn=extract_text,
+    inputs=gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)"),
+    outputs="json",
+    title="🧠 Extracteur de texte",
+    description="Envoie un fichier et récupère son contenu brut"
+)
+demo.launch()  # called at module level, so it runs whenever this file is executed or imported