KJ24 committed on
Commit
cda8c80
·
verified ·
1 Parent(s): e97581f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -48
app.py CHANGED
@@ -1,65 +1,62 @@
1
- from fastapi import FastAPI, UploadFile, File
2
- from fastapi.responses import JSONResponse
3
  import os
4
- from tempfile import NamedTemporaryFile
5
  import pandas as pd
6
  from docx import Document
7
  import pdfplumber
 
8
 
9
- app = FastAPI()
10
-
11
-
12
def extract_text_from_docx(file_path):
    """Return the text of every paragraph of a .docx file, one per line.

    Args:
        file_path: Path of the Word document; handed to ``Document``.

    Returns:
        The paragraph texts joined with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
15
-
16
-
17
def extract_text_from_pdf(file_path):
    """Return the text of a PDF, each page followed by a newline.

    Args:
        file_path: Path of the PDF; handed to ``pdfplumber.open``.

    Returns:
        The concatenation of every page's extracted text, with one
        trailing newline per page. A page that yields no text contributes
        only its newline.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can give None for a page without extractable text
        # (e.g. a scanned image); "None + str" would raise TypeError, so an
        # empty string is substituted.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
23
-
24
-
25
def extract_text_from_sheet(file_path):
    """Render a CSV or Excel file as plain text.

    Args:
        file_path: Path of the spreadsheet. A path ending in ``.csv`` is
            read with ``pd.read_csv``; anything else with ``pd.read_excel``.

    Returns:
        The table formatted by ``DataFrame.to_string`` without the index
        column.
    """
    reader = pd.read_csv if file_path.endswith(".csv") else pd.read_excel
    table = reader(file_path)
    return table.to_string(index=False)
31
-
32
-
33
@app.post("/upload")
async def upload(file: UploadFile = File(...)):
    """Receive an uploaded file and return its extracted text.

    The upload is copied to a temporary file whose suffix is the lowercased
    extension of the uploaded filename, the matching extractor is run on
    that copy, and the copy is removed whatever the outcome.

    Returns:
        On success, a dict with the keys ``nom_fichier``, ``type`` and
        ``texte``. For an unsupported extension, a 400 ``JSONResponse``;
        if extraction raises, a 500 ``JSONResponse`` carrying the message.
    """
    original_name = file.filename
    suffix = os.path.splitext(original_name)[1].lower()
    declared_type = file.content_type

    with NamedTemporaryFile(delete=False, suffix=suffix) as spooled:
        spooled.write(await file.read())
        saved_path = spooled.name

    try:
        # Map each supported extension to the function that reads it.
        extractors = {
            ".docx": extract_text_from_docx,
            ".pdf": extract_text_from_pdf,
            ".csv": extract_text_from_sheet,
            ".xlsx": extract_text_from_sheet,
        }
        extractor = extractors.get(suffix)
        if extractor is None:
            return JSONResponse(status_code=400, content={"erreur": "Type de fichier non supporté"})
        texte = extractor(saved_path)
    except Exception as error:
        return JSONResponse(status_code=500, content={"erreur": str(error)})
    finally:
        # Runs on every path, including the early returns above.
        os.remove(saved_path)

    return {
        "nom_fichier": original_name,
        "type": declared_type,
        "texte": texte,
    }
61
 
 
 
 
 
 
 
 
 
62
 
63
@app.get("/")
def ping():
    """Answer requests on the root path with a fixed status message."""
    status = {"message": "L'API fonctionne ✅"}
    return status
 
1
+ import gradio as gr
 
2
  import os
 
3
  import pandas as pd
4
  from docx import Document
5
  import pdfplumber
6
+ from tempfile import NamedTemporaryFile
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
def extract_text(file):
    """Extract the raw text of an uploaded .docx, .pdf, .csv or .xlsx file.

    Args:
        file: Either an object exposing ``name`` (a path or filename) and
            ``read()``, or a plain path string. ``None`` (nothing uploaded)
            is reported as an error instead of raising.

    Returns:
        On success, a dict with the keys ``nom_fichier`` (base name of the
        upload), ``type`` (a label for the detected format) and ``texte``
        (the extracted text). On failure, a dict with a single ``erreur``
        key holding the message; errors are dicts rather than bare strings
        so that every return value is JSON-serializable as an object.
    """
    if file is None:
        return {"erreur": "Aucun fichier fourni."}

    # A plain string has no ``name`` attribute and is itself the path.
    source_name = getattr(file, "name", file)
    suffix = os.path.splitext(source_name)[1].lower()

    if suffix not in (".docx", ".pdf", ".csv", ".xlsx"):
        return {"erreur": "Type de fichier non supporté."}

    if hasattr(file, "read"):
        # Copy the upload to a temporary file so the parsers get a real
        # path carrying the right extension.
        with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(file.read())
            tmp_path = tmp.name
        remove_after = True
    else:
        # Already a path on disk: read it in place and leave it alone.
        tmp_path = source_name
        remove_after = False

    try:
        if suffix == ".docx":
            doc = Document(tmp_path)
            texte = "\n".join(p.text for p in doc.paragraphs)
            filetype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        elif suffix == ".pdf":
            with pdfplumber.open(tmp_path) as pdf:
                # extract_text() can give None for a page without
                # extractable text; substitute "" to avoid a TypeError.
                texte = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
            filetype = "application/pdf"
        else:
            # Only .csv and .xlsx remain after the suffix check above.
            if suffix == ".csv":
                df = pd.read_csv(tmp_path)
            else:
                df = pd.read_excel(tmp_path)
            texte = df.to_string(index=False)
            filetype = "spreadsheet"
    except Exception as e:
        return {"erreur": f"Erreur pendant l'extraction : {e}"}
    finally:
        if remove_after:
            os.remove(tmp_path)

    return {
        "nom_fichier": os.path.basename(source_name),
        "type": filetype,
        "texte": texte,
    }
52
 
53
# Gradio interface: a single file upload as input, JSON as output.
demo = gr.Interface(
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
    fn=extract_text,
    inputs=gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)"),
    outputs="json",
)

# Launch the interface as soon as this module is executed.
demo.launch()