Spaces:

KJ24
/

Transcript_file_docx_doc_etc

Sleeping

Update app.py

cda8c80 verified 9 months ago

1.78 kB

	import gradio as gr
	import os
	import pandas as pd
	from docx import Document
	import pdfplumber
	from tempfile import NamedTemporaryFile


	# File extensions this module knows how to turn into text.
	_SUPPORTED_SUFFIXES = (".docx", ".pdf", ".csv", ".xlsx")


	def _extract_from_path(path, suffix):
	    """Return ``(text, type_label)`` for the supported file stored at *path*.

	    *suffix* must be one of ``_SUPPORTED_SUFFIXES`` (lower-case, with dot).
	    Any parser error propagates to the caller.
	    """
	    if suffix == ".docx":
	        doc = Document(path)
	        return (
	            "\n".join(p.text for p in doc.paragraphs),
	            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	        )

	    if suffix == ".pdf":
	        with pdfplumber.open(path) as pdf:
	            # extract_text() can yield None for a page without extractable
	            # text (older pdfplumber releases); treat that as an empty page
	            # instead of failing the whole document with a TypeError.
	            pages = [(page.extract_text() or "") + "\n" for page in pdf.pages]
	        return "".join(pages), "application/pdf"

	    # The remaining supported suffixes are tabular: .csv or .xlsx.
	    df = pd.read_csv(path) if suffix == ".csv" else pd.read_excel(path)
	    return df.to_string(index=False), "spreadsheet"


	def extract_text(file):
	    """Extract the raw text of an uploaded .docx, .pdf, .csv or .xlsx file.

	    Args:
	        file: The upload. Either a file-like object exposing ``name`` and
	            ``read()``, or a filesystem path (``str`` / ``os.PathLike``).
	            ``None`` is accepted and reported as a missing upload.

	    Returns:
	        On success, a dict with the keys ``"nom_fichier"`` (base name of the
	        upload), ``"type"`` (a type label) and ``"texte"`` (the extracted
	        text). Otherwise a plain ``str`` message: no file given, unsupported
	        extension, or an error raised while parsing.

	    NOTE(review): the Gradio interface in this file declares a JSON output
	    while the failure paths return a bare string -- confirm the JSON
	    component renders such strings as intended.
	    """
	    if file is None:
	        return "Aucun fichier fourni."

	    # A path-like upload is its own path; a file object exposes it as .name.
	    if isinstance(file, (str, os.PathLike)):
	        original_name = os.fspath(file)
	    else:
	        original_name = file.name

	    suffix = os.path.splitext(original_name)[1].lower()
	    if suffix not in _SUPPORTED_SUFFIXES:
	        return "Type de fichier non supporté."

	    tmp_path = None
	    try:
	        if hasattr(file, "read"):
	            # Copy the stream to a temp file carrying the right suffix so
	            # the parsers below can open it by path.
	            data = file.read()
	            if isinstance(data, str):
	                # Stream opened in text mode: the temp file is binary.
	                data = data.encode("utf-8")
	            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
	                tmp_path = tmp.name
	                tmp.write(data)
	            source_path = tmp_path
	        else:
	            # Already a file on disk: parse it in place, never delete it.
	            source_path = original_name

	        try:
	            texte, filetype = _extract_from_path(source_path, suffix)
	        except Exception as e:
	            # UI boundary: report parser failures as a message, not a crash.
	            return f"Erreur pendant l'extraction : {str(e)}"
	    finally:
	        # Only the temp copy made above is removed, whatever happened.
	        if tmp_path is not None:
	            os.remove(tmp_path)

	    return {
	        "nom_fichier": os.path.basename(original_name),
	        "type": filetype,
	        "texte": texte,
	    }

	# Gradio UI: a single file-upload input wired to extract_text, with the
	# function's return value shown through the "json" output.
	demo = gr.Interface(
	    title="🧠 Extracteur de texte",
	    description="Envoie un fichier et récupère son contenu brut",
	    fn=extract_text,
	    inputs=gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)"),
	    outputs="json",
	)

	# Launch the app as soon as this module is executed.
	demo.launch()