download
raw
6.57 kB
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import whisper
from deep_translator import GoogleTranslator

try:
    from moviepy.editor import VideoFileClip
    _HAS_MOVIEPY = True
except Exception:
    VideoFileClip = None
    _HAS_MOVIEPY = False
def extract_audio(video_path: str, out_audio_path: str) -> None:
    """Extract the audio track of ``video_path`` into ``out_audio_path``.

    Uses ``moviepy`` when it was importable; otherwise falls back to the
    ``ffmpeg`` CLI, which writes 16 kHz mono 16-bit PCM. The fallback keeps the
    app usable when ``moviepy`` is not installed.

    Args:
        video_path: Path of the input video file.
        out_audio_path: Path of the audio file to create (overwritten).

    Raises:
        RuntimeError: If the video has no audio track (moviepy path), or if
            ``ffmpeg`` cannot be started or exits with an error.
    """
    if _HAS_MOVIEPY and VideoFileClip is not None:
        clip = VideoFileClip(video_path)
        try:
            if clip.audio is None:
                # Without this check the failure is an opaque AttributeError on None.
                raise RuntimeError("La vidéo ne contient aucune piste audio.")
            clip.audio.write_audiofile(out_audio_path, logger=None)
        finally:
            # Always release the reader, even when writing fails.
            clip.close()
        return

    # Fallback: use ffmpeg CLI (most Spaces images include ffmpeg)
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-y", "-i", str(video_path),
        "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        str(out_audio_path),
    ]
    try:
        # stderr is captured (not discarded) so a failure can be explained.
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or b"").decode("utf-8", errors="replace").strip()[-500:]
        raise RuntimeError(
            f"Impossible d'extraire l'audio — ffmpeg a échoué (code {e.returncode}): {detail or e}"
        ) from e
    except (OSError, ValueError) as e:
        # Typically FileNotFoundError: the ffmpeg executable is not on PATH.
        raise RuntimeError(
            f"Impossible d'extraire l'audio — installez 'moviepy' ou vérifiez que 'ffmpeg' est disponible: {e}"
        ) from e
def transcribe_audio(audio_path: str, model_size: str, device: str = "cpu") -> dict:
    """Transcribe ``audio_path`` with a Whisper model.

    Loads the Whisper model named ``model_size`` on ``device`` and returns the
    value produced by its ``transcribe`` call for the given audio file.
    """
    whisper_model = whisper.load_model(model_size, device=device)
    return whisper_model.transcribe(audio_path)
# GoogleTranslator refuses inputs of 5000 characters or more; stay below that.
_TRANSLATE_CHUNK_CHARS = 4500


def _split_for_translation(text: str, limit: int = _TRANSLATE_CHUNK_CHARS) -> list:
    """Split ``text`` into pieces of at most ``limit`` characters.

    Cuts at the last sentence end inside the window when there is one, else at
    the last space, else performs a hard cut.
    """
    pieces = []
    remaining = text.strip()
    while len(remaining) > limit:
        window = remaining[:limit]
        cut = max(
            window.rfind(". "),
            window.rfind("! "),
            window.rfind("? "),
            window.rfind("\n"),
        )
        if cut <= 0:
            cut = window.rfind(" ")
        if cut <= 0:
            cut = limit - 1
        pieces.append(remaining[:cut + 1].strip())
        remaining = remaining[cut + 1:].strip()
    if remaining:
        pieces.append(remaining)
    return pieces


def translate_text(text: str, target_lang: str = "fr") -> str:
    """Translate ``text`` into ``target_lang`` with auto-detected source language.

    Long inputs are translated chunk by chunk, because the translator rejects
    over-long requests, and the translated chunks are joined with single spaces.
    Empty or whitespace-only input returns ``""`` without calling the service.

    Args:
        text: Text to translate.
        target_lang: Target language code (defaults to French).

    Returns:
        The translated text.
    """
    if not text or not text.strip():
        return ""
    translator = GoogleTranslator(source="auto", target=target_lang)
    translated = [
        translator.translate(piece) or ""
        for piece in _split_for_translation(text, _TRANSLATE_CHUNK_CHARS)
    ]
    return " ".join(part for part in translated if part)
def process_video(file, model_size, translate_to_fr, show_lang, device_choice):
    """Run the upload → audio → transcription → (translation) pipeline.

    This is a generator. While working it yields plain status strings; an error
    is also reported as a status string, after which the generator stops. On
    success the last item is a dict with the keys ``status``, ``language``,
    ``transcription``, ``translation``, ``download_txt`` and ``download_fr``.

    Args:
        file: Uploaded video: a filesystem path, or an object whose ``name``
            attribute is such a path. If that path is not a file on disk, the
            object's ``read()`` is used instead when available.
        model_size: Whisper model name forwarded to ``transcribe_audio``.
        translate_to_fr: When truthy, also translate the transcript to French.
        show_lang: When truthy, report the ``language`` entry of the result.
        device_choice: ``"gpu"`` selects ``"cuda"``; anything else ``"cpu"``.
    """
    # streamed updates for Gradio
    yield "Étape 1/5 — enregistrement du fichier..."
    tmpdir = Path(tempfile.mkdtemp(prefix="v2t_"))
    source_path = Path(file if isinstance(file, (str, os.PathLike)) else file.name)
    # Use only the base name: joining a directory with an absolute path yields
    # that absolute path, which would make us overwrite the upload itself.
    video_path = tmpdir / source_path.name
    try:
        if source_path.is_file():
            shutil.copyfile(source_path, video_path)
        elif hasattr(file, "read"):
            video_path.write_bytes(file.read())
        else:
            raise FileNotFoundError(f"fichier introuvable: {source_path}")
    except (OSError, TypeError) as e:
        shutil.rmtree(tmpdir, ignore_errors=True)
        yield f"Erreur lors de l'enregistrement du fichier: {e}"
        return

    yield "Étape 2/5 — extraction de l'audio (moviepy)..."
    audio_path = tmpdir / "audio.wav"
    if audio_path == video_path:
        # An upload literally named "audio.wav" must not be both input and output.
        audio_path = tmpdir / "audio_extracted.wav"
    try:
        extract_audio(str(video_path), str(audio_path))
    except Exception as e:
        shutil.rmtree(tmpdir, ignore_errors=True)
        yield f"Erreur lors de l'extraction audio: {e}"
        return

    yield "Étape 3/5 — chargement du modèle Whisper..."
    device = "cuda" if device_choice == "gpu" else "cpu"
    yield f"Étape 4/5 — transcription (modèle {model_size})..."
    try:
        result = transcribe_audio(str(audio_path), model_size, device=device)
    except Exception as e:
        shutil.rmtree(tmpdir, ignore_errors=True)
        yield f"Erreur lors de la transcription: {e}"
        return

    # The copied video and the extracted audio are no longer needed; only the
    # text files written below have to remain available for download.
    for leftover in (video_path, audio_path):
        try:
            leftover.unlink()
        except OSError:
            pass

    original_text = result.get("text", "")
    lang = result.get("language") if show_lang else None

    french_text = None
    if translate_to_fr:
        yield "Étape 5/5 — traduction en français..."
        try:
            french_text = translate_text(original_text, "fr")
        except Exception as e:
            shutil.rmtree(tmpdir, ignore_errors=True)
            yield f"Erreur lors de la traduction: {e}"
            return

    # save outputs
    txt_path = tmpdir / "transcription.txt"
    txt_path.write_text(original_text, encoding="utf-8")
    fr_path = None
    if french_text is not None:
        fr_path = tmpdir / "transcription_fr.txt"
        fr_path.write_text(french_text, encoding="utf-8")

    # results
    yield {
        "status": "Terminé — transcription prête",
        "language": lang or "(non affiché)",
        "transcription": original_text,
        # Only show the "disabled" marker when translation was really off.
        "translation": french_text if french_text is not None else "(désactivée)",
        "download_txt": str(txt_path),
        "download_fr": str(fr_path) if fr_path is not None else None,
    }
# Gradio interface: inputs in the left column, streamed results in the right one.
with gr.Blocks(title="Vidéo → Texte (Whisper) — Gradio UI") as demo:
    gr.Markdown("""
# 🎬 Vidéo → Transcription (Whisper)
- Téléverse une vidéo (mp4, mkv...).
- Choisis le `model` Whisper (tiny→large).
- Option: traduire le texte en français.
*Remarque : le modèle `small` est un bon compromis vitesse/qualité.*
""")
    with gr.Row():
        with gr.Column(scale=2):
            video_in = gr.File(label="Téléverser une vidéo")
            model_choice = gr.Dropdown(
                choices=["tiny", "base", "small", "medium", "large"],
                value="small",
                label="Modèle Whisper",
            )
            translate_chk = gr.Checkbox(label="Traduire en français (après transcription)", value=True)
            show_lang = gr.Checkbox(label="Afficher la langue détectée", value=True)
            device_radio = gr.Radio(choices=["cpu", "gpu"], value="cpu", label="Exécution (device)")
            run_button = gr.Button("Transcrire")
        with gr.Column():
            status_txt = gr.Textbox(label="Statut / Progrès", value="Prêt", lines=2)
            lang_out = gr.Textbox(label="Langue détectée", value="", lines=1)
            trans_out = gr.Textbox(label="Transcription (texte)", value="", lines=12)
            trans_down = gr.File(file_count="single", label="Télécharger la transcription (.txt)")
            fr_out = gr.Textbox(label="Traduction FR (si demandée)", value="", lines=12)
            fr_down = gr.File(file_count="single", label="Télécharger la traduction (.txt)")

    def runner(file, model_choice, translate_chk, show_lang, device_radio):
        """Stream ``process_video`` updates to the six output components.

        Each yielded tuple is ordered as (status, language, transcription,
        transcription file, translation, translation file), matching the
        ``outputs`` list of the click handler below.
        """
        if file is None or getattr(file, "name", None) is None:
            # This function is a generator, so the message has to be yielded:
            # a value passed to ``return`` would never reach the interface.
            yield "Aucun fichier fourni.", "", "", None, "", None
            return
        for update in process_video(file, model_choice, translate_chk, show_lang, device_radio):
            # process_video yields either status strings or final dict
            if isinstance(update, str):
                # Progress or error text: only the status box changes, the
                # two download slots are cleared.
                yield update, gr.update(), gr.update(), None, gr.update(), None
            elif isinstance(update, dict):
                yield (
                    update.get("status", "Terminé"),
                    update.get("language", ""),
                    update.get("transcription", ""),
                    gr.update(value=update.get("download_txt")),
                    update.get("translation", ""),
                    gr.update(value=update.get("download_fr")),
                )

    run_button.click(
        runner,
        inputs=[video_in, model_choice, translate_chk, show_lang, device_radio],
        outputs=[status_txt, lang_out, trans_out, trans_down, fr_out, fr_down],
    )
    gr.Markdown("---\n**Notes :** utilise `small` si tu es sur CPU. Pour des vidéos très longues, coupe-les en segments pour fiabiliser la mémoire.")
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", share=False)

Xet Storage Details

Size:
6.57 kB
·
Xet hash:
6f01b6ae6c5f45f9fe7d44b3e3855dfadd5bf9e19ad494143eada33b1ffd0cd7

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.