# voxtral / app.py
# paulalbrecht's picture
# Create app.py
# a6b9939 verified
import gradio as gr
import os
import json
from datetime import datetime
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import soundfile as sf
# Checkpoint identifier handed to both from_pretrained() calls below; it is
# also echoed in the "model" field of every transcription result.
MODEL_NAME = "mistralai/Voxtral-Small-24B-2507"
# Processor and model are created once at import time and reused by every
# call to transcribe().
# NOTE(review): no dtype/device arguments are passed, so library defaults
# apply — confirm this fits the memory of the host this runs on.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
def _load_metadata(metadata_file):
    """Parse the optional metadata upload into a Python object.

    Args:
        metadata_file: ``None``, a filesystem path (``str`` or
            ``os.PathLike``), an object exposing the path as a string
            ``.name`` attribute, or an already-open file-like object.

    Returns:
        The decoded JSON content, or ``None`` when no file was supplied.

    Raises:
        gr.Error: If the file content is not valid UTF-8 encoded JSON.
    """
    if metadata_file is None:
        return None

    if isinstance(metadata_file, (str, os.PathLike)):
        path = metadata_file
    else:
        # Prefer reopening by name over reading the handle directly: an
        # upload wrapper's read position is not guaranteed to be at the start.
        name = getattr(metadata_file, "name", None)
        path = name if isinstance(name, str) else None

    try:
        if path is None:
            # No usable path: fall back to treating it as a readable stream.
            return json.load(metadata_file)
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        raise gr.Error(f"Metadata-Datei ist kein gültiges JSON: {err}") from err


def transcribe(audio_file, metadata_file):
    """Transcribe an audio file and bundle the text with optional metadata.

    Args:
        audio_file: Path of the audio file to read with ``soundfile``.
        metadata_file: Optional JSON metadata upload; see ``_load_metadata``
            for the accepted forms.

    Returns:
        A pretty-printed JSON string with the keys ``transcription``,
        ``model``, ``timestamp`` (local time, ISO 8601) and ``metadata``
        (``None`` when no metadata file was given).

    Raises:
        gr.Error: If no audio file was supplied or the metadata is not
            valid JSON.
    """
    if audio_file is None:
        raise gr.Error("Bitte eine Audiodatei hochladen.")

    # Load metadata first so a malformed file fails before the expensive
    # model run rather than after it.
    metadata = _load_metadata(metadata_file)

    # Load audio
    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        # Multi-channel data comes back as (frames, channels); average the
        # channels into a single mono signal for the processor.
        audio_input = audio_input.mean(axis=1)

    # NOTE(review): the file's native sample rate is passed through unchanged;
    # presumably the processor expects one fixed rate — confirm, and resample
    # beforehand if so.
    # NOTE(review): confirm that this processor call signature and the Auto
    # model class match what the Voxtral checkpoint expects.
    inputs = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(**inputs)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Result
    result = {
        "transcription": transcription,
        "model": MODEL_NAME,
        "timestamp": datetime.now().isoformat(),
        "metadata": metadata,
    }
    return json.dumps(result, ensure_ascii=False, indent=2)
# Gradio UI definition: an audio upload and a metadata file upload are passed,
# in that order, to transcribe(); its JSON string result is shown in a textbox.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        # type="filepath": the audio is handed to transcribe() as a path.
        gr.Audio(type="filepath", label="Audio/Video-Datei (mp3, wav, mp4, etc.)"),
        gr.File(label="Passende Metadata JSON-Datei")
    ],
    outputs=gr.Textbox(label="Transkriptions-JSON"),
    title="Voxtral Transkription mit Metadaten"
)
# Launch the interface only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()