Spaces:

VDNT11
/

AIML_project

Build error

App Files Files Community

VDNT11 committed on Nov 21, 2024

Commit

1252e4e

verified ·

1 Parent(s): b5bdd0a

Create app.py

Browse files

Files changed (1) hide show

app.py +74 -0

app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import streamlit as st
+from pydub import AudioSegment
+import os
+import soundfile as sf
+import uuid
+# Set device and dtype
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load Whisper model from Hugging Face
+@st.cache_resource
+def load_model():
+    model_id = "openai/whisper-large-v2"
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+    )
+    model.to(device)
+    processor = AutoProcessor.from_pretrained(model_id)
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        torch_dtype=torch_dtype,
+        device=device,
+    )
+    return pipe, processor
+# Load model and processor
+pipe, processor = load_model()
+# Streamlit UI
+st.title("Hindi Audio to Text Transcription")
+uploaded_file = st.file_uploader(
+    "Upload a .wav audio file for transcription", type=["wav"]
+)
+if uploaded_file is not None:
+    st.info("Processing uploaded file...")
+    temp_filename = f"temp_audio_{uuid.uuid4()}.wav"
+    with open(temp_filename, "wb") as f:
+        f.write(uploaded_file.read())
+    # Preprocess the audio
+    sound = AudioSegment.from_file(temp_filename)
+    sound = sound.set_channels(1)  # Convert to mono
+    sound.export(temp_filename, format="wav")  # Save the processed file
+    audio, _ = sf.read(temp_filename)  # Read audio data
+    # Preprocess the audio for the model
+    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    # Perform transcription
+    with torch.no_grad():
+        outputs = pipe.model.generate(**inputs)
+        transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    # Display the transcription
+    st.success("Transcription complete!")
+    st.markdown(f"### Transcription:\n\n{transcription}")
+    os.remove(temp_filename)  # Clean up temporary file
+else:
+    st.warning("Please upload a .wav file to start transcription.")