# Source: Hugging Face Space by Be-win ("Update app.py", commit a895ea1 verified)
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from peft import PeftModel
import librosa
import os
# --- CONFIGURATION ---
# Replace with your actual model path on Hugging Face
# Format: "your_username/your_model_name"
ADAPTER_MODEL = "Be-win/whisper-medium-malayalam-agri"
BASE_MODEL = "openai/whisper-medium"

# Detect hardware (Free Tier = CPU, Paid = GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Loading model on {device}...")

# 1. Load the base Whisper model, placed directly on the detected device.
base_model = WhisperForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    device_map=device,
)

# 2. Attach the fine-tuned PEFT/LoRA adapters on top of the base model.
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
model.eval()  # inference mode: disables dropout, etc.

# 3. Load the processor. Prefer the adapter repo's processor (it may ship a
#    customized tokenizer/feature extractor); fall back to the base model's
#    if the adapter repo does not include one. `except Exception` instead of
#    a bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
try:
    processor = WhisperProcessor.from_pretrained(ADAPTER_MODEL)
except Exception:
    processor = WhisperProcessor.from_pretrained(BASE_MODEL)
def predict(audio_path):
    """Transcribe a Malayalam audio file and return English text.

    Args:
        audio_path: Filesystem path to the recorded audio clip (supplied by
            the gradio Audio component with type="filepath"); falsy when no
            audio was provided.

    Returns:
        The decoded English text, or an "Error: ..." string when the input
        is missing or contains no samples.
    """
    if not audio_path:
        return "Error: No audio provided"

    # Load and resample to the 16 kHz mono waveform Whisper expects.
    audio_array, _ = librosa.load(audio_path, sr=16000)
    if audio_array.size == 0:
        # Guard against zero-length recordings, which would otherwise fail
        # deep inside feature extraction with an opaque error.
        return "Error: Audio file contains no samples"

    # Convert the raw waveform to log-mel input features on the model device.
    inputs = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    # Inference only — no gradient tracking needed.
    with torch.no_grad():
        generated_ids = model.generate(
            input_features=inputs,
            # Force the Malayalam -> English "translate" decoding prompt.
            # NOTE(review): forced_decoder_ids is deprecated in recent
            # transformers releases in favor of generate(language=..., task=...);
            # kept as-is for compatibility with the pinned version.
            forced_decoder_ids=processor.get_decoder_prompt_ids(
                language="malayalam", task="translate"
            ),
            max_length=448,
        )

    # Batch size is 1, so take the first decoded string.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription
# Expose the predictor as a Gradio web API / demo UI.
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Agri-Whisper API",
    description="Malayalam to English Agricultural Translation",
)

# Enable request queuing (serializes inference on limited hardware), then serve.
iface.queue().launch()