# %% [markdown]
# # IELTS Speaking Evaluation, The Application
#
# IELTS is a popular test

# %%
# setup package install
# import os
# os.system("pip install faster-whisper gradio hf_xet")

# %% [markdown]
# First, we import the relevant packages.

# %%
from faster_whisper import WhisperModel, BatchedInferencePipeline
import gradio as gr

# %% [markdown]
# Next, we initialize the model. The `hf_xet` package above comes into play,
# allowing faster downloads. Since IELTS is in English, we use the `.en` model,
# which is optimized for English-only.

# %%
# this will take a while to setup the model...
# change to cuda on GoogleColab
model = WhisperModel(
    "small.en",
    device="cpu",
    compute_type="int8",
    cpu_threads=4,
    num_workers=2,
)
batched_model = BatchedInferencePipeline(model=model)

# %% [markdown]
# Now we need to write a simple audio recorder in gradio

# %%
import os
from typing import Any

from google import genai
from google.genai import types

# SECURITY: the API key used to be hard-coded here. Secrets must never be
# committed to source; the previously embedded key should be revoked. The key
# is now read from the environment instead.
_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this app."
    )
client = genai.Client(api_key=_api_key)


def transcribe(audio: str) -> tuple[str, str | None]:
    """Transcribe a recording and ask Gemini to rate it against IELTS rubrics.

    Args:
        audio: Path to the audio file handed over by the Gradio audio
            component (configured below with ``type="filepath"``). A falsy
            value (``None`` or ``""``) is treated as "nothing was recorded".

    Returns:
        A ``(transcript, ai_response)`` tuple. ``transcript`` is the text of
        every recognized segment, one segment per line. ``ai_response`` is
        the text of the Gemini reply, or a short explanatory message when
        there was no audio or no speech to rate.
    """
    # Guard against submitting the form without a recording; passing a
    # missing path on to the speech model would only raise.
    if not audio:
        return ("", "No audio was provided. Please record or upload a clip.")

    segments, _info = batched_model.transcribe(
        audio, language="en", beam_size=5, batch_size=12
    )
    # Consume the segments directly; no intermediate list is needed.
    transcript = "\n".join(segment.text for segment in segments)

    # With nothing transcribed there is nothing to rate, so skip the remote
    # call entirely.
    if not transcript.strip():
        return (transcript, "No speech was detected in the recording.")

    response = client.models.generate_content(
        model="gemini-1.5-flash-8b",
        contents=[
            "Rate this speaking exercise under IELTS speaking rubrics",
            transcript,
        ],
    )
    return (transcript, response.text)


# Specify type="filepath" to return the path to the audio file
audio_input = gr.Audio(type="filepath")
output_text = gr.Textbox(label="Transcript")
ai_output_text = gr.Textbox(label="AI response")

iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input],
    outputs=[output_text, ai_output_text],
    # live=True
    title="IELTS Speaking App",
    description="IELTS speaking app with AI test",
)

iface.launch(debug=True)

# %%