File size: 2,044 Bytes
d58ab2d
 
 
 
 
 
 
1509b66
 
d58ab2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# %% [markdown]
# # IELTS Speaking Evaluation, The Application
# 
# IELTS is a popular English-proficiency exam. This app records a spoken
# response, transcribes it with Whisper, and asks Gemini to rate it against
# the IELTS speaking rubrics.

# %%
# setup package install
# import os
# os.system("pip install faster-whisper gradio hf_xet")

# %% [markdown]
# First, we import the relevant packages.

# %%
from faster_whisper import WhisperModel, BatchedInferencePipeline
import gradio as gr

# %% [markdown]
# Next, we initialize the model. The `hf_xet` package above comes into play, allowing faster downloads. Since IELTS is in English, we use the `.en` model which is optimized for English-only.

# %%
# NOTE: first run is slow — the model weights are downloaded and cached.
# On Google Colab (or any GPU runtime) switch device to "cuda".
_model_config = dict(
    device="cpu",
    compute_type="int8",  # int8 quantization keeps CPU inference fast and small
    cpu_threads=4,
    num_workers=2,
)
model = WhisperModel("small.en", **_model_config)
# The batched pipeline processes several audio chunks per forward pass.
batched_model = BatchedInferencePipeline(model=model)

# %% [markdown]
# Now we build a simple audio-recording interface with Gradio.

# %%
from typing import Any
from google import genai
from google.genai import types
import os


# SECURITY: never hard-code an API key in source — the previously committed
# key is compromised and must be revoked. Read the key from the environment.
_api_key = os.environ.get("GEMINI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Missing Gemini API key: set the GEMINI_API_KEY environment variable."
    )
client = genai.Client(api_key=_api_key)


def transcribe(audio: str) -> tuple[str, str | None]:
    """Transcribe a recorded answer and have Gemini rate it on IELTS rubrics.

    Args:
        audio: Path to the recorded audio file (Gradio ``type="filepath"``).

    Returns:
        ``(transcript, ai_feedback)`` — the Whisper transcript (one segment
        per line) and the model's rubric-based rating; ``ai_feedback`` may be
        ``None`` if the model returns no text.
    """
    # `segments` is a lazy generator; joining it drives the actual decoding.
    segments, _info = batched_model.transcribe(
        audio,
        language="en", beam_size=5, batch_size=12)
    transcript = "\n".join(segment.text for segment in segments)

    response = client.models.generate_content(
        model="gemini-1.5-flash-8b",
        contents=["Rate this speaking exercise under IELTS speaking rubrics", transcript],
    )

    return (transcript, response.text)
    

# Specify type="filepath" to return the path to the audio file
# The recorder hands transcribe() a temp-file path (type="filepath").
audio_input = gr.Audio(type="filepath")
output_text = gr.Textbox(label="Transcript")
ai_output_text = gr.Textbox(label="AI response")

# Wire the recorder and the two text panes into a single Interface.
iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input],
    outputs=[output_text, ai_output_text],
    # live=True
    title="IELTS Speaking App",
    description="IELTS speaking app with AI test",
)
iface.launch(debug=True)

# %%