File size: 2,246 Bytes
ca91e4b
 
 
 
 
 
 
 
 
 
f433513
 
ca91e4b
 
f433513
b5508b8
ca91e4b
 
 
 
 
 
 
 
f433513
ca91e4b
 
 
f433513
ca91e4b
f433513
ca91e4b
 
 
f433513
ca91e4b
 
 
 
 
 
 
 
b5508b8
 
ca91e4b
 
 
 
 
b5508b8
 
 
 
 
ca91e4b
 
 
f433513
ca91e4b
 
 
 
 
 
f433513
ca91e4b
 
 
 
 
 
 
f433513
ca91e4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f433513
ca91e4b
 
f433513
ca91e4b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile as wav

# -------------------------------
# 1. Load Models (Lightweight)
# -------------------------------

from transformers import pipeline

# Speech-to-text pipeline (Whisper "small" checkpoint), created once at import time.
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Tutor language model: a text-generation pipeline backed by DistilGPT-2.
# NOTE(review): this was previously labelled FLAN-T5, but the checkpoint loaded here is distilgpt2.
llm = pipeline("text-generation", model="distilgpt2")


# -------------------------------
# 2. Core Functions
# -------------------------------

def speech_to_text(audio):
    """
    Convert recorded or uploaded audio into text.

    Args:
        audio: A ``(sample_rate, data)`` tuple (the value this app's
            ``gr.Audio(type="numpy")`` input passes in), or ``None``
            when no audio was supplied.

    Returns:
        The ``"text"`` field of the speech-to-text pipeline result, or
        ``"No audio provided."`` when ``audio`` is ``None``.
    """
    if audio is None:
        return "No audio provided."

    sample_rate, data = audio

    # Use a throwaway directory rather than NamedTemporaryFile(delete=False):
    # the WAV file is removed as soon as transcription finishes (even if the
    # pipeline raises), and it is never written while another handle to the
    # same path is still open.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "input.wav")
        wav.write(wav_path, sample_rate, data)
        result = stt(wav_path)

    return result["text"]


def generate_response(text):
    """
    Generate a tutor-style answer for a transcribed question.

    Args:
        text: The question to answer, normally the transcription returned
            by ``speech_to_text``. ``None``, empty or whitespace-only
            values, and the ``"No audio provided."`` placeholder are
            treated as invalid input.

    Returns:
        The model's continuation of the tutor prompt with surrounding
        whitespace removed, or ``"Please provide valid input."`` when
        there is nothing to answer.
    """
    question = str(text).strip() if text else ""
    if not question or question == "No audio provided.":
        return "Please provide valid input."

    prompt = f"""
    You are an AI tutor.
    Explain clearly and simply.

    Question: {question}
    Answer:
    """

    output = llm(
        prompt,
        # Budget only the generated tokens; max_length would also count the
        # prompt, so a long question could leave no room for an answer.
        max_new_tokens=150,
        num_return_sequences=1,
        # Return just the continuation instead of splitting on "Answer:",
        # which breaks when the question or the answer contains that marker.
        return_full_text=False,
    )

    return output[0]["generated_text"].strip()


# -------------------------------
# 3. Main Pipeline
# -------------------------------

def voice_tutor(audio):
    """
    Run the full pipeline: transcribe the audio, then answer it.

    Args:
        audio: The audio value forwarded unchanged to ``speech_to_text``.

    Returns:
        A ``(transcription, response)`` tuple, in the order the UI's
        output boxes expect.
    """
    heard = speech_to_text(audio)
    return heard, generate_response(heard)


# -------------------------------
# 4. Gradio UI
# -------------------------------

# Build the interface; `demo` is the Blocks app launched at the bottom of the file.
with gr.Blocks() as demo:
    gr.Markdown("## 🎓 AI Voice Tutor (No TTS Version)")

    # Audio can come from the microphone or an uploaded file. type="numpy" is
    # what speech_to_text relies on when it unpacks (sample_rate, data).
    audio_input = gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Speak or Upload Audio"
    )

    # Output fields, filled from voice_tutor's (transcription, response) tuple.
    transcription_box = gr.Textbox(label="Transcription")
    response_box = gr.Textbox(label="Tutor Response")

    submit_btn = gr.Button("Generate Response")

    # Clicking the button runs voice_tutor on the audio input; the order of
    # `outputs` must match the order of the values voice_tutor returns.
    submit_btn.click(
        fn=voice_tutor,
        inputs=audio_input,
        outputs=[transcription_box, response_box]
    )


# -------------------------------
# 5. Launch
# -------------------------------

# Launch the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch()