| import gradio as gr
|
| import torch
|
| from transformers import pipeline
|
| import time
|
| import logging
|
|
|
|
|
# Log at INFO so the retry/failure messages emitted by the helpers below are visible.
logging.basicConfig(level=logging.INFO)


# Speech-to-text: Whisper small, chunked (30s) so long recordings are processed piecewise.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)

# Text generation: small causal LM; sampling parameters fixed at pipeline creation.
text_pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-360M", max_length=512, temperature=0.7, top_p=0.9)

# Text-to-speech: Swahili female-voice model.
tts_pipe = pipeline("text-to-speech", model="mussacharles60/swahili-tts-female-voice")


# Maximum characters accepted from one utterance.
# NOTE(review): currently unused anywhere in this file — confirm whether
# input truncation was intended and wire it in, or remove the constant.
MAX_INPUT_SIZE = 100

# Keywords that trigger the attribute-elicitation prompt in handle_conversation.
PREDEFINED_ATTRIBUTES = ["name", "age", "location"]

# Rolling window of recent prompts used as LM context; mutated by generate_text.
CONTEXT_HISTORY = []
|
|
|
|
|
def recognize_speech(audio):
    """Transcribe *audio* with the Whisper ASR pipeline.

    Retries up to 3 times, sleeping 1s between attempts.

    Args:
        audio: Audio input accepted by the ASR pipeline (file path or
            (sample_rate, waveform) tuple from gr.Audio).

    Returns:
        The transcript text, or "" if every attempt fails.
    """
    retries = 3
    for attempt in range(retries):
        try:
            result = asr_pipe(audio, return_timestamps=True)
            return result['text']
        except Exception as e:
            # Broad catch is deliberate: this is a best-effort boundary and
            # the caller handles the empty-string fallback.
            logging.error(f"ASR failed: {e}")
            # Back off only when another attempt remains; the original also
            # slept after the final failure, which just delayed the response.
            if attempt < retries - 1:
                time.sleep(1)
    return ""
|
|
|
|
|
def generate_text(prompt):
    """Generate a reply to *prompt*, conditioning on recent conversation.

    Keeps a rolling window of the last 5 prompts in the module-level
    CONTEXT_HISTORY (mutated here) and feeds the space-joined window to
    the language model.

    Args:
        prompt: The user's latest utterance (plain text).

    Returns:
        Only the newly generated continuation. The HF text-generation
        pipeline's 'generated_text' field echoes the input prompt verbatim,
        so the original code returned the whole history plus the reply —
        which downstream TTS then spoke back to the user.
    """
    global CONTEXT_HISTORY
    CONTEXT_HISTORY.append(prompt)
    if len(CONTEXT_HISTORY) > 5:
        CONTEXT_HISTORY.pop(0)
    context = " ".join(CONTEXT_HISTORY)
    outputs = text_pipe(context, max_length=512, num_return_sequences=1)
    generated_text = outputs[0]['generated_text']
    # Strip the echoed prompt so callers get just the model's reply.
    if generated_text.startswith(context):
        generated_text = generated_text[len(context):].lstrip()
    return generated_text
|
|
|
|
|
def synthesize_speech(text):
    """Convert *text* to speech with the Swahili TTS pipeline.

    The transformers text-to-speech pipeline call does not accept
    `output_format`/`sample_rate` keyword arguments — the original code
    forwarded them to the model's generate call, which can error or be
    silently ignored — so they are dropped here. The pipeline returns a
    dict with "audio" (waveform array) and "sampling_rate".

    Args:
        text: Text to synthesize.

    Returns:
        A (sampling_rate, waveform) tuple, the format gr.Audio renders.
    """
    result = tts_pipe(text)
    return result["sampling_rate"], result["audio"]
|
|
|
|
|
def handle_conversation(audio):
    """Full round trip for one turn: ASR -> text generation -> TTS.

    Args:
        audio: Audio input from the gr.Audio component.

    Returns:
        (synthesized_audio, generated_text) for the two Gradio outputs.
        Returns (None, "") when speech recognition yields nothing, so an
        empty prompt is never fed to the language model.
    """
    recognized_text = recognize_speech(audio)
    if not recognized_text.strip():
        # ASR failed (recognize_speech exhausted its retries) or the
        # recording was silent — skip generation and TTS entirely.
        return None, ""
    # If the utterance mentions a known attribute keyword, steer the model
    # toward eliciting that attribute; otherwise respond directly.
    if any(attr in recognized_text.lower() for attr in PREDEFINED_ATTRIBUTES):
        generated_text = generate_text(f"Please provide your {recognized_text}")
    else:
        generated_text = generate_text(recognized_text)
    synthesized_audio = synthesize_speech(generated_text)
    return synthesized_audio, generated_text
|
|
|
|
|
# Build the UI inside the Blocks context: components created — and events
# wired — outside `with demo:` are not attached to the app, and calling
# .click() outside a gradio.Blocks context raises an error.
demo = gr.Blocks()

with demo:
    input_audio = gr.Audio(label="Input Audio")
    output_audio = gr.Audio(label="Output Audio")
    output_text = gr.Textbox(label="Output Text")

    conversation_button = gr.Button("Start Conversation")

    # One click runs the whole ASR -> LM -> TTS pipeline.
    conversation_button.click(handle_conversation, inputs=input_audio, outputs=[output_audio, output_text])

demo.launch()
|
|
|