Spaces:
No application file
No application file
| import os | |
| import cv2 | |
| import uuid | |
| import gradio as gr | |
| import numpy as np | |
| import webbrowser | |
| import webcamgpt | |
| from gtts import gTTS | |
| import speech_recognition as sr | |
| from pydub import AudioSegment | |
# Heading text rendered at the top of the Gradio page.
MARKDOWN = """
# Webcam with GPT
Visual analysis of live webcam footage
"""

# Client object whose simple_prompt() is called by respond() with a frame
# and a prompt.
# NOTE(review): "OpanAIConnector" is spelled as the webcamgpt package is
# presumed to export it -- confirm before "correcting" the name.
connector = webcamgpt.OpanAIConnector()

# Length, in seconds, of the most recently synthesized speech clip;
# overwritten by respond() on every call.
duration_in_seconds = 0
def save_image_to_drive(image: np.ndarray) -> str:
    """Write *image* to a uniquely named JPEG under ``data/``.

    The directory is created on demand and the file name is a random UUID,
    so successive calls never overwrite each other.

    Args:
        image: Pixel array handed straight to ``cv2.imwrite``.

    Returns:
        The relative path of the file that was written.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    image_directory = "data"
    os.makedirs(image_directory, exist_ok=True)
    image_path = os.path.join(image_directory, f"{uuid.uuid4()}.jpeg")

    # cv2.imwrite reports most failures through its boolean return value
    # rather than an exception; without this check the caller would receive
    # a path to a file that does not exist.
    if not cv2.imwrite(image_path, image):
        raise OSError(f"Failed to write image to {image_path}")
    return image_path
def speech_to_text() -> str:
    """Record one phrase from the default microphone and transcribe it.

    The recognizer is first calibrated against ambient noise, then waits up
    to 5 seconds for speech to begin before giving up.

    Returns:
        The transcript produced by ``recognize_google``, or a human-readable
        message describing why no transcript is available (nothing was said
        in time, the audio was unintelligible, or the recognition service
        request failed). Failures are reported as strings, not exceptions.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        print("Say something...")
        try:
            audio = recognizer.listen(source, timeout=5)
        except sr.WaitTimeoutError:
            # listen() raises when no phrase starts within `timeout`; report
            # it the same way as the other recognition failures below.
            return "No speech detected before the timeout"

    # The microphone is released before the (network-bound) recognition call.
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as e:
        return f"Error with the speech recognition service; {e}"
def respond(image: np.ndarray, prompt: str, chat_history):
    """Answer a spoken question about the current webcam frame.

    The frame is mirrored, converted from RGB to BGR and saved to disk; a
    prompt is captured from the microphone; both are sent to the connector;
    the reply is appended to the chat history, converted to speech with gTTS
    and opened for playback.

    Args:
        image: Current webcam frame as delivered by the Gradio image input.
        prompt: Text from the message box. It is currently not used: the
            spoken transcript from ``speech_to_text`` serves as the prompt.
        chat_history: List of chatbot entries; extended in place.

    Returns:
        A tuple ``("", chat_history)``: the empty string is the new value of
        the message box and the list is the updated chatbot content.

    Raises:
        gr.Error: If no webcam frame is available.
    """
    global duration_in_seconds

    if image is None:
        # Without a frame np.fliplr would fail with an opaque NumPy error;
        # surface a readable message in the UI instead.
        raise gr.Error("No webcam frame available yet")

    # Mirror the frame horizontally and switch to OpenCV's BGR channel order.
    image = np.fliplr(image)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image_path = save_image_to_drive(image)

    # The prompt comes from the microphone rather than the text box.
    speech_input = speech_to_text()

    chat_history.append(((image_path,), None))
    response = connector.simple_prompt(image=image, prompt=speech_input)
    # Record the utterance once, paired with its answer. Appending it a
    # second time with a None reply only duplicated the message in the chat.
    chat_history.append((speech_input, response))

    # Synthesize the reply and write it to a fixed file in the working
    # directory (overwritten on every call).
    speech = gTTS(response, lang='en', slow=False)
    speech_file = 'speech.mp3'
    speech.save(speech_file)

    # len() of a pydub AudioSegment is its duration in milliseconds.
    audio = AudioSegment.from_file(speech_file)
    duration_in_seconds = len(audio) / 1000
    print(f"Speech duration: {duration_in_seconds} seconds")

    # Hand the file to the system's default handler for playback.
    webbrowser.open(speech_file)
    return "", chat_history
# Page layout: heading on top, then a row holding the live webcam feed next
# to a column with the chat window, the message box and a clear button.
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        webcam = gr.Image(source="webcam", streaming=True)
        with gr.Column():
            chatbot = gr.Chatbot(height=500)
            message = gr.Textbox(autofocus=True)
            clear_button = gr.ClearButton([message, chatbot])

    # Submitting the message box runs respond() on the current frame, the
    # typed text and the chat history; its two return values replace the
    # message box content and the chatbot content respectively.
    message.submit(
        fn=respond,
        inputs=[webcam, message, chatbot],
        outputs=[message, chatbot],
    )

# Start the web server, surfacing handler errors in the UI and requesting a
# public share link.
demo.launch(debug=False, show_error=True, share=True)