Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import torch | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| from huggingface_hub import inference_api | |
| import speech_recognition as sr | |
| import pyttsx3 | |
| import diffusers | |
import os

# Set up speech recognition and synthesis
r = sr.Recognizer()
engine = pyttsx3.init()

# Set up the Hugging Face Hub model and tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the Serverless Inference API
# The token is read from the environment (e.g. a Space secret named HF_TOKEN)
# instead of an undefined global; None falls back to anonymous access.
inference_api_token = os.environ.get("HF_TOKEN")
# InferenceApi is bound to one repository, so the repo id is required here.
# NOTE: this rebinds the imported `inference_api` module name to the client
# instance, which is the name the rest of the script uses.
inference_api = inference_api.InferenceApi(
    repo_id=model_name, token=inference_api_token
)

# Set up the Diffusers library
diffusers_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision is only used on GPU; CPU inference stays in float32.
_diffusers_dtype = (
    torch.float16 if diffusers_device.type == "cuda" else torch.float32
)
# SDXL-Lightning is applied on top of the SDXL base pipeline: load the base,
# then fuse the 4-step Lightning LoRA into it.
# NOTE(review): repo layout and file name follow the SDXL-Lightning model
# card; load_lora_weights may need the `peft` package installed - confirm.
diffusers_model = diffusers.StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=_diffusers_dtype,
)
diffusers_model.load_lora_weights(
    "ByteDance/SDXL-Lightning",
    weight_name="sdxl_lightning_4step_lora.safetensors",
)
diffusers_model.fuse_lora()
# The model card prescribes "trailing" timestep spacing for the sampler.
diffusers_model.scheduler = diffusers.EulerDiscreteScheduler.from_config(
    diffusers_model.scheduler.config, timestep_spacing="trailing"
)
diffusers_model = diffusers_model.to(diffusers_device)
def recognize_speech():
    """Record one utterance from the default microphone and transcribe it.

    Returns:
        The transcript as a string, or None when the audio could not be
        understood or the recognition service could not be reached.
    """
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)

    # The microphone is released before the network round-trip below.
    try:
        return r.recognize_google(audio, language="en-US")
    except sr.UnknownValueError:
        print("Sorry, I didn't catch that. Try again!")
    except sr.RequestError as err:
        # Network/API failure: report it instead of crashing the app.
        print(f"Speech recognition service unavailable: {err}")
    return None
def respond_to_text(text):
    """Classify ``text`` with the module-level model and return its label.

    The text is tokenized with the module-level ``tokenizer``, run through
    the locally loaded sequence-classification ``model``, and the
    highest-scoring class index is mapped to its name through
    ``model.config.id2label``.

    Args:
        text: The user's message.

    Returns:
        The predicted class label as a string.
    """
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,  # max_length alone does not truncate long inputs
        max_length=512,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted = int(torch.argmax(logits, dim=1)[0])
    # The prediction is a class index, not a token id, so it is looked up
    # in the label map rather than decoded by the tokenizer.
    return model.config.id2label[predicted]
def generate_image(prompt, num_inference_steps=4, guidance_scale=0.0):
    """Generate one image for ``prompt`` with the module-level pipeline.

    Args:
        prompt: Text description of the image.
        num_inference_steps: Number of denoising steps.
            NOTE(review): SDXL-Lightning checkpoints are distilled for a
            fixed small step count; this must match the checkpoint that the
            module setup loads - confirm.
        guidance_scale: Classifier-free guidance weight; the SDXL-Lightning
            model card prescribes 0.

    Returns:
        The first generated image from the pipeline output (``.images[0]``),
        which can be passed straight to ``st.image``.
    """
    # The pipeline call takes no ``device`` argument; placement is decided
    # where the pipeline is created.
    result = diffusers_model(
        prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    )
    return result.images[0]
def speak_text(text):
    """Queue ``text`` on the module-level speech engine and run it.

    ``runAndWait`` is called right after ``say``, so the function returns
    only once that call completes.
    """
    synthesizer = engine
    synthesizer.say(text)
    synthesizer.runAndWait()
# ---- Streamlit page: text chat, image generation, and voice input ----
st.title("Chat with LLM and Generate Images")

# Typed chat: respond whenever the box holds text.
chat_input = st.text_input("Type or speak something:")
if chat_input:
    response = respond_to_text(chat_input)
    st.write("LLM Response:", response)
    speak_text(response)

# The prompt box is created on every run, outside the button branch.
# A button is only True for the single rerun after the click, so a widget
# created inside that branch starts empty and vanishes once the user types.
prompt = st.text_input("Enter a prompt for the image:")
generate_image_button = st.button("Generate Image")
if generate_image_button:
    if prompt:
        image = generate_image(prompt)
        st.image(image, use_column_width=True)
    else:
        st.warning("Enter a prompt for the image first.")

# Voice chat: record one utterance, then answer it like typed input.
mic_button = st.button("Speak")
if mic_button:
    text = recognize_speech()
    if text:
        response = respond_to_text(text)
        st.write("LLM Response:", response)
        speak_text(response)