Spaces:

Sayiqa
/

voice_app

Sleeping

App Files Community

Sayiqa committed on Dec 18, 2024

Commit

bb4ad60

verified ·

1 Parent(s): 6cf8578

Update app.py

Browse files

Files changed (1) hide show

app.py +179 -199

app.py CHANGED Viewed

@@ -1,182 +1,3 @@
-import subprocess
-# Install required libraries
-subprocess.check_call(["pip", "install", "torch>=1.11.0"])
-subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
-subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
-subprocess.check_call(["pip", "install", "librosa"])
-subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
-subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
-subprocess.check_call(["pip", "install", "huggingface_hub"])
-import os
-import threading
-import numpy as np
-import librosa
-import torch
-import gradio as gr
-from functools import lru_cache
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import login
-from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
-# Ensure required dependencies are installed
-def install_missing_packages():
-    required_packages = {
-        "librosa": None,
-        "diffusers": ">=0.14.0",
-        "gradio": ">=3.35.2",
-        "huggingface_hub": None,
-        "accelerate": ">=0.20.1",
-        "transformers": ">=4.31.0"
-    }
-    for package, version in required_packages.items():
-        try:
-            __import__(package)
-        except ImportError:
-            package_name = f"{package}{version}" if version else package
-            subprocess.check_call(["pip", "install", package_name])
-install_missing_packages()
-# Get Hugging Face token for authentication
-hf_token = os.getenv("HF_TOKEN")
-if hf_token:
-    login(hf_token)
-else:
-    raise ValueError("HF_TOKEN environment variable not set.")
-# Load speech-to-text model (Whisper)
-speech_to_text = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-tiny",
-    return_timestamps=True
-)
-# Load Stable Diffusion model for text-to-image
-text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-device = "cuda" if torch.cuda.is_available() else "cpu"
-text_to_image.to(device)
-text_to_image.enable_attention_slicing()
-text_to_image.safety_checker = None
-text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
-# Load ChatGPT-like conversational model
-chat_model_name = "microsoft/DialoGPT-medium"
-chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
-chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
-# Preprocess audio file into NumPy array
-def preprocess_audio(audio_path):
-    try:
-        audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
-        return np.array(audio, dtype=np.float32)
-    except Exception as e:
-        return f"Error in preprocessing audio: {str(e)}"
-# Speech-to-text function with long-form transcription support
-@lru_cache(maxsize=10)
-def transcribe_audio(audio_path):
-    try:
-        audio_array = preprocess_audio(audio_path)
-        if isinstance(audio_array, str):  # Error message from preprocessing
-            return audio_array
-        result = speech_to_text(audio_array)
-        # Combine text from multiple segments for long-form transcription
-        transcription = " ".join(segment["text"] for segment in result["chunks"])
-        return transcription
-    except Exception as e:
-        return f"Error in transcription: {str(e)}"
-# Text-to-image function
-@lru_cache(maxsize=10)
-def generate_image_from_text(text):
-    try:
-        image = text_to_image(text, height=256, width=256).images[0]  # Generate smaller images for speed
-        return image
-    except Exception as e:
-        return f"Error in image generation: {str(e)}"
-# ChatGPT-like conversational response
-def chat_with_gpt(prompt):
-    try:
-        inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
-        outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
-        response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response
-    except Exception as e:
-        return f"Error in chat response: {str(e)}"
-# Combined processing function
-def process_audio_and_generate_results(audio_path):
-    transcription_result = {"result": None}
-    image_result = {"result": None}
-    # Function to run transcription and image generation in parallel
-    def transcription_thread():
-        transcription_result["result"] = transcribe_audio(audio_path)
-    def image_generation_thread():
-        transcription = transcription_result["result"]
-        if transcription and "Error" not in transcription:
-            image_result["result"] = generate_image_from_text(transcription)
-    # Start both tasks in parallel
-    t1 = threading.Thread(target=transcription_thread)
-    t2 = threading.Thread(target=image_generation_thread)
-    t1.start()
-    t2.start()
-    t1.join()  # Wait for transcription to finish
-    t2.join()  # Wait for image generation to finish
-    transcription = transcription_result["result"]
-    image = image_result["result"]
-    if "Error" in transcription:
-        return None, transcription
-    if isinstance(image, str) and "Error" in image:
-        return None, image
-    return image, transcription
-# Gradio interface for speech-to-text
-speech_to_text_iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
-    outputs=gr.Textbox(label="Transcription"),
-    title="Speech-to-Text Transcription",
-    description="Upload an audio file to transcribe speech into text.",
-)
-# Gradio interface for voice-to-image and chat
-voice_to_image_and_chat_iface = gr.Interface(
-    fn=process_audio_and_generate_results,
-    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
-    outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
-    title="Voice-to-Image and Chat",
-    description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
-)
-# Gradio interface for ChatGPT-like functionality
-chat_iface = gr.Interface(
-    fn=chat_with_gpt,
-    inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
-    outputs=gr.Textbox(label="ChatGPT Response"),
-    title="ChatGPT",
-    description="Chat with GPT-like conversational AI.",
-)
-# Combined Gradio app
-iface = gr.TabbedInterface(
-    interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
-    tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
-)
-# Launch Gradio interface
-iface.launch(debug=True, share=True)
 # import subprocess
 # # Install required libraries
@@ -195,7 +16,7 @@ iface.launch(debug=True, share=True)
 # import torch
 # import gradio as gr
 # from functools import lru_cache
-# from transformers import pipeline
 # from huggingface_hub import login
 # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
@@ -240,8 +61,10 @@ iface.launch(debug=True, share=True)
 # text_to_image.safety_checker = None
 # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
-# # Load question-answering model (DistilBERT for factual answers)
-# qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 # # Preprocess audio file into NumPy array
 # def preprocess_audio(audio_path):
@@ -274,14 +97,15 @@ iface.launch(debug=True, share=True)
 #     except Exception as e:
 #         return f"Error in image generation: {str(e)}"
-# # Question answering function
-# def answer_question(question):
 #     try:
-#         context = """Imran Khan is a Pakistani politician, former cricketer, and philanthropist. He is the 22nd Prime Minister of Pakistan, serving from 2018 to 2022. Khan is the founder of the political party Pakistan Tehreek-e-Insaf (PTI). He was one of the most successful cricketers of his time and led Pakistan to victory in the 1992 Cricket World Cup."""
-#         answer = qa_pipeline(question=question, context=context)
-#         return answer['answer']
 #     except Exception as e:
-#         return f"Error in answering question: {str(e)}"
 # # Combined processing function
 # def process_audio_and_generate_results(audio_path):
@@ -331,28 +155,184 @@ iface.launch(debug=True, share=True)
 #     fn=process_audio_and_generate_results,
 #     inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
 #     outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
-#     title="Voice-to-Image",
-#     description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
 # )
-# # Gradio interface for Question Answering
-# qa_iface = gr.Interface(
-#     fn=answer_question,
-#     inputs=gr.Textbox(label="Ask a question"),
-#     outputs=gr.Textbox(label="Answer"),
-#     title="Question Answering",
-#     description="Ask a factual question, and get an answer.",
 # )
 # # Combined Gradio app
 # iface = gr.TabbedInterface(
-#     interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, qa_iface],
-#     tab_names=["Speech-to-Text", "Voice-to-Image", "Question Answering"]
 # )
 # # Launch Gradio interface
 # iface.launch(debug=True, share=True)

 # import subprocess
 # # Install required libraries
 # import torch
 # import gradio as gr
 # from functools import lru_cache
+# from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 # from huggingface_hub import login
 # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 # text_to_image.safety_checker = None
 # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
+# # Load ChatGPT-like conversational model
+# chat_model_name = "microsoft/DialoGPT-medium"
+# chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
+# chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
 # # Preprocess audio file into NumPy array
 # def preprocess_audio(audio_path):
 #     except Exception as e:
 #         return f"Error in image generation: {str(e)}"
+# # ChatGPT-like conversational response
+# def chat_with_gpt(prompt):
 #     try:
+#         inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
+#         outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
+#         response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
+#         return response
 #     except Exception as e:
+#         return f"Error in chat response: {str(e)}"
 # # Combined processing function
 # def process_audio_and_generate_results(audio_path):
 #     fn=process_audio_and_generate_results,
 #     inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
 #     outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
+#     title="Voice-to-Image and Chat",
+#     description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
 # )
+# # Gradio interface for ChatGPT-like functionality
+# chat_iface = gr.Interface(
+#     fn=chat_with_gpt,
+#     inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
+#     outputs=gr.Textbox(label="ChatGPT Response"),
+#     title="ChatGPT",
+#     description="Chat with GPT-like conversational AI.",
 # )
 # # Combined Gradio app
 # iface = gr.TabbedInterface(
+#     interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
+#     tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
 # )
 # # Launch Gradio interface
 # iface.launch(debug=True, share=True)
+import subprocess
+# Install required libraries
+subprocess.check_call(["pip", "install", "torch>=1.11.0"])
+subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
+subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
+subprocess.check_call(["pip", "install", "librosa"])
+subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
+subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
+subprocess.check_call(["pip", "install", "huggingface_hub"])
+import os
+import threading
+import numpy as np
+import librosa
+import torch
+import gradio as gr
+from functools import lru_cache
+from transformers import pipeline
+from huggingface_hub import login
+from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
+# Ensure required dependencies are installed
+def install_missing_packages():
+    required_packages = {
+        "librosa": None,
+        "diffusers": ">=0.14.0",
+        "gradio": ">=3.35.2",
+        "huggingface_hub": None,
+        "accelerate": ">=0.20.1",
+        "transformers": ">=4.31.0"
+    }
+    for package, version in required_packages.items():
+        try:
+            __import__(package)
+        except ImportError:
+            package_name = f"{package}{version}" if version else package
+            subprocess.check_call(["pip", "install", package_name])
+install_missing_packages()
+# Get Hugging Face token for authentication
+hf_token = os.getenv("HF_TOKEN")
+if hf_token:
+    login(hf_token)
+else:
+    raise ValueError("HF_TOKEN environment variable not set.")
+# Load speech-to-text model (Whisper)
+speech_to_text = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-tiny",
+    return_timestamps=True
+)
+# Load Stable Diffusion model for text-to-image
+text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+text_to_image.to(device)
+text_to_image.enable_attention_slicing()
+text_to_image.safety_checker = None
+text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
+# Preprocess audio file into NumPy array
+def preprocess_audio(audio_path):
+    try:
+        audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
+        return np.array(audio, dtype=np.float32)
+    except Exception as e:
+        return f"Error in preprocessing audio: {str(e)}"
+# Speech-to-text function with long-form transcription support
+@lru_cache(maxsize=10)
+def transcribe_audio(audio_path):
+    try:
+        audio_array = preprocess_audio(audio_path)
+        if isinstance(audio_array, str):  # Error message from preprocessing
+            return audio_array
+        result = speech_to_text(audio_array)
+        # Combine text from multiple segments for long-form transcription
+        transcription = " ".join(segment["text"] for segment in result["chunks"])
+        return transcription
+    except Exception as e:
+        return f"Error in transcription: {str(e)}"
+# Text-to-image function
+@lru_cache(maxsize=10)
+def generate_image_from_text(text):
+    try:
+        image = text_to_image(text, height=256, width=256).images[0]  # Generate smaller images for speed
+        return image
+    except Exception as e:
+        return f"Error in image generation: {str(e)}"
+# Combined processing function
+def process_audio_and_generate_results(audio_path):
+    transcription_result = {"result": None}
+    image_result = {"result": None}
+    # Function to run transcription and image generation in parallel
+    def transcription_thread():
+        transcription_result["result"] = transcribe_audio(audio_path)
+    def image_generation_thread():
+        transcription = transcription_result["result"]
+        if transcription and "Error" not in transcription:
+            image_result["result"] = generate_image_from_text(transcription)
+    # Start both tasks in parallel
+    t1 = threading.Thread(target=transcription_thread)
+    t2 = threading.Thread(target=image_generation_thread)
+    t1.start()
+    t2.start()
+    t1.join()  # Wait for transcription to finish
+    t2.join()  # Wait for image generation to finish
+    transcription = transcription_result["result"]
+    image = image_result["result"]
+    if "Error" in transcription:
+        return None, transcription
+    if isinstance(image, str) and "Error" in image:
+        return None, image
+    return image, transcription
+# Gradio interface for speech-to-text
+speech_to_text_iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
+    outputs=gr.Textbox(label="Transcription"),
+    title="Speech-to-Text Transcription",
+    description="Upload an audio file to transcribe speech into text.",
+)
+# Gradio interface for voice-to-image
+voice_to_image_iface = gr.Interface(
+    fn=process_audio_and_generate_results,
+    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
+    outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
+    title="Voice-to-Image",
+    description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
+)
+# Combined Gradio app
+iface = gr.TabbedInterface(
+    interface_list=[speech_to_text_iface, voice_to_image_iface],
+    tab_names=["Speech-to-Text", "Voice-to-Image"]
+)
+# Launch Gradio interface
+iface.launch(debug=True, share=True)