Files changed (1) hide show
  1. app.py +26 -39
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # import subprocess
2
 
3
  # # Install required libraries
@@ -16,7 +17,7 @@
16
  # import torch
17
  # import gradio as gr
18
  # from functools import lru_cache
19
- # from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
  # from huggingface_hub import login
21
  # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
 
@@ -61,11 +62,6 @@
61
  # text_to_image.safety_checker = None
62
  # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
 
64
- # # Load ChatGPT-like conversational model
65
- # chat_model_name = "microsoft/DialoGPT-medium"
66
- # chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
67
- # chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
-
69
  # # Preprocess audio file into NumPy array
70
  # def preprocess_audio(audio_path):
71
  # try:
@@ -97,16 +93,6 @@
97
  # except Exception as e:
98
  # return f"Error in image generation: {str(e)}"
99
 
100
- # # ChatGPT-like conversational response
101
- # def chat_with_gpt(prompt):
102
- # try:
103
- # inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
104
- # outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
105
- # response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
106
- # return response
107
- # except Exception as e:
108
- # return f"Error in chat response: {str(e)}"
109
-
110
  # # Combined processing function
111
  # def process_audio_and_generate_results(audio_path):
112
  # transcription_result = {"result": None}
@@ -150,33 +136,25 @@
150
  # description="Upload an audio file to transcribe speech into text.",
151
  # )
152
 
153
- # # Gradio interface for voice-to-image and chat
154
- # voice_to_image_and_chat_iface = gr.Interface(
155
  # fn=process_audio_and_generate_results,
156
  # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
157
  # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
158
- # title="Voice-to-Image and Chat",
159
- # description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
160
- # )
161
-
162
- # # Gradio interface for ChatGPT-like functionality
163
- # chat_iface = gr.Interface(
164
- # fn=chat_with_gpt,
165
- # inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
166
- # outputs=gr.Textbox(label="ChatGPT Response"),
167
- # title="ChatGPT",
168
- # description="Chat with GPT-like conversational AI.",
169
  # )
170
 
171
  # # Combined Gradio app
172
  # iface = gr.TabbedInterface(
173
- # interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
174
- # tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
175
  # )
176
 
177
  # # Launch Gradio interface
178
  # iface.launch(debug=True, share=True)
179
 
 
180
  import subprocess
181
 
182
  # Install required libraries
@@ -240,7 +218,7 @@ text_to_image.enable_attention_slicing()
240
  text_to_image.safety_checker = None
241
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
 
243
- # Preprocess audio file into NumPy array
244
  def preprocess_audio(audio_path):
245
  try:
246
  audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
@@ -248,25 +226,35 @@ def preprocess_audio(audio_path):
248
  except Exception as e:
249
  return f"Error in preprocessing audio: {str(e)}"
250
 
251
- # Speech-to-text function with long-form transcription support
252
  @lru_cache(maxsize=10)
253
  def transcribe_audio(audio_path):
254
  try:
255
  audio_array = preprocess_audio(audio_path)
256
  if isinstance(audio_array, str): # Error message from preprocessing
257
  return audio_array
258
- result = speech_to_text(audio_array)
259
- # Combine text from multiple segments for long-form transcription
260
- transcription = " ".join(segment["text"] for segment in result["chunks"])
 
 
 
 
 
 
 
 
 
 
261
  return transcription
262
  except Exception as e:
263
  return f"Error in transcription: {str(e)}"
264
 
265
- # Text-to-image function
266
  @lru_cache(maxsize=10)
267
  def generate_image_from_text(text):
268
  try:
269
- image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
270
  return image
271
  except Exception as e:
272
  return f"Error in image generation: {str(e)}"
@@ -337,4 +325,3 @@ iface.launch(debug=True, share=True)
337
 
338
 
339
 
340
-
 
1
+
2
  # import subprocess
3
 
4
  # # Install required libraries
 
17
  # import torch
18
  # import gradio as gr
19
  # from functools import lru_cache
20
+ # from transformers import pipeline
21
  # from huggingface_hub import login
22
  # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
23
 
 
62
  # text_to_image.safety_checker = None
63
  # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
64
 
 
 
 
 
 
65
  # # Preprocess audio file into NumPy array
66
  # def preprocess_audio(audio_path):
67
  # try:
 
93
  # except Exception as e:
94
  # return f"Error in image generation: {str(e)}"
95
 
 
 
 
 
 
 
 
 
 
 
96
  # # Combined processing function
97
  # def process_audio_and_generate_results(audio_path):
98
  # transcription_result = {"result": None}
 
136
  # description="Upload an audio file to transcribe speech into text.",
137
  # )
138
 
139
+ # # Gradio interface for voice-to-image
140
+ # voice_to_image_iface = gr.Interface(
141
  # fn=process_audio_and_generate_results,
142
  # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
143
  # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
144
+ # title="Voice-to-Image",
145
+ # description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
 
 
 
 
 
 
 
 
 
146
  # )
147
 
148
  # # Combined Gradio app
149
  # iface = gr.TabbedInterface(
150
+ # interface_list=[speech_to_text_iface, voice_to_image_iface],
151
+ # tab_names=["Speech-to-Text", "Voice-to-Image"]
152
  # )
153
 
154
  # # Launch Gradio interface
155
  # iface.launch(debug=True, share=True)
156
 
157
+
158
  import subprocess
159
 
160
  # Install required libraries
 
218
  text_to_image.safety_checker = None
219
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
220
 
221
+ # Preprocess audio file into NumPy array with chunking for long files
222
  def preprocess_audio(audio_path):
223
  try:
224
  audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
 
226
  except Exception as e:
227
  return f"Error in preprocessing audio: {str(e)}"
228
 
229
# Speech-to-text function with chunking support for long-form transcription
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe an audio file to text, splitting long audio into 30 s chunks.

    Args:
        audio_path: Filesystem path to the audio file. Results are memoized
            by path via ``lru_cache`` — NOTE(review): if the file at the same
            path changes, the stale cached transcription is returned.

    Returns:
        The combined transcription string, or an error-message string if
        preprocessing or transcription fails.
    """
    try:
        audio_array = preprocess_audio(audio_path)
        if isinstance(audio_array, str):  # Error message from preprocessing
            return audio_array

        chunk_size = 30 * 16000  # 30 seconds of 16 kHz samples per chunk
        pieces = []
        # Step directly over chunk start offsets; slicing past the end of the
        # array is safe, so no explicit ceil/num_chunks bookkeeping is needed.
        for start in range(0, len(audio_array), chunk_size):
            chunk = audio_array[start:start + chunk_size]
            result = speech_to_text(chunk)
            # assumes the ASR pipeline returns timestamped segments under
            # "chunks" (i.e. built with return_timestamps) — TODO confirm
            pieces.append(" ".join(segment["text"] for segment in result["chunks"]))

        # Join with a space: the previous `transcription += ...` dropped the
        # separator at every chunk boundary, gluing words together across
        # 30-second chunks (and built the string quadratically).
        return " ".join(pieces)
    except Exception as e:
        return f"Error in transcription: {str(e)}"
252
 
253
# Text-to-image function for HD image generation
@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Render a 1024x1024 image for *text* via the module-level diffusion
    pipeline ``text_to_image``.

    Returns the first image from the pipeline output, or an error-message
    string when generation raises. Up to 10 prompts are memoized.
    """
    try:
        generation = text_to_image(text, height=1024, width=1024)  # HD image resolution
        return generation.images[0]
    except Exception as e:
        return f"Error in image generation: {str(e)}"
 
325
 
326
 
327