EdmundYi committed on
Commit
b144d59
·
verified ·
1 Parent(s): 60fe297

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -15
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
3
  from diffusers import StableDiffusionPipeline
4
  import torch
5
- import librosa
6
  # Step 1: Prompt-to-Prompt Generation using BART (or any LLM except GPT or DeepSeek)
7
  prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")
8
 
@@ -24,25 +24,13 @@ def generate_image(prompt: str):
24
  processor = WhisperProcessor.from_pretrained("openai/whisper-large")
25
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
26
 
27
- import librosa # For handling audio files
28
-
29
  def transcribe_audio(audio):
30
  # Convert audio to text using Whisper
31
-
32
- # Check if audio is a numpy array (for recordings)
33
- if isinstance(audio, np.ndarray):
34
- audio_input = processor(audio, return_tensors="pt").input_features
35
- else:
36
- # If it's a file path (upload), use librosa to load the file
37
- audio_input, _ = librosa.load(audio, sr=16000) # Load audio with 16kHz sample rate
38
- audio_input = processor(audio_input, return_tensors="pt").input_features
39
-
40
- # Transcribe the audio
41
  predicted_ids = model.generate(audio_input)
42
  transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
43
  return transcription
44
 
45
-
46
  # Step 3: Gradio Interface with Multiple Controllers (Textbox, Slider, Checkbox, Audio)
47
  def process_input(description: str, creativity: float, include_background: bool):
48
  # Generate a detailed prompt
@@ -72,7 +60,6 @@ background_checkbox = gr.Checkbox(label="Include Background", value=True)
72
 
73
  audio_input = gr.Audio(type="numpy", label="Speak your Description")
74
 
75
-
76
  # Create interface with both text and audio inputs
77
  interface = gr.Interface(
78
  fn=process_input,
 
2
  from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
3
  from diffusers import StableDiffusionPipeline
4
  import torch
5
+
6
  # Step 1: Prompt-to-Prompt Generation using BART (or any LLM except GPT or DeepSeek)
7
  prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")
8
 
 
24
  processor = WhisperProcessor.from_pretrained("openai/whisper-large")
25
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
26
 
 
 
27
def transcribe_audio(audio):
    """Transcribe spoken audio to text with the Whisper model.

    Args:
        audio: Value delivered by ``gr.Audio(type="numpy")`` — a
            ``(sample_rate, samples)`` tuple where ``samples`` is a
            numpy array, or ``None`` when the user submitted nothing.

    Returns:
        The transcribed text, or an empty string when no audio was given.
    """
    # Gradio passes None when the audio widget is left empty — bail out
    # instead of crashing inside the processor.
    if audio is None:
        return ""

    # gr.Audio(type="numpy") yields a (sample_rate, samples) tuple; the
    # previous code fed the whole tuple to the processor, which is wrong.
    sample_rate, samples = audio

    # NOTE(review): Whisper expects 16 kHz input. Passing sampling_rate lets
    # the processor validate the rate — confirm resampling happens upstream
    # if recordings arrive at a different rate.
    audio_input = processor(
        samples, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    predicted_ids = model.generate(audio_input)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
33
 
 
34
  # Step 3: Gradio Interface with Multiple Controllers (Textbox, Slider, Checkbox, Audio)
35
  def process_input(description: str, creativity: float, include_background: bool):
36
  # Generate a detailed prompt
 
60
 
61
  audio_input = gr.Audio(type="numpy", label="Speak your Description")
62
 
 
63
  # Create interface with both text and audio inputs
64
  interface = gr.Interface(
65
  fn=process_input,