szili2011 committed on
Commit
809a47f
·
verified ·
1 Parent(s): 1798665

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -13
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import tensorflow as tf
3
  import numpy as np
4
  import nltk
5
- import soundfile as sf
6
  from nltk.corpus import cmudict
7
  from scipy.io.wavfile import write
8
 
@@ -17,8 +16,6 @@ model = tf.keras.models.load_model('audio_model.h5')
17
  def preprocess_text(text):
18
  """
19
  Process the input text to prepare it for the model.
20
- This could include tokenization, phoneme extraction, etc.
21
- The model expects input of shape (batch_size, sequence_length, 13).
22
  """
23
  d = cmudict.dict()
24
  words = text.lower().split()
@@ -28,19 +25,19 @@ def preprocess_text(text):
28
  if word in d:
29
  phonemes.append(d[word][0])
30
  else:
31
- # If word not found in cmudict, use a placeholder or skip
32
  phonemes.append(['UNKNOWN'])
33
 
34
  # Flatten the list of phonemes
35
  flattened_phonemes = [p for sublist in phonemes for p in sublist]
36
 
37
- # Create dummy 13-feature vectors for each phoneme (you need to implement your own feature extraction)
38
  num_features = 13
39
  sequence_length = len(flattened_phonemes)
40
- input_data = np.random.rand(sequence_length, num_features) # Placeholder, replace with actual feature extraction
41
 
42
  # Add batch dimension
43
- input_data = np.expand_dims(input_data, axis=0) # Shape (1, sequence_length, 13)
44
 
45
  return input_data
46
 
@@ -48,17 +45,20 @@ def preprocess_text(text):
48
  def convert_to_audio(model_output, filename="output.wav", sample_rate=22050):
49
  """
50
  Convert the model output into a .wav file.
51
- Model output is expected to be a numpy array.
52
  """
53
- # Normalize the audio output (optional, based on your model)
 
 
 
 
54
  normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
55
 
56
- # Write the audio data to a file (assuming model_output is a waveform)
57
  write(filename, sample_rate, normalized_output)
58
 
59
  return filename
60
 
61
- # Define function to generate sound effect
62
  def generate_sfx(text):
63
  """
64
  Takes input text, preprocesses it, runs it through the model,
@@ -69,8 +69,12 @@ def generate_sfx(text):
69
  # Generate prediction
70
  prediction = model.predict(input_data)
71
 
 
 
 
 
72
  # Convert the prediction to an audio file
73
- audio_file = convert_to_audio(prediction.flatten(), filename="output.wav")
74
 
75
  return audio_file
76
 
@@ -78,7 +82,7 @@ def generate_sfx(text):
78
  interface = gr.Interface(
79
  fn=generate_sfx,
80
  inputs=gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
81
- outputs=gr.Audio(label="Generated SFX", type="filepath"), # Changed here
82
  live=False,
83
  title="SFX Generator from Text",
84
  description="Enter a word or sentence, and the model will generate an SFX sound.",
 
2
  import tensorflow as tf
3
  import numpy as np
4
  import nltk
 
5
  from nltk.corpus import cmudict
6
  from scipy.io.wavfile import write
7
 
 
16
  def preprocess_text(text):
17
  """
18
  Process the input text to prepare it for the model.
 
 
19
  """
20
  d = cmudict.dict()
21
  words = text.lower().split()
 
25
  if word in d:
26
  phonemes.append(d[word][0])
27
  else:
28
+ # Use a placeholder for words not found in cmudict
29
  phonemes.append(['UNKNOWN'])
30
 
31
  # Flatten the list of phonemes
32
  flattened_phonemes = [p for sublist in phonemes for p in sublist]
33
 
34
+ # Create dummy feature vectors (this should be replaced with actual feature extraction)
35
  num_features = 13
36
  sequence_length = len(flattened_phonemes)
37
+ input_data = np.random.rand(sequence_length, num_features) # Placeholder
38
 
39
  # Add batch dimension
40
+ input_data = np.expand_dims(input_data, axis=0)
41
 
42
  return input_data
43
 
 
45
  def convert_to_audio(model_output, filename="output.wav", sample_rate=22050):
46
  """
47
  Convert the model output into a .wav file.
 
48
  """
49
+ # Check if model_output is empty
50
+ if model_output is None or len(model_output) == 0:
51
+ raise ValueError("Model output is empty.")
52
+
53
+ # Normalize the audio output
54
  normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
55
 
56
+ # Write the audio data to a file
57
  write(filename, sample_rate, normalized_output)
58
 
59
  return filename
60
 
61
+ # Generate sound effect
62
  def generate_sfx(text):
63
  """
64
  Takes input text, preprocesses it, runs it through the model,
 
69
  # Generate prediction
70
  prediction = model.predict(input_data)
71
 
72
+ # Ensure prediction shape is correct
73
+ if prediction.ndim == 2 and prediction.shape[1] > 1:
74
+ prediction = prediction.flatten() # Flatten if necessary
75
+
76
  # Convert the prediction to an audio file
77
+ audio_file = convert_to_audio(prediction, filename="output.wav")
78
 
79
  return audio_file
80
 
 
82
  interface = gr.Interface(
83
  fn=generate_sfx,
84
  inputs=gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
85
+ outputs=gr.Audio(label="Generated SFX", type="filepath"),
86
  live=False,
87
  title="SFX Generator from Text",
88
  description="Enter a word or sentence, and the model will generate an SFX sound.",