szili2011 committed
Commit 543c357 · verified · 1 Parent(s): c3f5e81

Update app.py

Files changed (1)
  1. app.py +16 -27
app.py CHANGED
@@ -5,6 +5,9 @@ import nltk
 from nltk.corpus import cmudict
 from scipy.io.wavfile import write
 
+# Ensure TensorFlow uses CPU only
+tf.config.set_visible_devices([], 'GPU')
+
 # Download required NLTK data
 nltk.download('averaged_perceptron_tagger')
 nltk.download('cmudict')
@@ -52,7 +55,6 @@ def convert_to_audio(model_output, sample_rate=22050):
     # Normalize the audio output
     normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
 
-    # Return normalized output for further processing
     return normalized_output
 
 # Generate sound effect with specified duration
@@ -63,34 +65,21 @@ def generate_sfx(text, duration=30):
     """
     input_data = preprocess_text(text)
 
-    # Initialize an empty list to hold audio segments
-    audio_segments = []
-    total_samples = duration * 22050  # Calculate total samples for 30 seconds
-    generated_samples = 0
-
-    while generated_samples < total_samples:
-        # Generate prediction
-        prediction = model.predict(input_data)
-
-        # Ensure prediction shape is correct
-        if prediction.ndim == 2 and prediction.shape[1] > 1:
-            prediction = prediction.flatten()  # Flatten if necessary
-
-        # Convert the prediction to audio data
-        audio_segment = convert_to_audio(prediction)
-
-        # Append the generated segment to the list
-        audio_segments.append(audio_segment)
-
-        # Increment the total samples generated
-        generated_samples += len(audio_segment)
+    # Calculate total samples for the specified duration
+    total_samples = duration * 22050  # Samples for 30 seconds
+    # Generate audio samples
+    generated_samples = model.predict(input_data)
+
+    # Check the length of generated samples and ensure it meets the required duration
+    if len(generated_samples) < total_samples:
+        raise ValueError(f"Generated audio is shorter than {duration} seconds.")
 
-    # Concatenate all segments to form the final audio output
-    final_audio = np.concatenate(audio_segments)[:total_samples]  # Ensure we cut to the correct length
+    # Convert the prediction to audio data
+    audio_data = convert_to_audio(generated_samples)
 
-    # Write the audio data to a file
+    # Write the audio data to a file, limiting to the specified duration
     output_filename = "output.wav"
-    write(output_filename, 22050, final_audio)
+    write(output_filename, 22050, audio_data[:total_samples])  # Limit to total_samples
 
     return output_filename
 
@@ -99,7 +88,7 @@ interface = gr.Interface(
     fn=generate_sfx,
     inputs=[
         gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
-        gr.Slider(label="Duration (seconds)", minimum=1, maximum=60, value=30)  # Added duration slider
+        gr.Slider(label="Duration (seconds)", minimum=30, maximum=120, value=30)  # Set duration options
     ],
     outputs=gr.Audio(label="Generated SFX", type="filepath"),
     live=False,
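
For reference, the reworked generate_sfx now makes a single model call instead of accumulating segments in a loop. The sketch below mirrors that flow but swaps in a stand-in predict callable and a trivial normalizer so it can run without the trained model; apart from the pieces taken from the diff (convert_to_audio's np.interp normalization, the 22050 Hz rate, the output.wav target), every name here is an assumption for illustration, not the app's actual code.

import numpy as np
from scipy.io.wavfile import write

SAMPLE_RATE = 22050  # rate hard-coded in app.py

def convert_to_audio(model_output):
    # Normalize to the [-1, 1] range, as in the diffed helper
    return np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))

def generate_sfx_sketch(text, duration=30, predict=None):
    # Single-pass flow mirroring the updated generate_sfx; `predict` stands in
    # for model.predict on the real network, which this sketch does not load.
    total_samples = int(duration * SAMPLE_RATE)  # int() so slicing works if duration arrives as a float
    samples = predict(text)
    if len(samples) < total_samples:
        raise ValueError(f"Generated audio is shorter than {duration} seconds.")
    audio = convert_to_audio(samples)
    write("output.wav", SAMPLE_RATE, audio[:total_samples].astype(np.float32))
    return "output.wav"

if __name__ == "__main__":
    # Noise "model" so the sketch runs end to end without the trained network
    fake_predict = lambda text: np.random.uniform(-1.0, 1.0, 60 * SAMPLE_RATE)
    print(generate_sfx_sketch("boom", duration=30, predict=fake_predict))

Trimming the write to audio[:total_samples] matches the diff's behaviour of cutting whatever the model returns down to the requested duration.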