szili2011 committed on
Commit
3ffb926
·
verified ·
1 Parent(s): 2b6bee1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -84
app.py CHANGED
@@ -1,104 +1,57 @@
 
 
1
  import gradio as gr
 
2
  import tensorflow as tf
3
- import numpy as np
4
  import nltk
5
  from nltk.corpus import cmudict
6
- from scipy.io.wavfile import write
 
 
7
 
8
  # Ensure TensorFlow uses CPU only
9
- tf.config.set_visible_devices([], 'GPU')
10
 
11
- # Download required NLTK data
12
- nltk.download('averaged_perceptron_tagger')
13
- nltk.download('cmudict')
14
 
15
- # Load your model
16
- model = tf.keras.models.load_model('audio_model.h5')
 
17
 
18
- # Preprocess input text
19
- def preprocess_text(text):
20
- """
21
- Process the input text to prepare it for the model.
22
- """
23
- d = cmudict.dict()
24
- words = text.lower().split()
25
- phonemes = []
26
 
27
- for word in words:
28
- if word in d:
29
- phonemes.append(d[word][0])
30
- else:
31
- phonemes.append(['UNKNOWN'])
32
-
33
- # Flatten the list of phonemes
34
- flattened_phonemes = [p for sublist in phonemes for p in sublist]
35
-
36
- # Create dummy feature vectors (this should be replaced with actual feature extraction)
37
- num_features = 13
38
- sequence_length = len(flattened_phonemes)
39
- input_data = np.random.rand(sequence_length, num_features) # Placeholder
40
-
41
- # Add batch dimension
42
- input_data = np.expand_dims(input_data, axis=0)
43
 
44
- return input_data
 
 
45
 
46
- # Convert model output to an audio file
47
- def convert_to_audio(model_output, sample_rate=22050):
48
- """
49
- Convert the model output into a .wav file.
50
- """
51
- # Check if model_output is empty
52
- if model_output is None or len(model_output) == 0:
53
- raise ValueError("Model output is empty.")
54
 
55
- # Normalize the audio output
56
- normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
57
-
58
- return normalized_output
 
59
 
60
- # Generate sound effect with specified duration
61
- def generate_sfx(text, duration=30):
62
- """
63
- Takes input text, preprocesses it, runs it through the model,
64
- and generates a downloadable audio file for a specified duration.
65
- """
66
- input_data = preprocess_text(text)
67
-
68
- # Calculate total samples for the specified duration
69
- total_samples = duration * 22050 # Samples for 30 seconds
70
-
71
- # Generate audio samples
72
- generated_samples = model.predict(input_data)
73
-
74
- # Ensure generated samples meet the required duration
75
- if generated_samples.shape[1] < total_samples:
76
- # Pad with zeros if not enough audio is generated
77
- padding = np.zeros((1, total_samples - generated_samples.shape[1]))
78
- generated_samples = np.concatenate([generated_samples, padding], axis=1)
79
-
80
- # Convert the prediction to audio data
81
- audio_data = convert_to_audio(generated_samples)
82
 
83
- # Write the audio data to a file, limiting to the specified duration
84
  output_filename = "output.wav"
85
- write(output_filename, 22050, audio_data[:total_samples]) # Limit to total_samples
86
-
87
  return output_filename
88
 
89
- # Define the Gradio interface with updated slider settings
90
- interface = gr.Interface(
91
- fn=generate_sfx,
92
- inputs=[
93
- gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
94
- gr.Slider(label="Duration (seconds)", minimum=2, maximum=20, value=5) # Set duration options
95
- ],
96
- outputs=gr.Audio(label="Generated SFX", type="filepath"),
97
- live=False,
98
- title="SFX Generator from Text",
99
- description="Enter a word or sentence, and the model will generate an SFX sound for the specified duration.",
100
- )
101
 
102
- # Run the interface
103
  if __name__ == "__main__":
104
- interface.launch()
 
1
+ import os
2
+ import numpy as np
3
  import gradio as gr
4
+ from scipy.io.wavfile import write
5
  import tensorflow as tf
 
6
  import nltk
7
  from nltk.corpus import cmudict
8
+
9
+ # Download CMU dictionary if not already downloaded
10
+ nltk.download('cmudict', quiet=True)
11
 
12
  # Ensure TensorFlow uses CPU only
13
+ os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Disable GPU
14
 
15
+ # Load CMU dictionary for pronunciation
16
+ cmu_dict = cmudict.dict()
 
17
 
18
+ # Load your pre-trained model (adjust the model loading according to your implementation)
19
+ # For example, if your model is a Keras model, you would use:
20
+ # model = tf.keras.models.load_model('path_to_your_model')
21
 
22
+ # Replace this with your actual model loading code
23
+ # model = ...
 
 
 
 
 
 
24
 
25
def generate_audio(text, duration, sample_rate=22050, frequency=440.0, amplitude=0.5):
    """Return a placeholder sine tone lasting ``duration`` seconds.

    No model is wired in yet, so ``text`` is accepted only to keep the
    call signature stable; it does not influence the generated signal.

    Args:
        text: Prompt describing the sound effect (currently unused).
        duration: Clip length in seconds; may be fractional.
        sample_rate: Samples per second (default 22050).
        frequency: Tone frequency in Hz (default 440, the A4 note).
        amplitude: Peak amplitude of the float signal (default 0.5).

    Returns:
        A 1-D float NumPy array holding ``int(sample_rate * duration)``
        samples of the sine wave.

    Raises:
        ValueError: If ``duration`` is negative or ``sample_rate`` is not
            positive.
    """
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}.")
    if duration < 0:
        raise ValueError(f"duration must be non-negative, got {duration}.")

    num_samples = int(sample_rate * duration)

    # Build the time axis from the sample index so consecutive samples are
    # always exactly 1 / sample_rate apart, even when sample_rate * duration
    # is fractional and the sample count has been truncated.
    t = np.arange(num_samples) / sample_rate

    return amplitude * np.sin(2 * np.pi * frequency * t)
 
 
 
 
 
 
 
34
 
35
def generate_sfx(duration):
    """Generate a sound effect and save it as a 16-bit PCM WAV file.

    Args:
        duration: Requested clip length in seconds. The Gradio slider may
            deliver a fractional value, so this is not assumed to be an int.

    Returns:
        The path of the WAV file that was written ("output.wav").

    Raises:
        ValueError: If the generator returned fewer samples than requested.
    """
    sample_rate = 22050  # Hz; must match the rate used by generate_audio
    text = "Sample text for audio generation"  # Replace with actual input text if needed

    audio_data = generate_audio(text, duration)

    # The sample count must be an integer: a float here would make the slice
    # below raise TypeError for fractional durations. Truncation mirrors how
    # generate_audio sizes its output, so the length check cannot trip on
    # rounding alone.
    total_samples = int(sample_rate * duration)

    if len(audio_data) < total_samples:
        raise ValueError(f"Generated audio is shorter than {duration} seconds.")

    # Clip to [-1, 1] before scaling so an over-range sample saturates
    # instead of wrapping around when cast to int16.
    pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

    output_filename = "output.wav"
    write(output_filename, sample_rate, pcm[:total_samples])  # Write to WAV file
    return output_filename
47
 
48
+ duration_slider = gr.Slider(minimum=2, maximum=20, label="Duration (seconds)", value=10)
49
+
50
+ app = gr.Interface(fn=generate_sfx,
51
+ inputs=duration_slider,
52
+ outputs="audio",
53
+ title="Sound Effect Generator",
54
+ description="Generate sound effects for a specified duration.")
 
 
 
 
 
55
 
 
56
  if __name__ == "__main__":
57
+ app.launch()