szili2011 commited on
Commit
38b530f
·
verified ·
1 Parent(s): 683cc0e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -68
app.py CHANGED
@@ -1,81 +1,70 @@
1
- import numpy as np
2
- import soundfile as sf
3
  import gradio as gr
4
- from g2p_en import G2p
5
- from tensorflow.keras.models import load_model
6
- import os
 
7
 
8
- # Load the pre-trained model from the local directory
9
- model_path = './audio_model.h5'
10
- model = load_model(model_path)
11
 
12
- # Initialize the g2p model for converting text to phonemes
13
- g2p = G2p()
14
 
15
def text_to_phonemes(text):
    """Return the phoneme sequence for *text* using the module-level g2p model."""
    return g2p(text)
 
 
 
 
 
19
 
20
def generate_audio_features(phonemes, duration):
    """Generate a 1-D array of audio features from a phoneme sequence.

    Parameters
    ----------
    phonemes : sequence of str
        Phoneme strings produced by ``text_to_phonemes`` (e.g. 'HH', 'AH0').
    duration : float
        Target clip length in seconds.

    Returns
    -------
    numpy.ndarray
        Flat array of ``int(duration * 22050)`` feature values.
    """
    # BUG FIX: ord(p) raises TypeError for multi-character phoneme strings
    # such as 'AH0' (g2p emits ARPAbet tokens, not single chars). Sum the
    # character codes so every phoneme maps to one deterministic number.
    phoneme_features = np.array(
        [sum(ord(ch) for ch in p) for p in phonemes], dtype=np.float32
    )
    phoneme_features = np.expand_dims(phoneme_features, axis=0)  # add batch dim

    # Run the pre-trained model to obtain raw audio features.
    audio_features = model.predict(phoneme_features)

    # Stretch/trim to the requested duration; 22050 samples per second is the
    # sample rate used throughout this script.
    num_samples = int(duration * 22050)
    audio_features = np.resize(audio_features, (num_samples,))

    return audio_features
35
-
36
def features_to_audio(features):
    """Linearly rescale raw model features into audio samples in [-1, 1].

    Parameters
    ----------
    features : numpy.ndarray
        Raw feature values of any range.

    Returns
    -------
    numpy.ndarray
        Same shape as *features*, rescaled to [-1, 1]; silence (zeros) when
        the input is constant.
    """
    lo, hi = features.min(), features.max()
    # BUG FIX: np.interp requires strictly increasing xp; a constant input
    # (lo == hi) previously produced undefined output. Return silence instead.
    if lo == hi:
        return np.zeros_like(features, dtype=float)
    return np.interp(features, (lo, hi), (-1, 1))
42
 
43
def generate_audio(text, duration):
    """Turn *text* into a WAV file of roughly *duration* seconds.

    Pipeline: text -> phonemes -> model features -> normalized audio -> file.
    Returns the path of the written WAV file.
    """
    sample_rate = 22050  # matches the rate assumed by the feature generator

    phonemes = text_to_phonemes(text)
    features = generate_audio_features(phonemes, duration)
    samples = features_to_audio(features)

    out_path = 'generated_audio.wav'
    sf.write(out_path, samples, sample_rate)
    return out_path
60
 
61
- # Gradio interface
62
def text_to_audio_interface(text, duration):
    """Gradio callback: generate audio for *text* and return the file path."""
    return generate_audio(text, duration)
 
 
 
 
 
 
 
 
 
 
69
 
70
# Build and launch the Gradio UI.
# BUG FIX: gr.inputs.*, gr.outputs.* and the Slider `default=` keyword were
# removed in Gradio 3.x; use the top-level components with `value=` instead.
gr.Interface(
    fn=text_to_audio_interface,
    inputs=[
        gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into Sfx Sound"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Audio Duration (seconds)"),
    ],
    outputs=gr.Audio(label="Generated Audio Preview"),
    title="Text-to-Audio Generator",
    description="Write a Word, set the duration, and press 'Generate' to convert the word into an audio effect!",
    live=True,
).launch()
 
 
 
1
  import gradio as gr
2
+ import tensorflow as tf
3
+ import numpy as np
4
+ import nltk
5
+ from nltk.corpus import cmudict
6
 
7
# Fetch the NLTK resources the preprocessing step depends on: the POS tagger
# model and the CMU pronouncing dictionary (cmudict).
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')

# Load the pre-trained Keras audio model from the working directory.
model = tf.keras.models.load_model('audio_model.h5')
13
 
14
# Preprocess input text
def preprocess_text(text):
    """Convert *text* to a 1-D numeric phoneme encoding for the model.

    Each whitespace-separated word is looked up in the CMU pronouncing
    dictionary; a word missing from the dictionary contributes a single
    'UNKNOWN' token. Every phoneme is then mapped to an integer in [0, 1000).

    Returns
    -------
    numpy.ndarray
        One integer per phoneme.
    """
    import hashlib  # local import: only needed for the stable phoneme hash

    d = cmudict.dict()
    words = text.lower().split()
    phonemes = []

    for word in words:
        if word in d:
            # cmudict may list several pronunciations; take the first.
            phonemes.append(d[word][0])
        else:
            # Word not found in cmudict — use a placeholder token.
            phonemes.append(['UNKNOWN'])

    # Flatten the per-word phoneme lists into one sequence.
    flattened_phonemes = [p for sublist in phonemes for p in sublist]

    # BUG FIX: the builtin hash() is salted per interpreter run
    # (PYTHONHASHSEED), so the old encoding produced different numbers for
    # the same text on every launch. Use a stable digest instead so the
    # model always sees the same input for the same text.
    def _encode(p):
        digest = hashlib.md5(p.encode('utf-8')).digest()
        return int.from_bytes(digest[:4], 'big') % 1000

    return np.array([_encode(p) for p in flattened_phonemes])
37
 
38
# Define function to generate sound
def generate_sfx(text):
    """Preprocess *text*, run it through the model, and return the raw
    prediction array.

    The prediction is currently returned as-is; turning it into an actual
    sound file (e.g. a WAV) is still a placeholder step.
    """
    encoded = preprocess_text(text)

    # The model expects a leading batch dimension.
    batched = np.expand_dims(encoded, axis=0)

    return model.predict(batched)
57
+
58
# Define the Gradio interface.
# BUG FIX: "numpy" is not a valid Gradio output component shortcut; render the
# raw prediction as JSON instead (the ndarray is converted via .tolist()).
interface = gr.Interface(
    fn=lambda text: generate_sfx(text).tolist(),
    inputs=gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
    outputs=gr.JSON(label="Model Output"),
    live=False,
    title="SFX Generator from Text",
    description="Enter a word or sentence, and the model will generate an SFX sound.",
)

# Run the interface only when executed as a script.
if __name__ == "__main__":
    interface.launch()