szili2011 commited on
Commit
38b530f
·
verified ·
1 Parent(s): 683cc0e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -68
app.py CHANGED
@@ -1,81 +1,70 @@
1
- import numpy as np
2
- import soundfile as sf
3
  import gradio as gr
4
- from g2p_en import G2p
5
- from tensorflow.keras.models import load_model
6
- import os
 
7
 
8
- # Load the pre-trained model from the local directory
9
- model_path = './audio_model.h5'
10
- model = load_model(model_path)
11
 
12
- # Initialize the g2p model for converting text to phonemes
13
- g2p = G2p()
14
 
15
def text_to_phonemes(text):
    """Return the phoneme sequence for *text* using the module-level g2p model."""
    return g2p(text)
 
 
 
 
 
19
 
20
def generate_audio_features(phonemes, duration):
    """Generate a 1-D array of audio features from a phoneme sequence.

    Parameters
    ----------
    phonemes : sequence of str
        Phoneme strings produced by ``text_to_phonemes`` (e.g. 'HH', 'AH0').
    duration : float
        Target clip length in seconds.

    Returns
    -------
    numpy.ndarray
        Flat array of ``int(duration * 22050)`` feature values.
    """
    # BUG FIX: ord(p) raises TypeError for multi-character phoneme strings
    # such as 'AH0' (g2p emits ARPAbet tokens, not single chars). Sum the
    # character codes so every phoneme maps to one deterministic number.
    phoneme_features = np.array(
        [sum(ord(ch) for ch in p) for p in phonemes], dtype=np.float32
    )
    phoneme_features = np.expand_dims(phoneme_features, axis=0)  # add batch dim

    # Run the pre-trained model to obtain raw audio features.
    audio_features = model.predict(phoneme_features)

    # Stretch/trim to the requested duration; 22050 samples per second is the
    # sample rate used throughout this script.
    num_samples = int(duration * 22050)
    audio_features = np.resize(audio_features, (num_samples,))

    return audio_features
35
-
36
def features_to_audio(features):
    """Linearly rescale raw model features into audio samples in [-1, 1].

    Parameters
    ----------
    features : numpy.ndarray
        Raw feature values of any range.

    Returns
    -------
    numpy.ndarray
        Same shape as *features*, rescaled to [-1, 1]; silence (zeros) when
        the input is constant.
    """
    lo, hi = features.min(), features.max()
    # BUG FIX: np.interp requires strictly increasing xp; a constant input
    # (lo == hi) previously produced undefined output. Return silence instead.
    if lo == hi:
        return np.zeros_like(features, dtype=float)
    return np.interp(features, (lo, hi), (-1, 1))
42
 
43
def generate_audio(text, duration):
    """Turn *text* into a WAV file of roughly *duration* seconds.

    Pipeline: text -> phonemes -> model features -> normalized audio -> file.
    Returns the path of the written WAV file.
    """
    sample_rate = 22050  # matches the rate assumed by the feature generator

    phonemes = text_to_phonemes(text)
    features = generate_audio_features(phonemes, duration)
    samples = features_to_audio(features)

    out_path = 'generated_audio.wav'
    sf.write(out_path, samples, sample_rate)
    return out_path
60
 
61
- # Gradio interface
62
def text_to_audio_interface(text, duration):
    """Gradio callback: generate audio for *text* and return the file path."""
    return generate_audio(text, duration)
 
 
 
 
 
 
 
 
 
 
69
 
70
# Build and launch the Gradio UI.
# BUG FIX: gr.inputs.*, gr.outputs.* and the Slider `default=` keyword were
# removed in Gradio 3.x; use the top-level components with `value=` instead.
gr.Interface(
    fn=text_to_audio_interface,
    inputs=[
        gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into Sfx Sound"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Audio Duration (seconds)"),
    ],
    outputs=gr.Audio(label="Generated Audio Preview"),
    title="Text-to-Audio Generator",
    description="Write a Word, set the duration, and press 'Generate' to convert the word into an audio effect!",
    live=True,
).launch()
 
 
 
1
  import gradio as gr
2
+ import tensorflow as tf
3
+ import numpy as np
4
+ import nltk
5
+ from nltk.corpus import cmudict
6
 
7
# Fetch the NLTK resources the preprocessing step depends on: the POS tagger
# model and the CMU pronouncing dictionary (cmudict).
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')

# Load the pre-trained Keras audio model from the working directory.
model = tf.keras.models.load_model('audio_model.h5')
13
 
14
# Preprocess input text
def preprocess_text(text):
    """Convert *text* to a 1-D numeric phoneme encoding for the model.

    Each whitespace-separated word is looked up in the CMU pronouncing
    dictionary; a word missing from the dictionary contributes a single
    'UNKNOWN' token. Every phoneme is then mapped to an integer in [0, 1000).

    Returns
    -------
    numpy.ndarray
        One integer per phoneme.
    """
    import hashlib  # local import: only needed for the stable phoneme hash

    d = cmudict.dict()
    words = text.lower().split()
    phonemes = []

    for word in words:
        if word in d:
            # cmudict may list several pronunciations; take the first.
            phonemes.append(d[word][0])
        else:
            # Word not found in cmudict — use a placeholder token.
            phonemes.append(['UNKNOWN'])

    # Flatten the per-word phoneme lists into one sequence.
    flattened_phonemes = [p for sublist in phonemes for p in sublist]

    # BUG FIX: the builtin hash() is salted per interpreter run
    # (PYTHONHASHSEED), so the old encoding produced different numbers for
    # the same text on every launch. Use a stable digest instead so the
    # model always sees the same input for the same text.
    def _encode(p):
        digest = hashlib.md5(p.encode('utf-8')).digest()
        return int.from_bytes(digest[:4], 'big') % 1000

    return np.array([_encode(p) for p in flattened_phonemes])
37
 
38
# Define function to generate sound
def generate_sfx(text):
    """Preprocess *text*, run it through the model, and return the raw
    prediction array.

    The prediction is currently returned as-is; turning it into an actual
    sound file (e.g. a WAV) is still a placeholder step.
    """
    encoded = preprocess_text(text)

    # The model expects a leading batch dimension.
    batched = np.expand_dims(encoded, axis=0)

    return model.predict(batched)
57
+
58
# Define the Gradio interface.
# BUG FIX: "numpy" is not a valid Gradio output component shortcut; render the
# raw prediction as JSON instead (the ndarray is converted via .tolist()).
interface = gr.Interface(
    fn=lambda text: generate_sfx(text).tolist(),
    inputs=gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
    outputs=gr.JSON(label="Model Output"),
    live=False,
    title="SFX Generator from Text",
    description="Enter a word or sentence, and the model will generate an SFX sound.",
)

# Run the interface only when executed as a script.
if __name__ == "__main__":
    interface.launch()