Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -67,19 +67,10 @@ def empty_output_folder(output_dir):
|
|
| 67 |
|
| 68 |
# Function to create a temporary file with string content
|
| 69 |
def create_temp_file(content, prefix, suffix=".txt"):
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
temp_file.write(content)
|
| 75 |
-
temp_file.close()
|
| 76 |
-
|
| 77 |
-
# Debug: Print file contents
|
| 78 |
-
print(f"\nContent written to {prefix}{suffix}:")
|
| 79 |
-
print(content)
|
| 80 |
-
print("---")
|
| 81 |
-
|
| 82 |
-
return temp_file.name
|
| 83 |
|
| 84 |
def get_last_mp3_file(output_dir):
|
| 85 |
# List all files in the output directory
|
|
@@ -121,13 +112,13 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
| 121 |
"python", "infer.py",
|
| 122 |
"--stage1_model", "m-a-p/YuE-s1-7B-anneal-en-cot",
|
| 123 |
"--stage2_model", "m-a-p/YuE-s2-1B-general",
|
| 124 |
-
"--genre_txt", f"{genre_txt_path}",
|
| 125 |
-
"--lyrics_txt", f"{lyrics_txt_path}",
|
| 126 |
-
"--run_n_segments",
|
| 127 |
"--stage2_batch_size", "4",
|
| 128 |
-
"--output_dir", f"{output_dir}",
|
| 129 |
"--cuda_idx", "0",
|
| 130 |
-
"--max_new_tokens",
|
| 131 |
"--disable_offload_model"
|
| 132 |
]
|
| 133 |
|
|
@@ -191,16 +182,38 @@ with gr.Blocks() as demo:
|
|
| 191 |
""")
|
| 192 |
with gr.Row():
|
| 193 |
with gr.Column():
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
with gr.Column():
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
num_segments = gr.Number(label="Number of Song Segments", value=2, interactive=True)
|
| 203 |
-
max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="24000", step=500, value=3000, interactive=True)
|
| 204 |
submit_btn = gr.Button("Submit")
|
| 205 |
music_out = gr.Audio(label="Audio Result")
|
| 206 |
|
|
|
|
| 67 |
|
| 68 |
# Function to create a temporary file with string content
|
| 69 |
def create_temp_file(content, prefix, suffix=".txt"):
    """Write ``content`` to a new temporary file and return the file's path.

    The file name starts with ``prefix`` and ends with ``suffix``. The text is
    written as UTF-8. The file is not removed by this function; it stays on
    disk after the handle is closed.
    """
    # delete=False keeps the file in place once the handle is closed.
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        prefix=prefix,
        suffix=suffix,
        delete=False,
    ) as handle:
        handle.write(content)
    return handle.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
def get_last_mp3_file(output_dir):
|
| 76 |
# List all files in the output directory
|
|
|
|
| 112 |
"python", "infer.py",
|
| 113 |
"--stage1_model", "m-a-p/YuE-s1-7B-anneal-en-cot",
|
| 114 |
"--stage2_model", "m-a-p/YuE-s2-1B-general",
|
| 115 |
+
"--genre_txt", f"'{genre_txt_path}'",
|
| 116 |
+
"--lyrics_txt", f"'{lyrics_txt_path}'",
|
| 117 |
+
"--run_n_segments", str(num_segments),
|
| 118 |
"--stage2_batch_size", "4",
|
| 119 |
+
"--output_dir", f"'{output_dir}'",
|
| 120 |
"--cuda_idx", "0",
|
| 121 |
+
"--max_new_tokens", str(max_new_tokens),
|
| 122 |
"--disable_offload_model"
|
| 123 |
]
|
| 124 |
|
|
|
|
| 182 |
""")
|
| 183 |
with gr.Row():
|
| 184 |
with gr.Column():
|
| 185 |
+
with gr.Accordion("Pro Tips", open=False):
|
| 186 |
+
gr.Markdown(f"""
|
| 187 |
+
**Tips:**
|
| 188 |
+
1. `genres` should include details like instruments, genre, mood, vocal timbre, and vocal gender.
|
| 189 |
+
2. The length of `lyrics` segments and the `--max_new_tokens` value should be matched. For example, if `--max_new_tokens` is set to 3000, the maximum duration for a segment is around 30 seconds. Ensure your lyrics fit this time frame.
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
**Notice:**
|
| 193 |
+
1. A suitable [Genre] tag consists of five components: genre, instrument, mood, gender, and timbre. All five should be included if possible, separated by spaces. The values of timbre should include "vocal" (e.g., "bright vocal").
|
| 194 |
+
|
| 195 |
+
2. Although our tags have an open vocabulary, we have provided the 200 most commonly used <a href="https://github.com/multimodal-art-projection/YuE/blob/main/top_200_tags.json" id="tags_link" target="_blank">tags</a>. It is recommended to select tags from this list for more stable results.
|
| 196 |
+
|
| 197 |
+
3. The order of the tags is flexible. For example, a stable genre control string might look like: "inspiring female uplifting pop airy vocal electronic bright vocal vocal."
|
| 198 |
+
|
| 199 |
+
4. Additionally, we have introduced the "Mandarin" and "Cantonese" tags to distinguish between Mandarin and Cantonese, as their lyrics often share similarities.
|
| 200 |
+
""")
|
| 201 |
+
genre_txt = gr.Textbox(
|
| 202 |
+
label="Genre",
|
| 203 |
+
placeholder="Example: inspiring female uplifting pop airy vocal...",
|
| 204 |
+
info="Text containing genre tags that describe the musical style or characteristics (e.g., instrumental, genre, mood, vocal timbre, vocal gender). This is used as part of the generation prompt."
|
| 205 |
+
)
|
| 206 |
+
lyrics_txt = gr.Textbox(
|
| 207 |
+
label="Lyrics", lines=12,
|
| 208 |
+
placeholder="Type the lyrics here...",
|
| 209 |
+
info="Text containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process."
|
| 210 |
+
)
|
| 211 |
|
| 212 |
with gr.Column():
|
| 213 |
+
|
| 214 |
+
num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
|
| 215 |
+
max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="3000", step=500, value=1500, interactive=True)
|
| 216 |
+
|
|
|
|
|
|
|
| 217 |
submit_btn = gr.Button("Submit")
|
| 218 |
music_out = gr.Audio(label="Audio Result")
|
| 219 |
|