Ftps committed on
Commit
cb98587
·
1 Parent(s): 08e674b

Add model ZIP download support

Browse files
Files changed (2) hide show
  1. app.py +5 -1
  2. tabs/api/realtime_api.py +58 -21
app.py CHANGED
@@ -104,6 +104,10 @@ with gr.Blocks(
104
  )
105
 
106
  gr.Markdown("### Realtime Voice Conversion (Streaming)")
 
 
 
 
107
  with gr.Row():
108
  rt_model = gr.Dropdown(
109
  label="Model",
@@ -127,7 +131,7 @@ with gr.Blocks(
127
 
128
  rt_input.stream(
129
  fn=process_audio_stream,
130
- inputs=[rt_state, rt_input, rt_model, rt_pitch, rt_index_rate],
131
  outputs=[rt_state, rt_output],
132
  api_name="realtime_convert",
133
  )
 
104
  )
105
 
106
  gr.Markdown("### Realtime Voice Conversion (Streaming)")
107
+ rt_model_zip = gr.Textbox(
108
+ label="Model ZIP URL (optional)",
109
+ placeholder="https://example.com/model.zip",
110
+ )
111
  with gr.Row():
112
  rt_model = gr.Dropdown(
113
  label="Model",
 
131
 
132
  rt_input.stream(
133
  fn=process_audio_stream,
134
+ inputs=[rt_state, rt_input, rt_model_zip, rt_model, rt_pitch, rt_index_rate],
135
  outputs=[rt_state, rt_output],
136
  api_name="realtime_convert",
137
  )
tabs/api/realtime_api.py CHANGED
@@ -1,7 +1,10 @@
1
  import os
2
  import sys
 
 
 
3
  import numpy as np
4
- from typing import Optional, Tuple, Any
5
 
6
  now_dir = os.getcwd()
7
  sys.path.append(now_dir)
@@ -10,6 +13,44 @@ LOGS_DIR = os.path.join(now_dir, "logs")
10
  SAMPLE_RATE = 48000
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def get_available_models() -> list:
14
  if not os.path.exists(LOGS_DIR):
15
  return []
@@ -52,13 +93,7 @@ class RealtimeVoiceChanger:
52
  "f0_autotune_strength": 1.0,
53
  }
54
 
55
- def load_model(
56
- self,
57
- model_name: str,
58
- f0_method: str = "rmvpe",
59
- pitch: int = 0,
60
- index_rate: float = 0.75,
61
- ) -> str:
62
  from rvc.realtime.core import VoiceChanger
63
 
64
  pth_path, index_path, error = get_model_paths(model_name)
@@ -74,8 +109,6 @@ class RealtimeVoiceChanger:
74
  f0_method=f0_method,
75
  )
76
  self.model_name = model_name
77
- self.settings["pitch"] = pitch
78
- self.settings["index_rate"] = index_rate
79
  return f"Model '{model_name}' loaded"
80
 
81
  def convert(self, audio: np.ndarray) -> Optional[np.ndarray]:
@@ -94,30 +127,34 @@ class RealtimeVoiceChanger:
94
  return result
95
 
96
 
97
- def create_voice_changer_state() -> RealtimeVoiceChanger:
98
- return RealtimeVoiceChanger()
99
-
100
-
101
  def process_audio_stream(
102
  state: Optional[RealtimeVoiceChanger],
103
  audio_chunk: Optional[Tuple[int, np.ndarray]],
 
104
  model_name: str,
105
  pitch: int,
106
  index_rate: float,
107
  ) -> Tuple[RealtimeVoiceChanger, Optional[Tuple[int, np.ndarray]]]:
108
  if state is None:
109
- state = create_voice_changer_state()
110
 
111
  if audio_chunk is None:
112
  return state, None
113
 
114
- sr, audio = audio_chunk
 
 
 
 
115
 
116
- if state.model_name != model_name and model_name:
117
- state.load_model(model_name, pitch=pitch, index_rate=index_rate)
118
- else:
119
- state.settings["pitch"] = pitch
120
- state.settings["index_rate"] = index_rate
 
 
 
121
 
122
  if audio.ndim > 1:
123
  audio = audio.mean(axis=1)
 
1
  import os
2
  import sys
3
+ import io
4
+ import zipfile
5
+ import requests
6
  import numpy as np
7
+ from typing import Optional, Tuple
8
 
9
  now_dir = os.getcwd()
10
  sys.path.append(now_dir)
 
13
  SAMPLE_RATE = 48000
14
 
15
 
16
+ def download_and_extract_model(url: str) -> Tuple[Optional[str], str]:
17
+ if not url:
18
+ return None, "URL is empty"
19
+ try:
20
+ response = requests.get(url, stream=True)
21
+ response.raise_for_status()
22
+
23
+ with zipfile.ZipFile(io.BytesIO(response.content)) as z:
24
+ model_name = os.path.splitext(os.path.basename(url))[0]
25
+ model_path = os.path.join(LOGS_DIR, model_name)
26
+ os.makedirs(model_path, exist_ok=True)
27
+
28
+ pth_file = next((n for n in z.namelist() if n.endswith(".pth")), None)
29
+ index_file = next((n for n in z.namelist() if n.endswith(".index")), None)
30
+
31
+ if not pth_file:
32
+ return None, "No .pth file in zip"
33
+
34
+ z.extract(pth_file, model_path)
35
+ if os.path.dirname(pth_file):
36
+ os.rename(
37
+ os.path.join(model_path, pth_file),
38
+ os.path.join(model_path, os.path.basename(pth_file)),
39
+ )
40
+
41
+ if index_file:
42
+ z.extract(index_file, model_path)
43
+ if os.path.dirname(index_file):
44
+ os.rename(
45
+ os.path.join(model_path, index_file),
46
+ os.path.join(model_path, os.path.basename(index_file)),
47
+ )
48
+
49
+ return model_name, f"Model '{model_name}' downloaded"
50
+ except Exception as e:
51
+ return None, str(e)
52
+
53
+
54
  def get_available_models() -> list:
55
  if not os.path.exists(LOGS_DIR):
56
  return []
 
93
  "f0_autotune_strength": 1.0,
94
  }
95
 
96
+ def load_model(self, model_name: str, f0_method: str = "rmvpe") -> str:
 
 
 
 
 
 
97
  from rvc.realtime.core import VoiceChanger
98
 
99
  pth_path, index_path, error = get_model_paths(model_name)
 
109
  f0_method=f0_method,
110
  )
111
  self.model_name = model_name
 
 
112
  return f"Model '{model_name}' loaded"
113
 
114
  def convert(self, audio: np.ndarray) -> Optional[np.ndarray]:
 
127
  return result
128
 
129
 
 
 
 
 
130
  def process_audio_stream(
131
  state: Optional[RealtimeVoiceChanger],
132
  audio_chunk: Optional[Tuple[int, np.ndarray]],
133
+ model_zip_link: str,
134
  model_name: str,
135
  pitch: int,
136
  index_rate: float,
137
  ) -> Tuple[RealtimeVoiceChanger, Optional[Tuple[int, np.ndarray]]]:
138
  if state is None:
139
+ state = RealtimeVoiceChanger()
140
 
141
  if audio_chunk is None:
142
  return state, None
143
 
144
+ # Download model from ZIP if provided
145
+ if model_zip_link and not model_name:
146
+ downloaded_name, msg = download_and_extract_model(model_zip_link)
147
+ if downloaded_name:
148
+ model_name = downloaded_name
149
 
150
+ # Load model if changed
151
+ if model_name and state.model_name != model_name:
152
+ state.load_model(model_name)
153
+
154
+ state.settings["pitch"] = pitch
155
+ state.settings["index_rate"] = index_rate
156
+
157
+ sr, audio = audio_chunk
158
 
159
  if audio.ndim > 1:
160
  audio = audio.mean(axis=1)