mail0000009 committed on
Commit
ca2ead3
Β·
verified Β·
1 Parent(s): e2806da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -23
app.py CHANGED
@@ -1,32 +1,95 @@
1
- import gradio as gr
2
  import torch
3
  import spaces
4
- from transformers import AutoModelForCausalLM, AutoTokenizer
5
- import scipy.io.wavfile as wavfile
6
-
7
- # Model loads on HF Server, 0% of your local data used
8
- model_id = "maya-research/Veena"
9
- tokenizer = AutoTokenizer.from_pretrained(model_id)
10
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
11
-
12
- @spaces.GPU # Essential for when you get the GPU grant
13
- def generate_audio(text):
14
- inputs = tokenizer(text, return_tensors="pt")
15
- # Veena generation logic
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  with torch.no_grad():
17
- output_tokens = model.generate(**inputs, max_new_tokens=512)
 
 
 
 
 
18
 
19
- # Save to a temporary file for the API to return
20
- filename = "output.wav"
21
- # (Assuming Veena's standard output processing here)
22
- # wavfile.write(filename, 24000, audio_data)
23
- return filename
 
24
 
 
25
  demo = gr.Interface(
26
- fn=generate_audio,
27
- inputs=gr.Textbox(label="Input Text"),
28
- outputs=gr.Audio(type="filepath"),
29
- api_name="predict" # This creates the /predict endpoint for n8n
 
 
 
 
30
  )
31
 
32
  demo.launch()
 
1
import hmac
import os
import tempfile

import torch
import spaces
import gradio as gr
import soundfile as sf
from snac import SNAC
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
8
+
9
# --- CONFIGURATION ---
MODEL_ID = "maya-research/Veena"  # Veena TTS language model on the HF Hub
SNAC_MODEL_ID = "hubertsiuzdak/snac_24khz"  # SNAC neural codec that decodes audio codes to waveforms
VALID_KEY = os.environ.get("MY_API_KEY") # Set this in HF Space Secrets

# Token Offsets for Veena
# NOTE(review): these IDs are presumed to match Veena's tokenizer special
# tokens — confirm against the model card if the tokenizer changes.
START_OF_SPEECH_TOKEN = 128257
END_OF_SPEECH_TOKEN = 128258
START_OF_HUMAN_TOKEN = 128259
END_OF_HUMAN_TOKEN = 128260
START_OF_AI_TOKEN = 128261
END_OF_AI_TOKEN = 128262
# First audio-code token id; everything >= this is treated as an audio code
# by decode_audio() below.
AUDIO_CODE_BASE_OFFSET = 128266

# --- MODEL LOADING ---
# 4-bit config allows it to run on smaller/shared GPUs
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# device_map="auto" lets accelerate place the quantized LLM across devices.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto"
)
# The SNAC codec is small enough to live fully on one device.
snac_model = SNAC.from_pretrained(SNAC_MODEL_ID).eval().to("cuda" if torch.cuda.is_available() else "cpu")
38
+
39
def decode_audio(tokens):
    """Turn a sequence of Veena output tokens into a mono waveform.

    Filters the audio-code tokens out of *tokens*, de-interleaves them
    into SNAC's three codebook levels, and decodes with the SNAC model.
    Returns a numpy array of samples, or None when the token stream does
    not contain a whole number of 7-token frames.
    """
    audio_tokens = [t for t in tokens if t >= AUDIO_CODE_BASE_OFFSET]
    # Veena emits audio codes in complete 7-token frames; anything else
    # means generation was truncated or produced no audio at all.
    if not audio_tokens or len(audio_tokens) % 7:
        return None

    # Frame layout: slot 0 -> level 0, slots 1-2 -> level 1, slots 3-6 -> level 2.
    # Each slot is offset by an extra 4096 per position within the frame.
    slot_to_level = (0, 1, 1, 2, 2, 2, 2)
    levels = ([], [], [])
    for frame_start in range(0, len(audio_tokens), 7):
        for slot in range(7):
            code = audio_tokens[frame_start + slot] - (AUDIO_CODE_BASE_OFFSET + slot * 4096)
            levels[slot_to_level[slot]].append(code)

    code_tensors = [torch.tensor([lvl]).to(snac_model.device) for lvl in levels]
    with torch.no_grad():
        waveform = snac_model.decode(code_tensors)
    return waveform.cpu().numpy().squeeze()
57
+
58
@spaces.GPU
def generate_veena_speech(text, api_key, speaker="kavya"):
    """Generate Veena speech for *text* and return a path to a WAV file.

    Parameters:
        text: the text to speak.
        api_key: caller-supplied key, checked against the MY_API_KEY secret.
        speaker: Veena speaker id embedded in the prompt (default "kavya").

    Returns the path of a 24 kHz WAV file, or None if decoding produced
    no audio. Raises gr.Error on an invalid (or unconfigured) API key.
    """
    # Security check for n8n. compare_digest avoids a timing side channel,
    # and an unset secret rejects everything instead of silently passing.
    if VALID_KEY is None or not hmac.compare_digest(str(api_key or ""), VALID_KEY):
        raise gr.Error("Invalid API Key")

    # Format prompt for Veena: human turn wrapping "<spk_...> text",
    # then an open AI turn for the model to fill with audio codes.
    prompt = (
        [START_OF_HUMAN_TOKEN]
        + tokenizer.encode(f"<spk_{speaker}> {text}")
        + [END_OF_HUMAN_TOKEN, START_OF_AI_TOKEN]
    )
    input_ids = torch.tensor([prompt]).to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=1024,
            do_sample=True,
            eos_token_id=[END_OF_SPEECH_TOKEN, END_OF_AI_TOKEN]
        )

    audio_data = decode_audio(output[0].tolist())
    if audio_data is None:
        return None

    # Unique temp file per request: a fixed "output.wav" would be
    # clobbered when two requests overlap on the same Space.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, audio_data, 24000)
    return output_path
82
 
83
# --- GRADIO INTERFACE ---
# api_name="predict" exposes the /predict endpoint so external clients
# (e.g. n8n) can call the function over the Gradio API.
demo = gr.Interface(
    fn=generate_veena_speech,
    inputs=[
        gr.Textbox(label="Text to Speak"),
        gr.Textbox(label="API Key", type="password"),
        gr.Dropdown(choices=["kavya", "agastya", "maitri", "vinaya"], value="kavya", label="Speaker")
    ],
    # The function returns a WAV filepath; gr.Audio accepts that directly.
    outputs=gr.Audio(label="Generated Audio"),
    api_name="predict"
)

demo.launch()