Boobs00 committed on
Commit
207ef1d
·
verified ·
1 Parent(s): 696ab70

Add 2 files

Browse files
Files changed (2) hide show
  1. index.html +209 -4
  2. prompts.txt +2 -1
index.html CHANGED
@@ -3,7 +3,7 @@
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Xortron7 - Advanced AI Companion with IDE</title>
7
  <script src="https://cdn.tailwindcss.com"></script>
8
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
9
  <script src="https://cdnjs.cloudflare.com/ajax/libs/ace/1.32.7/ace.js"></script>
@@ -156,6 +156,81 @@
156
  text-align: center;
157
  font-weight: bold;
158
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  </style>
160
  </head>
161
  <body class="bg-gray-900 text-gray-100 min-h-screen">
@@ -190,6 +265,7 @@
190
  <span class="text-purple-400"><i class="fas fa-shield-alt mr-1"></i> ENCRYPTED</span>
191
  <span class="ml-4 text-blue-400"><i class="fas fa-search mr-1"></i> WEB SEARCH ENABLED</span>
192
  <span class="ml-4 text-yellow-400"><i class="fas fa-code mr-1"></i> IDE ACTIVE</span>
 
193
  </div>
194
  </div>
195
 
@@ -212,6 +288,9 @@
212
  <button id="tools-tab" class="px-4 py-2 font-medium text-gray-500">
213
  <i class="fas fa-tools mr-2"></i>Tools
214
  </button>
 
 
 
215
  </div>
216
 
217
  <!-- Chat panel -->
@@ -223,7 +302,7 @@
223
  <div class="bot-message text-white rounded-lg p-4 max-w-xs md:max-w-md lg:max-w-lg relative">
224
  <div class="absolute -left-2 top-3 w-4 h-4 rotate-45 bg-indigo-900"></div>
225
  <p class="font-bold text-purple-300">Xortron7:</p>
226
- <p>Neural pathways initialized. Persistent memory database connected. Web search integration active. IDE environment loaded. Ready to serve, human.</p>
227
  <div class="text-xs text-gray-400 mt-2 terminal-text">[SYSTEM BOOT COMPLETE]</div>
228
  </div>
229
  </div>
@@ -260,6 +339,10 @@
260
  <input type="checkbox" id="ide-toggle" class="form-checkbox h-3 w-3 text-yellow-600" checked>
261
  <span class="ml-2">IDE</span>
262
  </label>
 
 
 
 
263
  </div>
264
  <span id="status-indicator">[SYSTEM READY]</span>
265
  </div>
@@ -380,6 +463,7 @@
380
  <p class="text-green-400">$ Python 3.10.6 | Java 17.0.6 | Node.js 18.12.1</p>
381
  <p class="text-green-400">$ Android SDK tools available</p>
382
  <p class="text-green-400">$ APK tools and decompilers ready</p>
 
383
  <p class="text-green-400">$ Ready for commands</p>
384
  </div>
385
  </div>
@@ -556,6 +640,106 @@
556
  </div>
557
  </div>
558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  <!-- System controls -->
560
  <div class="mt-6 bg-gray-800 rounded-lg p-4 cyberpunk-border">
561
  <div class="flex flex-wrap justify-between items-center">
@@ -572,6 +756,9 @@
572
  <button id="run-code" class="bg-green-700 hover:bg-green-600 text-white px-3 py-2 rounded text-sm terminal-text">
573
  <i class="fas fa-play mr-1"></i> Run Code
574
  </button>
 
 
 
575
  </div>
576
  <div class="flex items-center space-x-4">
577
  <div class="flex items-center">
@@ -581,7 +768,7 @@
581
  </div>
582
  </div>
583
  <div class="text-xs terminal-text">
584
- <span class="text-green-400">ACTIVE</span> | <span class="text-blue-400">SEARCH</span> | <span class="text-purple-400">SECURE</span> | <span class="text-yellow-400">IDE</span>
585
  </div>
586
  </div>
587
  </div>
@@ -591,7 +778,7 @@
591
  <!-- Footer -->
592
  <footer class="mt-8 py-4 px-6 bg-black text-center text-xs text-gray-500 terminal-text cyberpunk-border">
593
  <p>XORTRON CYBERNETIC LABORATORIES © 2023 | ALL SYSTEMS SECURE | SANDBOXED ENVIRONMENT</p>
594
- <p class="mt-1">WARNING: This AI system operates with persistent memory, web search, and full development capabilities.</p>
595
  </footer>
596
 
597
  <script>
@@ -640,9 +827,11 @@
640
  const webSearchToggle = document.getElementById('web-search-toggle');
641
  const memoryToggle = document.getElementById('memory-toggle');
642
  const ideToggle = document.getElementById('ide-toggle');
 
643
  const clearChatButton = document.getElementById('clear-chat');
644
  const exportChatButton = document.getElementById('export-chat');
645
  const runCodeButton = document.getElementById('run-code');
 
646
 
647
  // Tab elements
648
  const chatTab = document.getElementById('chat-tab');
@@ -650,11 +839,13 @@
650
  const searchTab = document.getElementById('search-tab');
651
  const ideTab = document.getElementById('ide-tab');
652
  const toolsTab = document.getElementById('tools-tab');
 
653
  const chatPanel = document.getElementById('chat-panel');
654
  const memoryPanel = document.getElementById('memory-panel');
655
  const searchPanel = document.getElementById('search-panel');
656
  const idePanel = document.getElementById('ide-panel');
657
  const toolsPanel = document.getElementById('tools-panel');
 
658
 
659
  // Search elements
660
  const searchQuery = document.getElementById('search-query');
@@ -667,6 +858,20 @@
667
  const terminalOutput = document.getElementById('terminal-output');
668
  const fileItems = document.querySelectorAll('.file-item');
669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  // Initialize ACE Editor
671
  editor.setTheme("ace/theme/monokai");
672
  editor.session.setMode("ace/mode/python");
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Xortron7 - Advanced AI Companion with IDE & TTS</title>
7
  <script src="https://cdn.tailwindcss.com"></script>
8
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
9
  <script src="https://cdnjs.cloudflare.com/ajax/libs/ace/1.32.7/ace.js"></script>
 
156
  text-align: center;
157
  font-weight: bold;
158
  }
159
+
160
+ .tts-controls {
161
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
162
+ border-radius: 8px;
163
+ padding: 15px;
164
+ margin-top: 15px;
165
+ }
166
+
167
+ .tts-slider {
168
+ width: 100%;
169
+ -webkit-appearance: none;
170
+ height: 8px;
171
+ border-radius: 4px;
172
+ background: #4a00e0;
173
+ outline: none;
174
+ }
175
+
176
+ .tts-slider::-webkit-slider-thumb {
177
+ -webkit-appearance: none;
178
+ appearance: none;
179
+ width: 18px;
180
+ height: 18px;
181
+ border-radius: 50%;
182
+ background: #8e2de2;
183
+ cursor: pointer;
184
+ }
185
+
186
+ .tts-audio-container {
187
+ display: flex;
188
+ align-items: center;
189
+ gap: 10px;
190
+ margin-top: 10px;
191
+ }
192
+
193
+ .tts-audio-visualizer {
194
+ flex-grow: 1;
195
+ height: 40px;
196
+ background: rgba(0, 0, 0, 0.3);
197
+ border-radius: 4px;
198
+ overflow: hidden;
199
+ position: relative;
200
+ }
201
+
202
+ .tts-audio-wave {
203
+ position: absolute;
204
+ top: 0;
205
+ left: 0;
206
+ width: 100%;
207
+ height: 100%;
208
+ background: linear-gradient(90deg, transparent, rgba(138, 43, 226, 0.5), transparent);
209
+ animation: wave 2s infinite linear;
210
+ }
211
+
212
+ @keyframes wave {
213
+ 0% { transform: translateX(-100%); }
214
+ 100% { transform: translateX(100%); }
215
+ }
216
+
217
+ .tts-voice-selector {
218
+ background: #1a1a2e;
219
+ border: 1px solid #4a00e0;
220
+ color: white;
221
+ padding: 8px;
222
+ border-radius: 4px;
223
+ width: 100%;
224
+ }
225
+
226
+ .tts-voice-option {
227
+ padding: 8px;
228
+ background: #16213e;
229
+ }
230
+
231
+ .tts-voice-option:hover {
232
+ background: #4a00e0;
233
+ }
234
  </style>
235
  </head>
236
  <body class="bg-gray-900 text-gray-100 min-h-screen">
 
265
  <span class="text-purple-400"><i class="fas fa-shield-alt mr-1"></i> ENCRYPTED</span>
266
  <span class="ml-4 text-blue-400"><i class="fas fa-search mr-1"></i> WEB SEARCH ENABLED</span>
267
  <span class="ml-4 text-yellow-400"><i class="fas fa-code mr-1"></i> IDE ACTIVE</span>
268
+ <span class="ml-4 text-pink-400"><i class="fas fa-volume-up mr-1"></i> TTS ACTIVE</span>
269
  </div>
270
  </div>
271
 
 
288
  <button id="tools-tab" class="px-4 py-2 font-medium text-gray-500">
289
  <i class="fas fa-tools mr-2"></i>Tools
290
  </button>
291
+ <button id="tts-tab" class="px-4 py-2 font-medium text-gray-500">
292
+ <i class="fas fa-volume-up mr-2"></i>TTS
293
+ </button>
294
  </div>
295
 
296
  <!-- Chat panel -->
 
302
  <div class="bot-message text-white rounded-lg p-4 max-w-xs md:max-w-md lg:max-w-lg relative">
303
  <div class="absolute -left-2 top-3 w-4 h-4 rotate-45 bg-indigo-900"></div>
304
  <p class="font-bold text-purple-300">Xortron7:</p>
305
+ <p>Neural pathways initialized. Persistent memory database connected. Web search integration active. IDE environment loaded. TTS system ready. Ready to serve, human.</p>
306
  <div class="text-xs text-gray-400 mt-2 terminal-text">[SYSTEM BOOT COMPLETE]</div>
307
  </div>
308
  </div>
 
339
  <input type="checkbox" id="ide-toggle" class="form-checkbox h-3 w-3 text-yellow-600" checked>
340
  <span class="ml-2">IDE</span>
341
  </label>
342
+ <label class="inline-flex items-center ml-4">
343
+ <input type="checkbox" id="tts-toggle" class="form-checkbox h-3 w-3 text-pink-600" checked>
344
+ <span class="ml-2">TTS</span>
345
+ </label>
346
  </div>
347
  <span id="status-indicator">[SYSTEM READY]</span>
348
  </div>
 
463
  <p class="text-green-400">$ Python 3.10.6 | Java 17.0.6 | Node.js 18.12.1</p>
464
  <p class="text-green-400">$ Android SDK tools available</p>
465
  <p class="text-green-400">$ APK tools and decompilers ready</p>
466
+ <p class="text-green-400">$ TTS system initialized</p>
467
  <p class="text-green-400">$ Ready for commands</p>
468
  </div>
469
  </div>
 
640
  </div>
641
  </div>
642
 
643
+ <!-- TTS panel (hidden by default) -->
644
+ <div id="tts-panel" class="hidden bg-gray-800 rounded-lg p-4">
645
+ <h3 class="text-lg font-bold mb-4 text-pink-300"><i class="fas fa-volume-up mr-2"></i>Text-to-Speech System</h3>
646
+
647
+ <div class="tts-controls">
648
+ <div class="mb-4">
649
+ <label class="block text-sm font-medium text-gray-300 mb-2">Voice Model</label>
650
+ <select id="tts-voice" class="tts-voice-selector">
651
+ <option value="dia-1.6b">Dia 1.6B (Default)</option>
652
+ <option value="male-1">Male Voice 1</option>
653
+ <option value="female-1">Female Voice 1</option>
654
+ <option value="robot-1">Robotic Voice</option>
655
+ <option value="custom">Custom Voice (Upload Sample)</option>
656
+ </select>
657
+ </div>
658
+
659
+ <div class="mb-4">
660
+ <label class="block text-sm font-medium text-gray-300 mb-2">Text Input</label>
661
+ <textarea id="tts-text" class="w-full bg-gray-700 text-white px-4 py-2 rounded-lg focus:outline-none focus:ring-2 focus:ring-purple-500" rows="4" placeholder="Enter text to convert to speech..."></textarea>
662
+ </div>
663
+
664
+ <div class="mb-4">
665
+ <label class="block text-sm font-medium text-gray-300 mb-2">Audio Prompt (Optional)</label>
666
+ <input type="file" id="tts-audio-prompt" class="hidden" accept="audio/*">
667
+ <div class="flex items-center gap-2">
668
+ <button id="tts-upload-btn" class="bg-purple-600 hover:bg-purple-700 text-white px-4 py-2 rounded-lg">
669
+ <i class="fas fa-upload mr-2"></i>Upload Audio
670
+ </button>
671
+ <span id="tts-audio-filename" class="text-gray-400 text-sm">No file selected</span>
672
+ </div>
673
+ </div>
674
+
675
+ <div class="grid grid-cols-1 md:grid-cols-2 gap-4 mb-4">
676
+ <div>
677
+ <label class="block text-sm font-medium text-gray-300 mb-2">Speed Factor</label>
678
+ <input type="range" id="tts-speed" min="0.8" max="1.2" step="0.05" value="0.94" class="tts-slider">
679
+ <div class="flex justify-between text-xs text-gray-400 mt-1">
680
+ <span>0.8x</span>
681
+ <span>1.0x</span>
682
+ <span>1.2x</span>
683
+ </div>
684
+ </div>
685
+
686
+ <div>
687
+ <label class="block text-sm font-medium text-gray-300 mb-2">Temperature</label>
688
+ <input type="range" id="tts-temperature" min="1.0" max="1.5" step="0.05" value="1.3" class="tts-slider">
689
+ <div class="flex justify-between text-xs text-gray-400 mt-1">
690
+ <span>1.0</span>
691
+ <span>1.25</span>
692
+ <span>1.5</span>
693
+ </div>
694
+ </div>
695
+ </div>
696
+
697
+ <div class="grid grid-cols-1 md:grid-cols-2 gap-4 mb-4">
698
+ <div>
699
+ <label class="block text-sm font-medium text-gray-300 mb-2">CFG Scale</label>
700
+ <input type="range" id="tts-cfg-scale" min="1.0" max="5.0" step="0.1" value="3.0" class="tts-slider">
701
+ <div class="flex justify-between text-xs text-gray-400 mt-1">
702
+ <span>1.0</span>
703
+ <span>3.0</span>
704
+ <span>5.0</span>
705
+ </div>
706
+ </div>
707
+
708
+ <div>
709
+ <label class="block text-sm font-medium text-gray-300 mb-2">Top P</label>
710
+ <input type="range" id="tts-top-p" min="0.8" max="1.0" step="0.01" value="0.95" class="tts-slider">
711
+ <div class="flex justify-between text-xs text-gray-400 mt-1">
712
+ <span>0.8</span>
713
+ <span>0.9</span>
714
+ <span>1.0</span>
715
+ </div>
716
+ </div>
717
+ </div>
718
+
719
+ <button id="tts-generate-btn" class="w-full bg-pink-600 hover:bg-pink-700 text-white px-4 py-3 rounded-lg font-bold transition-all duration-200 glow-button">
720
+ <i class="fas fa-play mr-2"></i> Generate Speech
721
+ </button>
722
+
723
+ <div class="tts-audio-container mt-4">
724
+ <div class="tts-audio-visualizer">
725
+ <div id="tts-wave" class="tts-audio-wave" style="display: none;"></div>
726
+ </div>
727
+ <audio id="tts-audio-player" controls class="hidden"></audio>
728
+ </div>
729
+ </div>
730
+
731
+ <div class="mt-6 bg-gray-700 rounded-lg p-4">
732
+ <h4 class="font-bold text-purple-300 mb-2"><i class="fas fa-info-circle mr-2"></i>TTS System Info</h4>
733
+ <div class="terminal-text text-sm">
734
+ <p>Model: Dia-1.6B (Nari Labs)</p>
735
+ <p>Sample Rate: 44.1kHz</p>
736
+ <p>Max Tokens: 3072</p>
737
+ <p>Status: <span class="text-green-400">Ready</span></p>
738
+ <p class="mt-2 text-gray-400">Supports multi-speaker dialogue generation with optional audio prompts for voice matching.</p>
739
+ </div>
740
+ </div>
741
+ </div>
742
+
743
  <!-- System controls -->
744
  <div class="mt-6 bg-gray-800 rounded-lg p-4 cyberpunk-border">
745
  <div class="flex flex-wrap justify-between items-center">
 
756
  <button id="run-code" class="bg-green-700 hover:bg-green-600 text-white px-3 py-2 rounded text-sm terminal-text">
757
  <i class="fas fa-play mr-1"></i> Run Code
758
  </button>
759
+ <button id="tts-chat" class="bg-pink-700 hover:bg-pink-600 text-white px-3 py-2 rounded text-sm terminal-text">
760
+ <i class="fas fa-volume-up mr-1"></i> TTS Chat
761
+ </button>
762
  </div>
763
  <div class="flex items-center space-x-4">
764
  <div class="flex items-center">
 
768
  </div>
769
  </div>
770
  <div class="text-xs terminal-text">
771
+ <span class="text-green-400">ACTIVE</span> | <span class="text-blue-400">SEARCH</span> | <span class="text-purple-400">SECURE</span> | <span class="text-yellow-400">IDE</span> | <span class="text-pink-400">TTS</span>
772
  </div>
773
  </div>
774
  </div>
 
778
  <!-- Footer -->
779
  <footer class="mt-8 py-4 px-6 bg-black text-center text-xs text-gray-500 terminal-text cyberpunk-border">
780
  <p>XORTRON CYBERNETIC LABORATORIES © 2023 | ALL SYSTEMS SECURE | SANDBOXED ENVIRONMENT</p>
781
+ <p class="mt-1">WARNING: This AI system operates with persistent memory, web search, full development capabilities, and advanced TTS.</p>
782
  </footer>
783
 
784
  <script>
 
827
  const webSearchToggle = document.getElementById('web-search-toggle');
828
  const memoryToggle = document.getElementById('memory-toggle');
829
  const ideToggle = document.getElementById('ide-toggle');
830
+ const ttsToggle = document.getElementById('tts-toggle');
831
  const clearChatButton = document.getElementById('clear-chat');
832
  const exportChatButton = document.getElementById('export-chat');
833
  const runCodeButton = document.getElementById('run-code');
834
+ const ttsChatButton = document.getElementById('tts-chat');
835
 
836
  // Tab elements
837
  const chatTab = document.getElementById('chat-tab');
 
839
  const searchTab = document.getElementById('search-tab');
840
  const ideTab = document.getElementById('ide-tab');
841
  const toolsTab = document.getElementById('tools-tab');
842
+ const ttsTab = document.getElementById('tts-tab');
843
  const chatPanel = document.getElementById('chat-panel');
844
  const memoryPanel = document.getElementById('memory-panel');
845
  const searchPanel = document.getElementById('search-panel');
846
  const idePanel = document.getElementById('ide-panel');
847
  const toolsPanel = document.getElementById('tools-panel');
848
+ const ttsPanel = document.getElementById('tts-panel');
849
 
850
  // Search elements
851
  const searchQuery = document.getElementById('search-query');
 
858
  const terminalOutput = document.getElementById('terminal-output');
859
  const fileItems = document.querySelectorAll('.file-item');
860
 
861
+ // TTS elements
862
+ const ttsVoice = document.getElementById('tts-voice');
863
+ const ttsText = document.getElementById('tts-text');
864
+ const ttsAudioPrompt = document.getElementById('tts-audio-prompt');
865
+ const ttsUploadBtn = document.getElementById('tts-upload-btn');
866
+ const ttsAudioFilename = document.getElementById('tts-audio-filename');
867
+ const ttsSpeed = document.getElementById('tts-speed');
868
+ const ttsTemperature = document.getElementById('tts-temperature');
869
+ const ttsCfgScale = document.getElementById('tts-cfg-scale');
870
+ const ttsTopP = document.getElementById('tts-top-p');
871
+ const ttsGenerateBtn = document.getElementById('tts-generate-btn');
872
+ const ttsWave = document.getElementById('tts-wave');
873
+ const ttsAudioPlayer = document.getElementById('tts-audio-player');
874
+
875
  // Initialize ACE Editor
876
  editor.setTheme("ace/theme/monokai");
877
  editor.session.setMode("ace/mode/python");
prompts.txt CHANGED
@@ -1,2 +1,3 @@
1
  Please include web search functionality with a duckduckgo.com integration and let's make this fully functional, operational, and a hundred percent working primarily, and most importantly, with the AI properly installed and responding and set up with long term persistent memory with the database, API and server set up through the same integration.
2
- Let's really upscale this and include an embedded, built-in, pre-installed IDE, sandboxed and fully loaded with an advanced VS Code setup for security systems management, cybersecurity, NSFW and adult content, as well as Android app development and APK modification with Kotlin, Python, React, JavaScript, HTML, and CSS all pre-installed, with any other essential packages included, along with
 
 
1
  Please include web search functionality with a duckduckgo.com integration and let's make this fully functional, operational, and a hundred percent working primarily, and most importantly, with the AI properly installed and responding and set up with long term persistent memory with the database, API and server set up through the same integration.
2
+ Let's really upscale this and include an embedded, built-in, pre-installed IDE, sandboxed and fully loaded with an advanced VS Code setup for security systems management, cybersecurity, NSFW and adult content, as well as Android app development and APK modification with Kotlin, Python, React, JavaScript, HTML, and CSS all pre-installed, with any other essential packages included, along with
3
+ Now add TTS to the setup by combining the code we already generated with this gradio space code:raw Copy download link history blame contribute delete 15.3 kB import tempfile import time from pathlib import Path from typing import Optional, Tuple import spaces import gradio as gr import numpy as np import soundfile as sf import torch from dia.model import Dia # Load Nari model and config print("Loading Nari model...") try: # Use the function from inference.py model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float32") except Exception as e: print(f"Error loading Nari model: {e}") raise @spaces.GPU def run_inference( text_input: str, audio_prompt_input: Optional[Tuple[int, np.ndarray]], max_new_tokens: int, cfg_scale: float, temperature: float, top_p: float, cfg_filter_top_k: int, speed_factor: float, ): """ Runs Nari inference using the globally loaded model and provided inputs. Uses temporary files for text and audio prompt compatibility with inference.generate. """ # global model, device # Access global model, config, device if not text_input or text_input.isspace(): raise gr.Error("Text input cannot be empty.") temp_txt_file_path = None temp_audio_prompt_path = None output_audio = (44100, np.zeros(1, dtype=np.float32)) try: prompt_path_for_generate = None if audio_prompt_input is not None: sr, audio_data = audio_prompt_input # Check if audio_data is valid if ( audio_data is None or audio_data.size == 0 or audio_data.max() == 0 ): # Check for silence/empty gr.Warning("Audio prompt seems empty or silent, ignoring prompt.") else: # Save prompt audio to a temporary WAV file with tempfile.NamedTemporaryFile( mode="wb", suffix=".wav", delete=False ) as f_audio: temp_audio_prompt_path = f_audio.name # Store path for cleanup # Basic audio preprocessing for consistency # Convert to float32 in [-1, 1] range if integer type if np.issubdtype(audio_data.dtype, np.integer): max_val = np.iinfo(audio_data.dtype).max audio_data = audio_data.astype(np.float32) / 
max_val elif not np.issubdtype(audio_data.dtype, np.floating): gr.Warning( f"Unsupported audio prompt dtype {audio_data.dtype}, attempting conversion." ) # Attempt conversion, might fail for complex types try: audio_data = audio_data.astype(np.float32) except Exception as conv_e: raise gr.Error( f"Failed to convert audio prompt to float32: {conv_e}" ) # Ensure mono (average channels if stereo) if audio_data.ndim > 1: if audio_data.shape[0] == 2: # Assume (2, N) audio_data = np.mean(audio_data, axis=0) elif audio_data.shape[1] == 2: # Assume (N, 2) audio_data = np.mean(audio_data, axis=1) else: gr.Warning( f"Audio prompt has unexpected shape {audio_data.shape}, taking first channel/axis." ) audio_data = ( audio_data[0] if audio_data.shape[0] < audio_data.shape[1] else audio_data[:, 0] ) audio_data = np.ascontiguousarray( audio_data ) # Ensure contiguous after slicing/mean # Write using soundfile try: sf.write( temp_audio_prompt_path, audio_data, sr, subtype="FLOAT" ) # Explicitly use FLOAT subtype prompt_path_for_generate = temp_audio_prompt_path print( f"Created temporary audio prompt file: {temp_audio_prompt_path} (orig sr: {sr})" ) except Exception as write_e: print(f"Error writing temporary audio file: {write_e}") raise gr.Error(f"Failed to save audio prompt: {write_e}") # 3. Run Generation start_time = time.time() # Use torch.inference_mode() context manager for the generation call with torch.inference_mode(): output_audio_np = model.generate( text_input, max_tokens=max_new_tokens, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, cfg_filter_top_k=cfg_filter_top_k, # Pass the value here use_torch_compile=False, # Keep False for Gradio stability audio_prompt=prompt_path_for_generate, ) end_time = time.time() print(f"Generation finished in {end_time - start_time:.2f} seconds.") # 4. 
Convert Codes to Audio if output_audio_np is not None: # Get sample rate from the loaded DAC model output_sr = 44100 # --- Slow down audio --- original_len = len(output_audio_np) # Ensure speed_factor is positive and not excessively small/large to avoid issues speed_factor = max(0.1, min(speed_factor, 5.0)) target_len = int( original_len / speed_factor ) # Target length based on speed_factor if ( target_len != original_len and target_len > 0 ): # Only interpolate if length changes and is valid x_original = np.arange(original_len) x_resampled = np.linspace(0, original_len - 1, target_len) resampled_audio_np = np.interp(x_resampled, x_original, output_audio_np) output_audio = ( output_sr, resampled_audio_np.astype(np.float32), ) # Use resampled audio print( f"Resampled audio from {original_len} to {target_len} samples for {speed_factor:.2f}x speed." ) else: output_audio = ( output_sr, output_audio_np, ) # Keep original if calculation fails or no change print(f"Skipping audio speed adjustment (factor: {speed_factor:.2f}).") # --- End slowdown --- print( f"Audio conversion successful. Final shape: {output_audio[1].shape}, Sample Rate: {output_sr}" ) # Explicitly convert to int16 to prevent Gradio warning if ( output_audio[1].dtype == np.float32 or output_audio[1].dtype == np.float64 ): audio_for_gradio = np.clip(output_audio[1], -1.0, 1.0) audio_for_gradio = (audio_for_gradio * 32767).astype(np.int16) output_audio = (output_sr, audio_for_gradio) print("Converted audio to int16 for Gradio output.") else: print("\nGeneration finished, but no valid tokens were produced.") # Return default silence gr.Warning("Generation produced no output.") except Exception as e: print(f"Error during inference: {e}") import traceback traceback.print_exc() # Re-raise as Gradio error to display nicely in the UI raise gr.Error(f"Inference failed: {e}") finally: # 5. 
Cleanup Temporary Files defensively if temp_txt_file_path and Path(temp_txt_file_path).exists(): try: Path(temp_txt_file_path).unlink() print(f"Deleted temporary text file: {temp_txt_file_path}") except OSError as e: print( f"Warning: Error deleting temporary text file {temp_txt_file_path}: {e}" ) if temp_audio_prompt_path and Path(temp_audio_prompt_path).exists(): try: Path(temp_audio_prompt_path).unlink() print(f"Deleted temporary audio prompt file: {temp_audio_prompt_path}") except OSError as e: print( f"Warning: Error deleting temporary audio prompt file {temp_audio_prompt_path}: {e}" ) return output_audio # --- Create Gradio Interface --- css = """ #col-container {max-width: 90%; margin-left: auto; margin-right: auto;} """ # Attempt to load default text from example.txt default_text = "[S1] Dia is an open weights text to dialogue model. \n[S2] You get full control over scripts and voices. \n[S1] Wow. Amazing. (laughs) \n[S2] Try it now on Git hub or Hugging Face." example_txt_path = Path("./example.txt") if example_txt_path.exists(): try: default_text = example_txt_path.read_text(encoding="utf-8").strip() if not default_text: # Handle empty example file default_text = "Example text file was empty." 
except Exception as e: print(f"Warning: Could not read example.txt: {e}") # Build Gradio UI with gr.Blocks(css=css) as demo: gr.Markdown("# Nari Text-to-Speech Synthesis") with gr.Row(equal_height=False): with gr.Column(scale=1): text_input = gr.Textbox( label="Input Text", placeholder="Enter text here...", value=default_text, lines=5, # Increased lines ) audio_prompt_input = gr.Audio( label="Audio Prompt (Optional)", show_label=True, sources=["upload", "microphone"], type="numpy", ) with gr.Accordion("Generation Parameters", open=False): max_new_tokens = gr.Slider( label="Max New Tokens (Audio Length)", minimum=860, maximum=3072, value=model.config.data.audio_length, # Use config default if available, else fallback step=50, info="Controls the maximum length of the generated audio (more tokens = longer audio).", ) cfg_scale = gr.Slider( label="CFG Scale (Guidance Strength)", minimum=1.0, maximum=5.0, value=3.0, # Default from inference.py step=0.1, info="Higher values increase adherence to the text prompt.", ) temperature = gr.Slider( label="Temperature (Randomness)", minimum=1.0, maximum=1.5, value=1.3, # Default from inference.py step=0.05, info="Lower values make the output more deterministic, higher values increase randomness.", ) top_p = gr.Slider( label="Top P (Nucleus Sampling)", minimum=0.80, maximum=1.0, value=0.95, # Default from inference.py step=0.01, info="Filters vocabulary to the most likely tokens cumulatively reaching probability P.", ) cfg_filter_top_k = gr.Slider( label="CFG Filter Top K", minimum=15, maximum=50, value=30, step=1, info="Top k filter for CFG guidance.", ) speed_factor_slider = gr.Slider( label="Speed Factor", minimum=0.8, maximum=1.0, value=0.94, step=0.02, info="Adjusts the speed of the generated audio (1.0 = original speed).", ) run_button = gr.Button("Generate Audio", variant="primary") with gr.Column(scale=1): audio_output = gr.Audio( label="Generated Audio", type="numpy", autoplay=False, ) # Link button click to function 
run_button.click( fn=run_inference, inputs=[ text_input, audio_prompt_input, max_new_tokens, cfg_scale, temperature, top_p, cfg_filter_top_k, speed_factor_slider, ], outputs=[audio_output], # Add status_output here if using it api_name="generate_audio", ) # Add examples (ensure the prompt path is correct or remove it if example file doesn't exist) example_prompt_path = "./example_prompt.mp3" # Adjust if needed examples_list = [ [ "[S1] Oh fire! Oh my goodness! What's the procedure? What to we do people? The smoke could be coming through an air duct! \n[S2] Oh my god! Okay.. it's happening. Everybody stay calm! \n[S1] What's the procedure... \n[S2] Everybody stay fucking calm!!!... Everybody fucking calm down!!!!! \n[S1] No! No! If you touch the handle, if its hot there might be a fire down the hallway! ", None, 3072, 3.0, 1.3, 0.95, 35, 0.94, ], [ "[S1] Open weights text to dialogue model. \n[S2] You get full control over scripts and voices. \n[S1] I'm biased, but I think we clearly won. \n[S2] Hard to disagree. (laughs) \n[S1] Thanks for listening to this demo. \n[S2] Try it now on Git hub and Hugging Face. \n[S1] If you liked our model, please give us a star and share to your friends. \n[S2] This was Nari Labs.", example_prompt_path if Path(example_prompt_path).exists() else None, 3072, 3.0, 1.3, 0.95, 35, 0.94, ], ] if examples_list: gr.Examples( examples=examples_list, inputs=[ text_input, audio_prompt_input, max_new_tokens, cfg_scale, temperature, top_p, cfg_filter_top_k, speed_factor_slider, ], outputs=[audio_output], fn=run_inference, cache_examples=False, label="Examples (Click to Run)", ) else: gr.Markdown("_(No examples configured or example prompt file missing)_") # --- Launch the App --- if __name__ == "__main__": print("Launching Gradio interface...") # set `GRADIO_SERVER_NAME`, `GRADIO_SERVER_PORT` env vars to override default values # use `GRADIO_SERVER_NAME=0.0.0.0` for Docker demo.launch()