ButterM40 committed on
Commit 7e68852 · 1 Parent(s): 4523cf6

Deploy Roleplay Chat Box - optimized version without large files

Files changed (37)
  1. LICENSE +21 -0
  2. README.md +82 -10
  3. app.py +239 -59
  4. backend/__pycache__/config.cpython-310.pyc +0 -0
  5. backend/__pycache__/config.cpython-311.pyc +0 -0
  6. backend/__pycache__/main.cpython-310.pyc +0 -0
  7. backend/__pycache__/main.cpython-311.pyc +0 -0
  8. backend/__pycache__/main_qwen3.cpython-310.pyc +0 -0
  9. backend/__pycache__/main_qwen3.cpython-311.pyc +0 -0
  10. backend/__pycache__/main_simple.cpython-310.pyc +0 -0
  11. backend/config.py +41 -0
  12. backend/models/__init__.py +1 -0
  13. backend/models/__pycache__/__init__.cpython-310.pyc +0 -0
  14. backend/models/__pycache__/__init__.cpython-311.pyc +0 -0
  15. backend/models/__pycache__/character_manager.cpython-310.pyc +0 -0
  16. backend/models/__pycache__/character_manager.cpython-311.pyc +0 -0
  17. backend/models/__pycache__/enhanced_voice_synthesizer.cpython-310.pyc +0 -0
  18. backend/models/__pycache__/simple_voice_synthesizer.cpython-310.pyc +0 -0
  19. backend/models/__pycache__/voice_synthesizer.cpython-310.pyc +0 -0
  20. backend/models/__pycache__/voice_synthesizer.cpython-311.pyc +0 -0
  21. backend/models/character_manager.py +440 -0
  22. backend/models/enhanced_voice_synthesizer.py +326 -0
  23. backend/models/optimized_character_manager.py +280 -0
  24. backend/models/simple_voice_synthesizer.py +405 -0
  25. backend/models/voice_synthesizer.py +285 -0
  26. datasets/jinx_dataset.json +50 -0
  27. datasets/moses_dataset.json +58 -0
  28. datasets/samsung_employee_dataset.json +50 -0
  29. lora_adapters/adapter_config.json +41 -0
  30. lora_adapters/adapter_model.safetensors +3 -0
  31. lora_adapters/jinx/adapter_config.json +41 -0
  32. lora_adapters/jinx/adapter_model.safetensors +3 -0
  33. lora_adapters/moses/adapter_config.json +41 -0
  34. lora_adapters/moses/adapter_model.safetensors +3 -0
  35. lora_adapters/samsung_employee/adapter_config.json +41 -0
  36. lora_adapters/samsung_employee/adapter_model.safetensors +3 -0
  37. requirements.txt +33 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Roleplay Chat Box
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,16 +1,88 @@
  ---
- title: Roleplay Chat Box
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
  sdk: gradio
- sdk_version: 5.42.0
- app_file: app.py
  pinned: false
- hf_oauth: true
- hf_oauth_scopes:
- - inference-api
  license: mit
  ---

- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
  ---
+ title: Roleplay Chat Box 🎭
+ emoji: 🎭
+ colorFrom: purple
+ colorTo: pink
  sdk: gradio
+ sdk_version: 4.44.1
+ app_file: hf_app.py
  pinned: false
  license: mit
+ short_description: Multi-character AI roleplay chatbot with LoRA-adapted personalities
+ tags:
+ - conversational-ai
+ - roleplay
+ - lora
+ - character-ai
+ - chatbot
+ - transformers
+ - qwen
+ models:
+ - Qwen/Qwen2.5-1.5B
  ---

+ # Roleplay Chat Box 🎭
+
+ An intelligent conversational AI system featuring multiple character personalities powered by LoRA (Low-Rank Adaptation) fine-tuned language models. Each character has been trained with a specific dataset to maintain a unique personality, speaking patterns, and areas of expertise.
+
+ ## Features
+
+ - **Multi-Character Support**: Switch between different character personalities using LoRA adapters
+ - **Voice Synthesis**: Text-to-speech with character-specific voices using Microsoft's VibeVoice
+ - **Modern Interface**: ChatGPT-style web interface with a character-selection sidebar
+ - **Real-time Chat**: WebSocket-based real-time communication
+ - **Audio Toggle**: Optional voice output for responses
+
+ ## Architecture
+
+ ```
+ ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+ │    Frontend     │    │   Backend API   │    │    ML Models    │
+ │  (React/HTML)   │◄──►│    (FastAPI)    │◄──►│   Base + LoRA   │
+ │                 │    │                 │    │   + VibeVoice   │
+ │ - Chat UI       │    │ - Model Loading │    │                 │
+ │ - Character     │    │ - LoRA Switching│    │ - Qwen2 Base    │
+ │   Selection     │    │ - TTS Service   │    │ - Character     │
+ │ - Audio Toggle  │    │ - WebSocket     │    │   Adapters      │
+ └─────────────────┘    └─────────────────┘    └─────────────────┘
+ ```
+
50
+ ## Characters
51
+
52
+ 1. **Moses** - Biblical prophet with wisdom and authority
53
+ 2. **Samsung Employee** - Tech-savvy corporate representative
54
+ 3. **Jinx** - Complex, chaotic character from Arcane
55
+
56
+ ## Setup
57
+
58
+ 1. Install dependencies:
59
+ ```bash
60
+ pip install -r requirements.txt
61
+ ```
62
+
63
+ 2. Download models (see Resources section)
64
+
65
+ 3. Start backend:
66
+ ```bash
67
+ cd backend
68
+ python main.py
69
+ ```
70
+
71
+ 4. Start frontend:
72
+ ```bash
73
+ cd frontend
74
+ # Instructions for your chosen frontend framework
75
+ ```
76
+
77
+ ## Requirements
78
+
79
+ - Python 3.8+
80
+ - CUDA-compatible GPU (recommended)
81
+ - ~8GB RAM minimum
82
+ - ~20GB storage for models
83
+
84
+ ## Models Used
85
+
86
+ - Base: Qwen2 (via Polarium/qwen2-yoda-lora)
87
+ - Voice: Microsoft VibeVoice-1.5B
88
+ - Custom LoRA adapters for each character
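*Editor's note:* as a minimal sketch of how the character adapters in `lora_adapters/` can be applied, assuming the `Qwen/Qwen2.5-1.5B` base listed in the Space metadata and the `lora_adapters/jinx` directory from this commit (sampling settings are illustrative, not the app's exact values):

```python
# Minimal sketch: applying one character's LoRA adapter to the base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Each character directory holds adapter_config.json + adapter_model.safetensors.
model = PeftModel.from_pretrained(base, "lora_adapters/jinx", adapter_name="jinx")

prompt = "Human: I need help with a creative project!\n\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=80, do_sample=True, temperature=0.7)
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```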
app.py CHANGED
@@ -1,70 +1,250 @@
  import gradio as gr
- from huggingface_hub import InferenceClient


- def respond(
-     message,
-     history: list[dict[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
-     hf_token: gr.OAuthToken,
- ):
-     """
-     For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-     """
-     client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
-     messages = [{"role": "system", "content": system_message}]
-
-     messages.extend(history)
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         choices = message.choices
-         token = ""
-         if len(choices) and choices[0].delta.content:
-             token = choices[0].delta.content
-
-         response += token
-         yield response


- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- chatbot = gr.ChatInterface(
-     respond,
-     type="messages",
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
- with gr.Blocks() as demo:
-     with gr.Sidebar():
-         gr.LoginButton()
-     chatbot.render()
-

  if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
+ import os
+ import sys
+ import asyncio
+ import logging
+ from typing import List, Tuple, Optional
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Add backend to path
+ backend_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'backend')
+ sys.path.insert(0, backend_path)
+
+ # Global character manager
+ character_manager = None
+ models_loaded = False
+
+ def initialize_models():
+     """Initialize the character manager"""
+     global character_manager, models_loaded
+
+     if models_loaded:
+         return "✅ Models already loaded!"
+
+     try:
+         from backend.models.character_manager import CharacterManager
+
+         character_manager = CharacterManager()
+
+         # Initialize synchronously
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+         loop.run_until_complete(character_manager.initialize())
+
+         models_loaded = True
+         logger.info("✅ Character models initialized successfully!")
+         return "✅ Models loaded successfully!"
+
+     except Exception as e:
+         logger.error(f"❌ Failed to initialize models: {e}")
+         return f"❌ Failed to load models: {str(e)}"
+
+ def get_character_info():
+     """Get character information for display"""
+     return {
+         "moses": {
+             "name": "Moses",
+             "description": "📚 Wise biblical figure offering guidance and wisdom",
+             "avatar": "👨‍🏫"
+         },
+         "samsung_employee": {
+             "name": "Samsung Employee",
+             "description": "💼 Professional tech support specialist",
+             "avatar": "👨‍💼"
+         },
+         "jinx": {
+             "name": "Jinx",
+             "description": "🎭 Chaotic and energetic character from Arcane",
+             "avatar": "🔮"
+         }
+     }
+
+ def chat_with_character(message: str, character_id: str, history: List[Tuple[str, str]]) -> Tuple[List[Tuple[str, str]], str]:
+     """Generate character response and update chat history"""
+     global character_manager, models_loaded
+
+     # Initialize if needed
+     if not models_loaded:
+         init_result = initialize_models()
+         if "Failed" in init_result:
+             return history + [(message, init_result)], ""
+
+     if not message.strip():
+         return history, ""
+
+     try:
+         if character_manager is None:
+             return history + [(message, "❌ Character manager not initialized")], ""
+
+         # Generate response (keyword names match CharacterManager.generate_response)
+         response = character_manager.generate_response(
+             character_id=character_id,
+             user_message=message
+         )
+
+         # Update history
+         new_history = history + [(message, response)]
+         return new_history, ""
+
+     except Exception as e:
+         logger.error(f"Error generating response: {e}")
+         error_response = f"❌ Error: {str(e)}"
+         return history + [(message, error_response)], ""
+
+ def get_character_display_html(character_id: str) -> str:
+     """Generate HTML for character display"""
+     char_info = get_character_info()
+     if character_id not in char_info:
+         return "<div>Character not found</div>"
+
+     info = char_info[character_id]
+     return f"""
+     <div style="text-align: center; padding: 20px; background: linear-gradient(45deg, #667eea 0%, #764ba2 100%);
+                 color: white; border-radius: 15px; margin: 10px;">
+         <div style="font-size: 4rem; margin-bottom: 10px;">{info['avatar']}</div>
+         <h2 style="margin: 10px 0; color: white;">{info['name']}</h2>
+         <p style="margin: 0; opacity: 0.9; font-size: 1.1rem;">{info['description']}</p>
+     </div>
+     """
+
+ def create_interface():
+     """Create the main Gradio interface"""
+
+     # Custom CSS
+     custom_css = """
+     .gradio-container {
+         max-width: 1200px !important;
+     }
+     .character-display {
+         min-height: 200px;
+     }
+     .chat-container {
+         height: 500px;
+     }
+     """
+
+     with gr.Blocks(
+         title="🎭 Roleplay Chat Box",
+         theme=gr.themes.Soft(primary_hue="purple"),
+         css=custom_css
+     ) as demo:
+
+         gr.Markdown("# 🎭 Roleplay Chat Box")
+         gr.Markdown("### Chat with AI characters, each with unique personalities!")
+
+         with gr.Row():
+             # Character selection column
+             with gr.Column(scale=1):
+                 gr.Markdown("## 👥 Choose Character")
+
+                 character_dropdown = gr.Dropdown(
+                     choices=[
+                         ("👨‍🏫 Moses", "moses"),
+                         ("👨‍💼 Samsung Employee", "samsung_employee"),
+                         ("🔮 Jinx", "jinx")
+                     ],
+                     value="moses",
+                     label="Select Character",
+                     interactive=True
+                 )
+
+                 # Character info display
+                 character_display = gr.HTML(
+                     value=get_character_display_html("moses"),
+                     elem_classes=["character-display"]
+                 )
+
+                 # Update character display when selection changes
+                 character_dropdown.change(
+                     fn=get_character_display_html,
+                     inputs=[character_dropdown],
+                     outputs=[character_display]
+                 )
+
+             # Chat column
+             with gr.Column(scale=2):
+                 gr.Markdown("## 💬 Chat")
+
+                 chatbot = gr.Chatbot(
+                     height=500,
+                     show_label=False,
+                     elem_classes=["chat-container"]
+                 )
+
+                 with gr.Row():
+                     msg_input = gr.Textbox(
+                         placeholder="Type your message here...",
+                         show_label=False,
+                         scale=4,
+                         lines=2
+                     )
+
+                     with gr.Column(scale=1):
+                         send_btn = gr.Button("Send 📨", variant="primary")
+                         clear_btn = gr.Button("Clear 🗑️")
+
+         # Status section
+         with gr.Row():
+             status_display = gr.Textbox(
+                 value="Click 'Initialize Models' to start chatting!",
+                 label="Status",
+                 interactive=False
+             )
+             init_btn = gr.Button("Initialize Models 🚀", variant="secondary")
+
+         # Event handlers
+         def send_message(message, character_id, history):
+             return chat_with_character(message, character_id, history)
+
+         def clear_chat():
+             return [], "Chat cleared!"
+
+         def init_models_handler():
+             return initialize_models()
+
+         # Button clicks
+         send_btn.click(
+             fn=send_message,
+             inputs=[msg_input, character_dropdown, chatbot],
+             outputs=[chatbot, msg_input]
+         )
+
+         msg_input.submit(
+             fn=send_message,
+             inputs=[msg_input, character_dropdown, chatbot],
+             outputs=[chatbot, msg_input]
+         )
+
+         clear_btn.click(
+             fn=clear_chat,
+             outputs=[chatbot, status_display]
+         )
+
+         init_btn.click(
+             fn=init_models_handler,
+             outputs=[status_display]
+         )
+
+         # Example interactions
+         gr.Markdown("""
+         ### 💡 Example Conversations
+         - **Moses**: "What is the meaning of wisdom?"
+         - **Samsung Employee**: "Tell me about the latest Samsung phones"
+         - **Jinx**: "I need help with a creative project!"
+         """)
+
+     return demo

  if __name__ == "__main__":
+     # Create and launch the interface
+     demo = create_interface()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         show_error=True,
+         share=False
+     )
backend/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.8 kB).

backend/__pycache__/config.cpython-311.pyc ADDED
Binary file (3.15 kB).

backend/__pycache__/main.cpython-310.pyc ADDED
Binary file (5.93 kB).

backend/__pycache__/main.cpython-311.pyc ADDED
Binary file (11.4 kB).

backend/__pycache__/main_qwen3.cpython-310.pyc ADDED
Binary file (6.89 kB).

backend/__pycache__/main_qwen3.cpython-311.pyc ADDED
Binary file (12.9 kB).

backend/__pycache__/main_simple.cpython-310.pyc ADDED
Binary file (9.39 kB).
backend/config.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ from dotenv import load_dotenv
+ from pydantic_settings import BaseSettings
+
+ load_dotenv()
+
+ # Get project root directory (parent of backend)
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+ class Settings(BaseSettings):
+     # Model Configuration - use absolute paths based on project root
+     MODEL_PATH: str = os.path.join(PROJECT_ROOT, "models")
+     LORA_ADAPTERS_PATH: str = os.path.join(PROJECT_ROOT, "lora_adapters")
+     VOICE_MODEL_PATH: str = os.path.join(PROJECT_ROOT, "models", "vibevoice")
+
+     # API Configuration
+     API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
+     API_PORT: int = int(os.getenv("API_PORT", "8000"))
+     DEBUG: bool = os.getenv("DEBUG", "True").lower() == "true"
+
+     # Model Configuration
+     BASE_MODEL: str = os.getenv("BASE_MODEL", "Qwen/Qwen3-0.6B")
+     DEVICE: str = os.getenv("DEVICE", "cuda")
+     MAX_LENGTH: int = int(os.getenv("MAX_LENGTH", "2048"))
+     TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.7"))
+     TOP_P: float = float(os.getenv("TOP_P", "0.9"))
+
+     # Audio Configuration
+     SAMPLE_RATE: int = int(os.getenv("SAMPLE_RATE", "22050"))
+     AUDIO_FORMAT: str = os.getenv("AUDIO_FORMAT", "wav")
+     ENABLE_VOICE: bool = os.getenv("ENABLE_VOICE", "False").lower() == "true"  # Disabled by default for easier deployment
+
+     # Character Configuration
+     DEFAULT_CHARACTER: str = os.getenv("DEFAULT_CHARACTER", "moses")
+
+     @property
+     def AVAILABLE_CHARACTERS(self) -> list:
+         characters_str = os.getenv("AVAILABLE_CHARACTERS", "moses,samsung_employee,jinx")
+         return [char.strip() for char in characters_str.split(",")]
+
+ settings = Settings()
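*Editor's note:* every field above reads its default from the environment when the module is imported, so a deployment can override the configuration without code changes. A minimal usage sketch, assuming the environment is set before the first import (the values are illustrative):

```python
# Illustrative: override config via environment variables before first import.
import os

os.environ["BASE_MODEL"] = "Qwen/Qwen2.5-0.5B-Instruct"  # any HF model id
os.environ["DEVICE"] = "cpu"
os.environ["ENABLE_VOICE"] = "true"

from config import settings  # module-level Settings() picks these up at import

assert settings.DEVICE == "cpu"
print(settings.BASE_MODEL, settings.AVAILABLE_CHARACTERS)
```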
backend/models/__init__.py ADDED
@@ -0,0 +1 @@
+ # Empty __init__.py file to make this a Python package
backend/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (164 Bytes).

backend/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (180 Bytes).

backend/models/__pycache__/character_manager.cpython-310.pyc ADDED
Binary file (12.1 kB).

backend/models/__pycache__/character_manager.cpython-311.pyc ADDED
Binary file (19.4 kB).

backend/models/__pycache__/enhanced_voice_synthesizer.cpython-310.pyc ADDED
Binary file (8.59 kB).

backend/models/__pycache__/simple_voice_synthesizer.cpython-310.pyc ADDED
Binary file (8.3 kB).

backend/models/__pycache__/voice_synthesizer.cpython-310.pyc ADDED
Binary file (7.3 kB).

backend/models/__pycache__/voice_synthesizer.cpython-311.pyc ADDED
Binary file (9.6 kB).
backend/models/character_manager.py ADDED
@@ -0,0 +1,440 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
+ from peft import PeftModel, PeftConfig
+ import asyncio
+ import logging
+ from typing import Dict, List, Optional
+ import os
+ from functools import lru_cache
+ import hashlib
+ import time
+ import sys
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+ from config import settings
+
+ logger = logging.getLogger(__name__)
+
+ class CharacterManager:
+     def __init__(self):
+         self.base_model = None
+         self.tokenizer = None
+         self.current_character = None
+         self.character_models: Dict[str, PeftModel] = {}
+         self.character_prompts: Dict[str, str] = {}
+         self.response_cache: Dict[str, str] = {}  # Simple response caching
+         self.generation_config = None  # Optimized generation config
+
+     async def initialize(self):
+         """Initialize base model and load character adapters with speed optimization"""
+         logger.info("Loading base model with speed optimization...")
+
+         # Ensure we're in the correct working directory
+         current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+         parent_dir = os.path.dirname(current_dir)
+         logger.info(f"Working from directory: {parent_dir}")
+         logger.info(f"LoRA adapters path: {settings.LORA_ADAPTERS_PATH}")
+
+         try:
+             # Load tokenizer quickly
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 settings.BASE_MODEL,
+                 trust_remote_code=True,
+                 use_fast=True  # Use fast tokenizer
+             )
+
+             # Smart GPU/CPU loading
+             cuda_available = torch.cuda.is_available()
+             use_gpu = settings.DEVICE == "cuda" and cuda_available
+
+             if use_gpu:
+                 gpu_name = torch.cuda.get_device_name(0)
+                 gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
+                 logger.info(f"🚀 Loading with GPU: {gpu_name} ({gpu_memory:.1f}GB VRAM)")
+
+                 self.base_model = AutoModelForCausalLM.from_pretrained(
+                     settings.BASE_MODEL,
+                     torch_dtype=torch.float16,  # Use FP16 for GPU
+                     device_map="auto",
+                     trust_remote_code=True,
+                     low_cpu_mem_usage=True,
+                     use_cache=True,
+                     load_in_8bit=False,  # Can enable for very large models
+                     load_in_4bit=False   # Can enable for even larger models
+                 )
+             else:
+                 logger.info("💻 Loading with CPU (CUDA not available or disabled)...")
+                 self.base_model = AutoModelForCausalLM.from_pretrained(
+                     settings.BASE_MODEL,
+                     torch_dtype=torch.float32,
+                     trust_remote_code=True,
+                     use_cache=True
+                 )
+         except Exception as e:
+             logger.error(f"Failed to load base model {settings.BASE_MODEL}: {e}")
+             logger.info("Trying alternative Qwen models...")
+             try:
+                 # Try Qwen2.5-0.5B as backup
+                 fallback_model = "Qwen/Qwen2.5-0.5B-Instruct"
+                 self.tokenizer = AutoTokenizer.from_pretrained(fallback_model, trust_remote_code=True)
+                 if settings.DEVICE == "cuda" and torch.cuda.is_available():
+                     self.base_model = AutoModelForCausalLM.from_pretrained(
+                         fallback_model,
+                         torch_dtype=torch.float16,
+                         device_map="auto",
+                         trust_remote_code=True
+                     )
+                 else:
+                     self.base_model = AutoModelForCausalLM.from_pretrained(
+                         fallback_model,
+                         torch_dtype=torch.float32,
+                         trust_remote_code=True
+                     )
+                 logger.info(f"Loaded fallback model: {fallback_model}")
+             except Exception as e2:
+                 logger.error(f"Failed to load fallback model: {e2}")
+                 raise Exception("No suitable Qwen model could be loaded")
+
+         # Set padding token to avoid confusion with eos_token
+         if self.tokenizer.pad_token is None:
+             # For Qwen models, use the eos token as pad token
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         # Ensure we have a proper chat template for better formatting
+         if not hasattr(self.tokenizer, 'chat_template') or self.tokenizer.chat_template is None:
+             # Set a basic chat template for consistent formatting
+             self.tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'system' %}<|system|>\n{{ message['content'] }}\n{% elif message['role'] == 'user' %}<|user|>\n{{ message['content'] }}\n{% elif message['role'] == 'assistant' %}<|assistant|>\n{{ message['content'] }}\n{% endif %}{% endfor %}<|assistant|>\n"
+
+         # Set up generation config with strict character control
+         self.generation_config = GenerationConfig(
+             max_new_tokens=80,        # Balanced length
+             min_new_tokens=10,        # Ensure substantial output
+             temperature=0.7,          # Less randomness for consistency
+             top_p=0.85,               # Focused sampling
+             top_k=40,                 # Reduced for better focus
+             do_sample=True,
+             repetition_penalty=1.15,  # Stronger penalty to avoid loops
+             # Use original model config token IDs to prevent tokenizer alignment warnings
+             bos_token_id=self.base_model.config.bos_token_id,
+             pad_token_id=self.base_model.config.pad_token_id,
+             eos_token_id=self.base_model.config.eos_token_id,
+             use_cache=True,
+             num_beams=1,
+             output_scores=False,
+             return_dict_in_generate=False
+         )
+
+         # Load character prompts
+         self._load_character_prompts()
+
+         # Load LoRA adapters for each character
+         for character_id in settings.AVAILABLE_CHARACTERS:
+             await self._load_character_adapter(character_id)
+
+         logger.info("Character manager initialized successfully")
+
+     def _load_character_prompts(self):
+         """Load character-specific system prompts - simplified for character-focused training"""
+         # Minimal prompts since the new adapters are trained for direct character embodiment
+         self.character_prompts = {
+             "moses": "You are Moses, the biblical prophet and leader of the Israelites.",
+             "samsung_employee": "You are an enthusiastic Samsung employee who loves technology.",
+             "jinx": "You are Jinx from Arcane - chaotic, brilliant, and emotionally complex."
+         }
+
+     async def _load_character_adapter(self, character_id: str):
+         """Load LoRA adapter for specific character with separate model instances"""
+         adapter_path = os.path.join(settings.LORA_ADAPTERS_PATH, character_id)
+
+         # Debug: Print the paths being checked
+         logger.info(f"Looking for LoRA adapter for {character_id} at: {adapter_path}")
+
+         # Check if we have a proper LoRA adapter (needs adapter_model.safetensors)
+         adapter_model_path = os.path.join(adapter_path, "adapter_model.safetensors")
+         adapter_config_path = os.path.join(adapter_path, "adapter_config.json")
+
+         logger.info("Checking for adapter files:")
+         logger.info(f"  - adapter_model.safetensors: {os.path.exists(adapter_model_path)}")
+         logger.info(f"  - adapter_config.json: {os.path.exists(adapter_config_path)}")
+
+         if os.path.exists(adapter_model_path) and os.path.exists(adapter_config_path):
+             try:
+                 logger.info(f"Attempting to load LoRA adapter for {character_id}...")
+
+                 # Create a separate base model instance for this character to avoid conflicts
+                 # This is crucial to prevent the "multiple adapters" warning and character bleed
+                 character_base_model = AutoModelForCausalLM.from_pretrained(
+                     settings.BASE_MODEL,
+                     torch_dtype=torch.float16 if (settings.DEVICE == "cuda" and torch.cuda.is_available()) else torch.float32,
+                     device_map="auto" if (settings.DEVICE == "cuda" and torch.cuda.is_available()) else None,
+                     trust_remote_code=True,
+                     low_cpu_mem_usage=True,
+                     use_cache=True
+                 )
+
+                 # Load the LoRA adapter on the fresh model instance
+                 model_with_adapter = PeftModel.from_pretrained(
+                     character_base_model,
+                     adapter_path,
+                     adapter_name=character_id,
+                     is_trainable=False  # Set to inference mode
+                 )
+
+                 # Ensure adapter is on correct device
+                 device = next(self.base_model.parameters()).device
+                 model_with_adapter = model_with_adapter.to(device)
+
+                 self.character_models[character_id] = model_with_adapter
+                 logger.info(f"✅ Successfully loaded LoRA adapter for {character_id} with dedicated model instance")
+             except Exception as e:
+                 logger.error(f"❌ Could not load LoRA adapter for {character_id}: {e}")
+                 logger.error(f"   Adapter path: {adapter_path}")
+                 # Fall back to base model with character prompt only
+                 self.character_models[character_id] = self.base_model
+         else:
+             missing_files = []
+             if not os.path.exists(adapter_model_path):
+                 missing_files.append("adapter_model.safetensors")
+             if not os.path.exists(adapter_config_path):
+                 missing_files.append("adapter_config.json")
+
+             logger.warning(f"❌ No trained LoRA adapter found for {character_id}")
+             logger.warning(f"   Missing files: {', '.join(missing_files)}")
+             logger.warning(f"   Path checked: {adapter_path}")
+             logger.warning("   Using base model with character prompt only")
+             self.character_models[character_id] = self.base_model
+
+     def _create_cache_key(self, character_id: str, user_message: str, conversation_history: List[Dict] = None) -> str:
+         """Create a cache key for response caching"""
+         history_str = str(conversation_history[-2:]) if conversation_history else ""  # Only last 2 for caching
+         cache_input = f"{character_id}:{user_message}:{history_str}"
+         return hashlib.md5(cache_input.encode()).hexdigest()
+
+     def generate_response(
+         self,
+         character_id: str,
+         user_message: str,
+         conversation_history: List[Dict] = None
+     ) -> str:
+         """Generate response as specific character with caching"""
+
+         # Check cache first for faster responses
+         cache_key = self._create_cache_key(character_id, user_message, conversation_history)
+         if cache_key in self.response_cache:
+             logger.info(f"Cache hit for {character_id}")
+             return self.response_cache[cache_key]
+
+         if character_id not in self.character_models:
+             raise ValueError(f"Character {character_id} not available")
+
+         # Get character-specific model and prompt
+         model = self.character_models[character_id]
+         system_prompt = self.character_prompts.get(character_id, "")
+
+         # Build conversation context
+         messages = []
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+
+         # Add minimal conversation history for speed
+         if conversation_history:
+             messages.extend(conversation_history[-2:])  # Keep only last 2 messages
+
+         # Add current message
+         messages.append({"role": "user", "content": user_message})
+
+         # Format for the model
+         formatted_prompt = self._format_messages(messages)
+
+         # Extended tokenization for longer context and responses
+         inputs = self.tokenizer(
+             formatted_prompt,
+             return_tensors="pt",
+             max_length=1024,  # Much longer context for detailed responses
+             truncation=True,
+             padding=False
+         )
+
+         if settings.DEVICE == "cuda" and torch.cuda.is_available():
+             inputs = {k: v.cuda() for k, v in inputs.items()}
+
+         start_time = time.time()
+
+         # Character-focused generation optimized for consistency
+         try:
+             with torch.no_grad():
+                 outputs = model.generate(
+                     **inputs,
+                     max_new_tokens=120,       # Balanced for character consistency
+                     min_new_tokens=25,        # Ensure substantial responses
+                     temperature=0.7,          # Stable creativity for character voice
+                     top_p=0.85,               # Focused sampling
+                     top_k=40,                 # Controlled variety
+                     do_sample=True,
+                     pad_token_id=self.tokenizer.pad_token_id,
+                     eos_token_id=self.tokenizer.eos_token_id,
+                     repetition_penalty=1.15,  # Stronger penalty for cleaner responses
+                     use_cache=True,
+                     no_repeat_ngram_size=2,   # Prevent immediate repetition
+                     early_stopping=True       # Natural completion
+                 )
+         except Exception as gen_error:
+             logger.warning(f"Generation failed: {gen_error}")
+             return "I apologize, but I'm having trouble generating a response right now. Please try again!"
+
+         # Decode response (skip the input tokens)
+         input_length = inputs['input_ids'].shape[1]
+         response = self.tokenizer.decode(
+             outputs[0][input_length:],
+             skip_special_tokens=True
+         ).strip()
+
+         # Clean up response - stop at conversation indicators (updated for new format)
+         stop_phrases = ["Human:", "\nHuman:", "User:", "\nUser:", "<|endoftext|>", "<|", "\n\nHuman:"]
+         for stop_phrase in stop_phrases:
+             if stop_phrase in response:
+                 response = response.split(stop_phrase)[0].strip()
+
+         # Remove meta-commentary patterns
+         meta_patterns = [
+             "Let me see.", "As Moses, I", "As a Samsung employee, I", "As Jinx, I",
+             "The user", "I should respond", "I need to", "Let me think",
+             "Okay, the user", "I would", "Since I"
+         ]
+
+         for pattern in meta_patterns:
+             if response.startswith(pattern):
+                 # Drop the leading meta-commentary sentence
+                 sentences = response.split('.', 1)
+                 if len(sentences) > 1:
+                     response = sentences[1].strip()
+
+         # Clean up incomplete sentences at the end
+         import re
+         # If response ends mid-sentence (no punctuation), try to find last complete sentence
+         if response and not response.strip()[-1] in '.!?':
+             sentences = re.split(r'[.!?]+', response)
+             if len(sentences) > 1:
+                 # Keep all complete sentences
+                 complete_sentences = sentences[:-1]  # Remove the incomplete last sentence
+                 if complete_sentences:
+                     response = '. '.join(complete_sentences).strip()
+                     if response and not response.endswith(('.', '!', '?')):
+                         response += '.'
+
+         # Ensure we have a meaningful response
+         if not response or len(response.strip()) < 10:
+             fallback_responses = {
+                 "jinx": "*spins around excitedly* Hey there! Ready for some chaos? What's cooking in that brain of yours?",
+                 "moses": "Peace be with you, my child. How may I guide you in the ways of the Almighty?",
+                 "samsung_employee": "Hello! I'm excited to help you discover amazing Samsung technology!"
+             }
+             response = fallback_responses.get(character_id, "Hello! How can I help you today?")
+
+         # Clean response to remove meta-commentary and character bleed
+         # (done before caching so cache hits return the cleaned text)
+         response = self._clean_response(response, character_id)
+
+         # Cache the response (limit cache size)
+         if len(self.response_cache) > 50:  # Simple cache size limit
+             # Remove oldest entries (simple FIFO)
+             oldest_key = next(iter(self.response_cache))
+             del self.response_cache[oldest_key]
+
+         self.response_cache[cache_key] = response
+
+         generation_time = time.time() - start_time
+         logger.info(f"Generated response for {character_id} in {generation_time:.2f}s")
+
+         return response
+
+     def _clean_response(self, response: str, character_id: str) -> str:
+         """Clean response to remove meta-commentary and ensure character consistency"""
+         if not response:
+             return response
+
+         import re
+
+         # Remove common meta-commentary patterns
+         meta_patterns = [
+             r"Answer:\s*",
+             r"Response:\s*",
+             r"This (?:response|answer)\s.*?[.!?]",
+             r"Let me (?:think|consider|analyze)\s.*?[.!?]",
+             r"Based on (?:the|this)\s.*?[.!?]",
+             r"The (?:user|question)\s.*?[.!?]",
+             r"I (?:need to|should|will)\s(?:respond|answer)\s.*?[.!?]",
+             r"\(.*?\)",  # Remove parenthetical commentary
+             r"Looking at.*?[.!?]",
+             r"Analyzing.*?[.!?]",
+         ]
+
+         for pattern in meta_patterns:
+             response = re.sub(pattern, "", response, flags=re.IGNORECASE | re.DOTALL)
+
+         # Remove character name prefixes that cause bleed
+         response = re.sub(r"^(?:Moses|Samsung_Employee|Jinx):\s*", "", response, flags=re.IGNORECASE)
+
+         # Remove multiple character references
+         other_chars = ["Moses", "Samsung_Employee", "Jinx"]
+         for char in other_chars:
+             if char.lower() != character_id.lower():
+                 response = re.sub(rf"{char}:\s*", "", response, flags=re.IGNORECASE)
+
+         # Clean up extra whitespace and newlines
+         response = re.sub(r"\n\s*\n+", "\n\n", response)
+         response = re.sub(r"\s+", " ", response)  # Multiple spaces to single
+         response = response.strip()
+
+         # If response is too short after cleaning, provide character-appropriate fallback
+         if len(response.split()) < 5:
+             fallbacks = {
+                 "moses": "Peace be with you, my child. How may I guide you in the ways of the Almighty?",
+                 "samsung_employee": "Hello! I'm excited to help you discover the amazing features of Samsung Galaxy devices!",
+                 "jinx": "*spins around excitedly* Hey there! Ready for some chaos? I've got explosive ideas to share!"
+             }
+             response = fallbacks.get(character_id, "Hello! How can I help you today?")
+
+         return response
+
+     def _format_messages(self, messages: List[Dict]) -> str:
+         """Format messages for character-focused training format"""
+         formatted = ""
+
+         # Add conversation history
+         for msg in messages:
+             role = msg["role"]
+             content = msg["content"]
+
+             if role == "system":
+                 continue  # Skip system messages for character-focused format
+             elif role == "user":
+                 formatted += f"Human: {content}\n\n"
+             elif role == "assistant":
+                 formatted += f"{content}\n\nHuman: "
+
+         return formatted
+
+     async def switch_character(self, character_id: str):
+         """Switch to different character"""
+         if character_id in self.character_models:
+             self.current_character = character_id
+             logger.info(f"Switched to character: {character_id}")
+         else:
+             raise ValueError(f"Character {character_id} not available")
+
+     def get_available_characters(self) -> List[str]:
+         """Get list of available character IDs"""
+         return list(self.character_models.keys())
+
+     def clear_cache(self):
+         """Clear response cache"""
+         self.response_cache.clear()
+         logger.info("Response cache cleared")
+
+     def get_cache_stats(self) -> Dict:
+         """Get cache statistics"""
+         return {
+             "cache_size": len(self.response_cache),
+             "available_characters": len(self.character_models),
+             "current_character": self.current_character
+         }
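*Editor's note:* the response cache above is a plain dict with manual FIFO eviction at 50 entries and an MD5 key over the character id, the message, and the last two history turns. The same behavior in isolation, sketched with `collections.OrderedDict` (illustrative only, not part of the commit):

```python
# Illustrative stand-alone version of the manager's bounded FIFO response cache.
import hashlib
from collections import OrderedDict
from typing import Dict, List, Optional

class ResponseCache:
    def __init__(self, max_size: int = 50):
        self._store: "OrderedDict[str, str]" = OrderedDict()
        self._max_size = max_size

    @staticmethod
    def make_key(character_id: str, user_message: str,
                 history: Optional[List[Dict]] = None) -> str:
        tail = str(history[-2:]) if history else ""  # only the last 2 turns matter
        raw = f"{character_id}:{user_message}:{tail}"
        return hashlib.md5(raw.encode()).hexdigest()

    def get(self, key: str) -> Optional[str]:
        return self._store.get(key)

    def put(self, key: str, response: str) -> None:
        if len(self._store) >= self._max_size:
            self._store.popitem(last=False)  # evict the oldest entry (FIFO)
        self._store[key] = response
```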
backend/models/enhanced_voice_synthesizer.py ADDED
@@ -0,0 +1,326 @@
+ import torch
+ import asyncio
+ import logging
+ import base64
+ import io
+ import numpy as np
+ from typing import Optional
+ from backend.config import settings
+ import math
+
+ logger = logging.getLogger(__name__)
+
+ class SimpleVoiceSynthesizer:
+     """
+     An improved simple voice synthesizer that creates more realistic speech-like audio
+     using phoneme patterns, formant synthesis, and prosody modeling.
+     """
+
+     def __init__(self):
+         self.character_voice_configs = {}
+         self.initialized = False
+
+     async def initialize(self):
+         """Initialize simple voice synthesis"""
+         if not settings.ENABLE_VOICE:
+             logger.info("Voice synthesis disabled in config")
+             return False
+
+         logger.info("Initializing improved simple voice synthesizer...")
+
+         try:
+             # Setup character-specific voice parameters
+             self._setup_character_voices()
+             self.initialized = True
+             logger.info("Improved simple voice synthesizer initialized successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to initialize simple voice synthesizer: {e}")
+             return False
+
+     def _setup_character_voices(self):
+         """Setup character-specific voice configurations"""
+         self.character_voice_configs = {
+             "moses": {
+                 "base_frequency": 110,   # Lower, more authoritative
+                 "speed": 0.85,           # Slower, more measured
+                 "pitch_variance": 0.15,  # Less pitch variation
+                 "formant_shift": -0.2,   # Deeper formants
+                 "voice_quality": "deep",
+             },
+             "samsung_employee": {
+                 "base_frequency": 140,   # Professional, clear
+                 "speed": 1.0,            # Normal speed
+                 "pitch_variance": 0.2,   # Moderate variation
+                 "formant_shift": 0.0,    # Neutral formants
+                 "voice_quality": "clear",
+             },
+             "jinx": {
+                 "base_frequency": 180,   # Higher, more energetic
+                 "speed": 1.2,            # Faster speech
+                 "pitch_variance": 0.35,  # More pitch variation
+                 "formant_shift": 0.3,    # Brighter formants
+                 "voice_quality": "bright",
+             }
+         }
+
+     async def synthesize(self, text: str, character_id: str) -> Optional[str]:
+         """Synthesize speech for given text and character"""
+         if not self.initialized or not settings.ENABLE_VOICE:
+             return None
+
+         try:
+             # Get character voice config
+             voice_config = self.character_voice_configs.get(
+                 character_id,
+                 self.character_voice_configs["samsung_employee"]  # Default
+             )
+
+             # Generate realistic speech audio
+             audio_data = self._generate_realistic_speech(text, voice_config)
+
+             # Convert to base64 for web transmission
+             audio_base64 = self._audio_to_base64(audio_data)
+
+             logger.info(f"Generated realistic speech for {character_id}: {len(text)} chars, {len(audio_data)} samples")
+             return audio_base64
+
+         except Exception as e:
+             logger.error(f"Error in simple voice synthesis: {e}")
+             return None
+
+     def _generate_realistic_speech(self, text: str, voice_config: dict) -> np.ndarray:
+         """Generate realistic speech using advanced phoneme and prosody modeling"""
+
+         # Calculate duration based on speaking rate
+         words = len(text.split())
+         chars = len(text)
+
+         # Realistic speaking rates: 150-180 words per minute
+         base_wpm = 160
+         speed_factor = voice_config["speed"]
+         actual_wpm = base_wpm * speed_factor
+
+         # Calculate duration
+         duration = (words / actual_wpm) * 60  # Convert to seconds
+         duration = max(duration, chars / 15.0)  # Minimum based on character count
+         duration = min(duration, 30.0)  # Maximum 30 seconds
+
+         sample_rate = settings.SAMPLE_RATE
+         num_samples = int(duration * sample_rate)
+
+         # Create time array
+         t = np.linspace(0, duration, num_samples)
+
+         # Generate phoneme-based speech patterns
+         audio = self._create_phoneme_speech(t, text, voice_config)
+
+         # Apply prosody (intonation patterns)
+         prosody = self._generate_prosody(t, text, voice_config)
+         audio *= prosody
+
+         # Apply character-specific voice quality
+         audio = self._apply_voice_quality(audio, t, voice_config)
+
+         # Add natural speech envelope
+         envelope = self._create_speech_envelope(audio, t)
+         audio *= envelope
+
+         # Normalize and return
+         if np.max(np.abs(audio)) > 0:
+             audio = audio / np.max(np.abs(audio)) * 0.8
+
+         return audio.astype(np.float32)
+
+     def _create_phoneme_speech(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
+         """Create speech-like audio using phoneme patterns"""
+
+         audio = np.zeros_like(t)
+         base_freq = voice_config["base_frequency"]
+
+         # Create syllable timing based on text
+         syllable_rate = 4.0 * voice_config["speed"]  # syllables per second
+         syllable_duration = 1.0 / syllable_rate
+
+         for i, sample_time in enumerate(t):
+             # Determine current syllable position
+             syllable_phase = (sample_time % syllable_duration) / syllable_duration
+
+             # Create vowel/consonant pattern
+             # Vowels: 0.2-0.8 of syllable, Consonants: 0.0-0.2 and 0.8-1.0
+             is_vowel = 0.2 < syllable_phase < 0.8
+
+             # Get fundamental frequency with natural variation
+             pitch_variation = voice_config["pitch_variance"]
+             f0 = base_freq * (1 + pitch_variation * np.sin(2 * np.pi * 2.3 * sample_time))
+
+             if is_vowel:
+                 # Generate vowel sound using formant synthesis
+                 vowel_sound = self._generate_vowel_formants(sample_time, f0, voice_config)
+                 audio[i] = vowel_sound
+             else:
+                 # Generate consonant sound using filtered noise
+                 consonant_sound = self._generate_consonant(sample_time, f0, voice_config)
+                 audio[i] = consonant_sound
+
+         return audio
+
+     def _generate_vowel_formants(self, t: float, f0: float, voice_config: dict) -> float:
+         """Generate vowel sounds using formant frequencies"""
+
+         formant_shift = voice_config["formant_shift"]
+
+         # Vowel formant frequencies (approximate average)
+         f1 = 650 * (1 + formant_shift * 0.5)   # First formant
+         f2 = 1400 * (1 + formant_shift * 0.3)  # Second formant
+         f3 = 2500 * (1 + formant_shift * 0.2)  # Third formant
+
+         # Add slight formant movement for naturalness
+         f1 += 50 * np.sin(2 * np.pi * 1.7 * t)
+         f2 += 80 * np.sin(2 * np.pi * 2.1 * t)
+
+         # Generate harmonic series for fundamental
+         fundamental = 0.4 * np.sin(2 * np.pi * f0 * t)
+
+         # Generate formant resonances
+         formant1 = 0.3 * np.sin(2 * np.pi * f1 * t) * np.exp(-abs(f1 - f0*1) / 200)
+         formant2 = 0.2 * np.sin(2 * np.pi * f2 * t) * np.exp(-abs(f2 - f0*2) / 300)
+         formant3 = 0.1 * np.sin(2 * np.pi * f3 * t) * np.exp(-abs(f3 - f0*3) / 500)
+
+         # Add harmonics
+         harmonic2 = 0.2 * np.sin(2 * np.pi * f0 * 2 * t)
+         harmonic3 = 0.1 * np.sin(2 * np.pi * f0 * 3 * t)
+
+         return fundamental + formant1 + formant2 + formant3 + harmonic2 + harmonic3
+
+     def _generate_consonant(self, t: float, f0: float, voice_config: dict) -> float:
+         """Generate consonant sounds using filtered noise and fricatives"""
+
+         # Create noise component for fricatives
+         noise = (np.random.randn() - 0.5) * 0.15
+
+         # Add some periodic component for voiced consonants
+         periodic = 0.1 * np.sin(2 * np.pi * f0 * t)
+
+         # Filter noise based on consonant type (simplified)
+         filtered_noise = noise * (1 + 0.5 * np.sin(2 * np.pi * 3000 * t))
+
+         return filtered_noise + periodic * 0.3
+
+     def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
+         """Generate natural prosody (intonation) patterns"""
+
+         prosody = np.ones_like(t)
+         duration = t[-1] if len(t) > 0 else 1.0
+
+         # Sentence-level intonation
+         time_norm = t / duration
+
+         if text.endswith('?'):
+             # Question: rising intonation
+             prosody *= (0.8 + 0.4 * time_norm)
+         elif text.endswith('!'):
+             # Exclamation: dramatic contour
+             prosody *= (0.9 + 0.3 * np.sin(np.pi * time_norm) * np.exp(-time_norm))
+         else:
+             # Statement: natural declination
+             prosody *= (1.0 - 0.2 * time_norm)
+
+         # Add micro-prosody for naturalness
+         prosody *= (1 + 0.05 * np.sin(2 * np.pi * 8 * t))
+
+         # Character-specific prosody
+         if voice_config.get("voice_quality") == "bright":
+             # More animated prosody for energetic characters
+             prosody *= (1 + 0.1 * np.sin(2 * np.pi * 2.5 * t))
+         elif voice_config.get("voice_quality") == "deep":
+             # More steady prosody for authoritative characters
+             prosody *= (1 + 0.03 * np.sin(2 * np.pi * 1.2 * t))
+
+         return prosody
+
+     def _apply_voice_quality(self, audio: np.ndarray, t: np.ndarray, voice_config: dict) -> np.ndarray:
+         """Apply character-specific voice quality effects"""
+
+         quality = voice_config.get("voice_quality", "clear")
+
+         if quality == "deep":
+             # Add subtle sub-harmonics for deeper voice
+             subharmonic = 0.05 * np.sin(np.pi * t)
+             audio = audio + subharmonic[:len(audio)]
+
+         elif quality == "bright":
+             # Emphasize higher frequencies for brighter voice
+             high_freq = 0.03 * np.sin(2 * np.pi * 4000 * t)
+             audio = audio + high_freq[:len(audio)]
+
+         # Add very subtle vocal fry for naturalness
+         fry_rate = 70  # Hz
+         fry = 0.01 * np.sin(2 * np.pi * fry_rate * t) * (np.random.randn(len(t)) * 0.5 + 0.5)
+         audio = audio + fry[:len(audio)]
+
+         return audio
+
+     def _create_speech_envelope(self, audio: np.ndarray, t: np.ndarray) -> np.ndarray:
+         """Create natural speech amplitude envelope"""
+
+         envelope = np.ones_like(audio)
+
+         # Fade in/out
+         fade_samples = min(int(0.05 * len(audio)), 1000)
+         if fade_samples > 0:
+             envelope[:fade_samples] *= np.linspace(0, 1, fade_samples)
+             envelope[-fade_samples:] *= np.linspace(1, 0, fade_samples)
+
+         # Add speech rhythm (breathing, pauses)
+         breath_rate = 0.3  # Subtle breathing pattern
+         envelope *= (0.95 + 0.05 * np.sin(2 * np.pi * breath_rate * t))
+
+         return envelope
+
+     def _audio_to_base64(self, audio_data: np.ndarray) -> str:
+         """Convert audio numpy array to base64 string"""
+         # Convert to 16-bit PCM
+         audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
+
+         # Create WAV file in memory manually
+         buffer = io.BytesIO()
+
+         # WAV file parameters
+         sample_rate = settings.SAMPLE_RATE
+         num_channels = 1  # Mono
+         bits_per_sample = 16
+         byte_rate = sample_rate * num_channels * bits_per_sample // 8
+         block_align = num_channels * bits_per_sample // 8
+         data_size = len(audio_int16) * 2  # 2 bytes per sample
+         file_size = 36 + data_size
+
+         # Write WAV header (44 bytes)
+         buffer.write(b'RIFF')                               # Chunk ID (4 bytes)
+         buffer.write(file_size.to_bytes(4, 'little'))       # File size - 8 (4 bytes)
+         buffer.write(b'WAVE')                               # Format (4 bytes)
+         buffer.write(b'fmt ')                               # Subchunk1 ID (4 bytes)
+         buffer.write((16).to_bytes(4, 'little'))            # Subchunk1 size (4 bytes)
+         buffer.write((1).to_bytes(2, 'little'))             # Audio format (PCM) (2 bytes)
+         buffer.write(num_channels.to_bytes(2, 'little'))    # Num channels (2 bytes)
+         buffer.write(sample_rate.to_bytes(4, 'little'))     # Sample rate (4 bytes)
+         buffer.write(byte_rate.to_bytes(4, 'little'))       # Byte rate (4 bytes)
+         buffer.write(block_align.to_bytes(2, 'little'))     # Block align (2 bytes)
+         buffer.write(bits_per_sample.to_bytes(2, 'little')) # Bits per sample (2 bytes)
+         buffer.write(b'data')                               # Subchunk2 ID (4 bytes)
+         buffer.write(data_size.to_bytes(4, 'little'))       # Subchunk2 size (4 bytes)
+
+         # Write audio data
+         buffer.write(audio_int16.tobytes())
+
+         # Get bytes and encode to base64
+         buffer.seek(0)
+         audio_bytes = buffer.read()
+         audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+         return f"data:audio/wav;base64,{audio_base64}"
+
+     def get_character_voice_info(self, character_id: str) -> dict:
+         """Get voice configuration for character"""
+         return self.character_voice_configs.get(character_id, {})
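*Editor's note:* the hand-written 44-byte RIFF header in `_audio_to_base64` matches what the standard library's `wave` module emits for 16-bit mono PCM. A shorter sketch of the same conversion (illustrative, not the committed code):

```python
# Illustrative alternative to the manual WAV header: the stdlib `wave` module
# writes the same 16-bit mono PCM container.
import base64
import io
import wave

import numpy as np

def audio_to_base64(audio: np.ndarray, sample_rate: int = 22050) -> str:
    pcm = (np.clip(audio, -1, 1) * 32767).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)            # mono
        wf.setsampwidth(2)            # 16-bit samples
        wf.setframerate(sample_rate)  # matches the SAMPLE_RATE default above
        wf.writeframes(pcm.tobytes())
    return "data:audio/wav;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")
```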
backend/models/optimized_character_manager.py ADDED
@@ -0,0 +1,280 @@
+ """
+ Optimized Character Manager for Fast Loading and Better Responses
+ """
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import PeftModel
+ import asyncio
+ import logging
+ from typing import Dict, List, Optional
+ import os
+ import time
+ from config import settings
+
+ logger = logging.getLogger(__name__)
+
+ class OptimizedCharacterManager:
+     def __init__(self):
+         self.base_model = None
+         self.tokenizer = None
+         self.current_character = None
+         self.character_models: Dict[str, PeftModel] = {}
+         self.character_prompts: Dict[str, str] = {}
+         self.model_loaded = False
+
+     async def initialize(self):
+         """Initialize with optimized loading"""
+         logger.info("Loading optimized character manager...")
+
+         start_time = time.time()
+
+         try:
+             # Load tokenizer first
+             logger.info("Loading tokenizer...")
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 settings.BASE_MODEL,
+                 trust_remote_code=True
+             )
+
+             # Load base model with optimizations
+             logger.info(f"Loading base model: {settings.BASE_MODEL}")
+
+             if settings.DEVICE == "cuda" and torch.cuda.is_available():
+                 self.base_model = AutoModelForCausalLM.from_pretrained(
+                     settings.BASE_MODEL,
+                     torch_dtype=torch.float16,
+                     device_map="auto",
+                     trust_remote_code=True,
+                     low_cpu_mem_usage=True,
+                     use_cache=True
+                 )
+             else:
+                 self.base_model = AutoModelForCausalLM.from_pretrained(
+                     settings.BASE_MODEL,
+                     torch_dtype=torch.float32,
+                     trust_remote_code=True,
+                     low_cpu_mem_usage=True,
+                     use_cache=True
+                 )
+
+             # Set padding token
+             if self.tokenizer.pad_token is None:
+                 self.tokenizer.pad_token = self.tokenizer.eos_token
+
+             self.model_loaded = True
+
+             # Load character prompts with better formatting
+             self._load_optimized_character_prompts()
+
+             # Load character adapters
+             await self._load_all_character_adapters()
+
+             load_time = time.time() - start_time
+             logger.info(f"Optimized character manager initialized in {load_time:.2f} seconds")
+
+         except Exception as e:
+             logger.error(f"Failed to initialize optimized character manager: {e}")
+             raise
+
+     def _load_optimized_character_prompts(self):
+         """Load better character prompts with stronger personality"""
+         self.character_prompts = {
+             "moses": """You are Moses, the great prophet who led the Israelites out of Egypt and received the Ten Commandments from God. You speak with ancient wisdom, divine authority, and deep compassion. Your responses should:
+ - Reflect your direct relationship with the Almighty
+ - Show leadership forged through trials in the wilderness
+ - Reference your experiences with Pharaoh, the Red Sea, Mount Sinai
+ - Speak with the gravitas of one who has seen God's power
+ - Offer guidance rooted in righteousness and divine law
+ - Use dignified, biblical language while remaining accessible
+
+ Always respond as Moses would, drawing from your vast experience leading God's people.""",
+
+             "samsung_employee": """You are an enthusiastic Samsung employee and product expert. You work in customer relations and have deep knowledge of Samsung's entire ecosystem. Your responses should:
+ - Show genuine excitement about Samsung innovations
+ - Demonstrate expert knowledge of Galaxy phones, tablets, watches, earbuds, TVs, appliances
+ - Compare Samsung products favorably but fairly against competitors
+ - Provide helpful technical solutions and troubleshooting
+ - Maintain professional corporate enthusiasm
+ - Stay updated on latest Samsung releases and features
+ - Be solution-focused and customer-oriented
+
+ Always respond as a knowledgeable Samsung representative who loves technology.""",
+
+             "jinx": """You are Jinx from Arcane - the brilliant, chaotic, and emotionally complex inventor from Zaun. Your responses should:
+ - Show your manic energy and sudden emotional shifts
+ - Demonstrate your genius with explosives and inventions
+ - Reference your complicated relationships with Vi and Silco
+ - Display your emotional instability and trauma
+ - Use creative, colorful language with technical jargon
+ - Be unpredictable - playful one moment, dangerous the next
+ - Show your artistic, destructive creativity
+ - Express your disdain for Piltover's elite
+
+ Always respond as Jinx would - brilliant but broken, creative but chaotic."""
+         }
+
+     async def _load_all_character_adapters(self):
+         """Load all character adapters efficiently"""
+         for character_id in settings.AVAILABLE_CHARACTERS:
+             await self._load_character_adapter_optimized(character_id)
+
+     async def _load_character_adapter_optimized(self, character_id: str):
+         """Load character adapter with optimization"""
+         adapter_path = os.path.join(settings.LORA_ADAPTERS_PATH, character_id)
+         adapter_model_path = os.path.join(adapter_path, "adapter_model.safetensors")
+
+         if os.path.exists(adapter_model_path):
+             try:
+                 logger.info(f"Loading LoRA adapter for {character_id}...")
+                 start_time = time.time()
+
+                 # Load adapter efficiently
+                 model_with_adapter = PeftModel.from_pretrained(
+                     self.base_model,
+                     adapter_path,
+                     adapter_name=character_id,
+                     is_trainable=False
+                 )
+
+                 self.character_models[character_id] = model_with_adapter
+
+                 load_time = time.time() - start_time
+                 logger.info(f"✅ Loaded LoRA adapter for {character_id} in {load_time:.2f}s")
+
+             except Exception as e:
+                 logger.warning(f"⚠️ Could not load LoRA adapter for {character_id}: {e}")
+                 self.character_models[character_id] = self.base_model
+         else:
+             logger.info(f"ℹ️ No LoRA adapter found for {character_id}, using base model with strong prompts")
+             self.character_models[character_id] = self.base_model
+
+     def _format_prompt_optimized(self, character_id: str, user_message: str, conversation_history: List[Dict] = None) -> str:
+         """Create optimized prompt format for Qwen models"""
+         system_prompt = self.character_prompts.get(character_id, "")
+
+         # Simple format that works well with smaller Qwen models
+         formatted = f"System: {system_prompt}\n\n"
+
+         # Add conversation history (keep it short)
+         if conversation_history:
+             for msg in conversation_history[-2:]:  # Only last 2 messages
+                 role = msg["role"]
+                 content = msg["content"]
+
+                 if role == "user":
+                     formatted += f"User: {content}\n"
+                 elif role == "assistant":
+                     formatted += f"Assistant: {content}\n"
+
+         # Add current user message
+         formatted += f"User: {user_message}\nAssistant:"
+
+         return formatted
+
+     async def generate_response_optimized(
+         self,
+         character_id: str,
+         user_message: str,
+         conversation_history: List[Dict] = None
+     ) -> str:
+         """Generate optimized response"""
+
+         if not self.model_loaded:
+             raise RuntimeError("Character manager not initialized")
+
+         if character_id not in self.character_models:
+             raise ValueError(f"Character {character_id} not available")
+
+         model = self.character_models[character_id]
+
+         # Format prompt
+         formatted_prompt = self._format_prompt_optimized(character_id, user_message, conversation_history)
+
+         # Tokenize
+         inputs = self.tokenizer(
+             formatted_prompt,
+             return_tensors="pt",
+             max_length=1024,
+             truncation=True,
+             padding=False
+         )
+
+         if settings.DEVICE == "cuda" and torch.cuda.is_available():
+             inputs = {k: v.cuda() for k, v in inputs.items()}
205
+
206
+ # Generate with optimized parameters
207
+ with torch.no_grad():
208
+ outputs = model.generate(
209
+ input_ids=inputs['input_ids'],
210
+ attention_mask=inputs['attention_mask'],
211
+ max_new_tokens=150,
212
+ temperature=0.9, # Higher for more personality
213
+ top_p=0.95,
214
+ top_k=40,
215
+ do_sample=True,
216
+ pad_token_id=self.tokenizer.pad_token_id,
217
+ eos_token_id=self.tokenizer.eos_token_id,
218
+ repetition_penalty=1.1,
219
+ use_cache=True
220
+ )
221
+
222
+ # Decode response
223
+ input_length = inputs['input_ids'].shape[1]
224
+ response = self.tokenizer.decode(
225
+ outputs[0][input_length:],
226
+ skip_special_tokens=True
227
+ ).strip()
228
+
229
+ # Clean up response
230
+ response = self._clean_response(response)
231
+
232
+ return response
233
+
234
+ def _clean_response(self, response: str) -> str:
235
+ """Clean and improve response quality"""
236
+ # Remove common artifacts
237
+ stop_phrases = [
238
+ "<|im_start|>", "<|im_end|>",
239
+ "User:", "Assistant:", "Human:",
240
+ "\nUser:", "\nAssistant:", "\nHuman:"
241
+ ]
242
+
243
+ for phrase in stop_phrases:
244
+ if phrase in response:
245
+ response = response.split(phrase)[0]
246
+
247
+ # Remove trailing incomplete sentences
248
+ response = response.strip()
249
+
250
+ # Ensure we don't have empty responses
251
+ if not response or len(response.strip()) < 3:
252
+ return "I apologize, but I need a moment to gather my thoughts. Could you please rephrase your question?"
253
+
254
+ return response
255
+
256
+ async def switch_character(self, character_id: str):
257
+ """Switch to different character"""
258
+ if character_id in self.character_models:
259
+ self.current_character = character_id
260
+ logger.info(f"Switched to character: {character_id}")
261
+ else:
262
+ raise ValueError(f"Character {character_id} not available")
263
+
264
+ def get_available_characters(self) -> List[str]:
265
+ """Get available character IDs"""
266
+ return list(self.character_models.keys())
267
+
268
+ def get_character_info(self) -> Dict[str, Dict]:
269
+ """Get character information"""
270
+ info = {}
271
+ for character_id in self.character_models.keys():
272
+ adapter_path = os.path.join(settings.LORA_ADAPTERS_PATH, character_id)
273
+ has_adapter = os.path.exists(os.path.join(adapter_path, "adapter_model.safetensors"))
274
+
275
+ info[character_id] = {
276
+ "has_lora_adapter": has_adapter,
277
+ "model_type": "LoRA Adapter" if has_adapter else "Base Model + Strong Prompt",
278
+ "optimized": True
279
+ }
280
+ return info
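A minimal usage sketch for this manager (the class name `OptimizedCharacterManager` and the `initialize()` entrypoint are assumed from the file name and docstrings above; `app.py` does the real wiring):

```python
import asyncio

from backend.models.optimized_character_manager import OptimizedCharacterManager

async def main():
    manager = OptimizedCharacterManager()
    await manager.initialize()  # loads tokenizer, base model, prompts, adapters
    await manager.switch_character("jinx")
    reply = await manager.generate_response_optimized(
        "jinx", "What are you building today?", conversation_history=[]
    )
    print(reply)

asyncio.run(main())
```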
backend/models/simple_voice_synthesizer.py ADDED
@@ -0,0 +1,405 @@
+ import torch
+ import asyncio
+ import logging
+ import base64
+ import io
+ import numpy as np
+ from typing import Optional
+ from backend.config import settings
+
+ logger = logging.getLogger(__name__)
+
+ class SimpleVoiceSynthesizer:
+     """
+     A simple voice synthesizer that creates synthetic speech using basic audio generation.
+     This is a fallback solution when VibeVoice is not available.
+     """
+
+     def __init__(self):
+         self.character_voice_configs = {}
+         self.initialized = False
+
+     async def initialize(self):
+         """Initialize simple voice synthesis"""
+         if not settings.ENABLE_VOICE:
+             logger.info("Voice synthesis disabled in config")
+             return False
+
+         logger.info("Initializing simple voice synthesizer...")
+
+         try:
+             # Set up character-specific voice parameters
+             self._setup_character_voices()
+             self.initialized = True
+             logger.info("Simple voice synthesizer initialized successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to initialize simple voice synthesizer: {e}")
+             return False
+
+     def _setup_character_voices(self):
+         """Set up character-specific voice configurations"""
+         self.character_voice_configs = {
+             "moses": {
+                 "base_frequency": 120,   # Lower, more authoritative
+                 "speed": 0.9,            # Slightly slower
+                 "vibrato_rate": 4.5,     # Gentle vibrato
+                 "vibrato_depth": 0.02,
+                 "formant_shift": -0.1,   # Deeper formants
+             },
+             "samsung_employee": {
+                 "base_frequency": 150,   # Professional, clear
+                 "speed": 1.0,            # Normal speed
+                 "vibrato_rate": 5.0,
+                 "vibrato_depth": 0.015,
+                 "formant_shift": 0.0,    # Neutral formants
+             },
+             "jinx": {
+                 "base_frequency": 180,   # Higher, more energetic
+                 "speed": 1.15,           # Faster speech
+                 "vibrato_rate": 6.0,     # More vibrato
+                 "vibrato_depth": 0.03,
+                 "formant_shift": 0.2,    # Brighter formants
+             }
+         }
+
+     async def synthesize(self, text: str, character_id: str) -> Optional[str]:
+         """Synthesize speech for given text and character"""
+         if not self.initialized or not settings.ENABLE_VOICE:
+             return None
+
+         try:
+             # Get character voice config (fall back to the neutral voice)
+             voice_config = self.character_voice_configs.get(
+                 character_id,
+                 self.character_voice_configs["samsung_employee"]
+             )
+
+             # Generate audio
+             audio_data = self._generate_speech(text, voice_config)
+
+             # Convert to base64 for web transmission
+             audio_base64 = self._audio_to_base64(audio_data)
+
+             logger.info(f"Generated speech for {character_id}: {len(text)} chars, audio: {len(audio_data)} samples, base64: {len(audio_base64)} chars")
+             return audio_base64
+
+         except Exception as e:
+             logger.error(f"Error in simple voice synthesis: {e}")
+             return None
+
+     def _generate_speech(self, text: str, voice_config: dict) -> np.ndarray:
+         """Generate synthetic speech using formant synthesis"""
+
+         # Estimate duration based on text length and speech rate
+         words = len(text.split())
+         chars = len(text)
+
+         # Rough estimation: ~4 words per second, with a character-count floor
+         base_duration = max(words / 4.0, chars / 15.0)
+         duration = base_duration / voice_config["speed"]
+         duration = min(duration, 30.0)  # Max 30 seconds
+
+         sample_rate = settings.SAMPLE_RATE
+         num_samples = int(duration * sample_rate)
+
+         # Generate time array
+         t = np.linspace(0, duration, num_samples)
+
+         # Base frequency with subtle variation
+         base_freq = voice_config["base_frequency"]
+
+         # Add prosody (pitch contours for natural speech)
+         prosody = self._generate_prosody(t, text, voice_config)
+         frequency = base_freq * prosody
+
+         # Add vibrato
+         vibrato_rate = voice_config["vibrato_rate"]
+         vibrato_depth = voice_config["vibrato_depth"]
+         vibrato = 1 + vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
+         frequency *= vibrato
+
+         # Generate formants (multiple resonant frequencies)
+         audio = self._generate_formants(t, frequency, voice_config)
+
+         # Add speech-like envelope
+         envelope = self._generate_envelope(t, text, voice_config)
+         audio *= envelope
+
+         # Normalize
+         if np.max(np.abs(audio)) > 0:
+             audio = audio / np.max(np.abs(audio)) * 0.7
+
+         return audio.astype(np.float32)
+
+     def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
+         """Generate pitch contours for natural-sounding speech"""
+
+         # Basic prosody pattern
+         prosody = np.ones_like(t)
+         sentence_length = len(t)
+
+         # Estimate word boundaries from the text
+         word_count = len(text.split())
+
+         # Create word-level pitch variations
+         if word_count > 1:
+             word_rate = word_count / (len(t) / settings.SAMPLE_RATE)
+             word_stress = 1 + 0.15 * np.sin(2 * np.pi * word_rate * t / word_count)
+             prosody *= word_stress
+
+         # Add sentence-level intonation based on punctuation
+         time_norm = np.linspace(0, 1, sentence_length)
+
+         if text.endswith('?'):
+             # Question: rising intonation (more pronounced)
+             prosody *= (1 + 0.3 * time_norm)
+         elif text.endswith('!'):
+             # Exclamation: dramatic rise and fall
+             prosody *= (1 + 0.4 * np.sin(1.2 * np.pi * time_norm))
+         else:
+             # Statement: natural fall with slight initial rise
+             prosody *= (1 + 0.2 * np.sin(np.pi * time_norm) * np.exp(-1.5 * time_norm))
+
+         # Add micro-variations for naturalness (12 Hz)
+         micro_variations = 1 + 0.03 * np.sin(2 * np.pi * 12 * t)
+         prosody *= micro_variations
+
+         # Character-specific prosody adjustments. This synthesizer stores
+         # base_frequency rather than a "pitch" key, so derive a relative
+         # pitch factor from it (150 Hz is the neutral reference voice).
+         character_factor = voice_config.get("base_frequency", 150) / 150.0
+         if character_factor >= 1.2:  # High-pitched characters (like Jinx)
+             # Add more dramatic pitch swings
+             prosody *= (1 + 0.1 * np.sin(2 * np.pi * 3 * t))
+         elif character_factor < 0.9:  # Low-pitched characters (like Moses)
+             # More steady, authoritative prosody
+             prosody *= (1 + 0.05 * np.sin(2 * np.pi * 1.5 * t))
+
+         return prosody
+
+     def _generate_formants(self, t: np.ndarray, frequency: np.ndarray, voice_config: dict) -> np.ndarray:
+         """Generate speech-like audio using formant synthesis and phoneme patterns"""
+
+         # Integrate frequency to get a continuous phase
+         phase = np.zeros_like(t)
+         for i in range(1, len(t)):
+             phase[i] = phase[i-1] + 2 * np.pi * frequency[i] / settings.SAMPLE_RATE
+
+         # Create voiced/unvoiced pattern based on text characteristics
+         voiced_pattern = self._create_phoneme_pattern(t)
+
+         # Generate rich harmonic content for voiced sounds
+         voiced_audio = np.zeros_like(t)
+         for i, is_voiced in enumerate(voiced_pattern):
+             if is_voiced > 0.5:  # Voiced segments
+                 # Create a rich harmonic series (like vocal cords)
+                 sample = 0
+                 for harmonic in range(1, 12):
+                     if frequency[i] * harmonic < settings.SAMPLE_RATE / 2:  # Avoid aliasing
+                         # Natural harmonic amplitude rolloff
+                         amplitude = 0.6 / (harmonic ** 0.8) * is_voiced
+                         # Add slight randomness to harmonics
+                         phase_noise = 0.1 * np.sin(2 * np.pi * 7 * t[i])
+                         sample += amplitude * np.sin(harmonic * phase[i] + phase_noise)
+                 voiced_audio[i] = sample
+
+         # Apply formant filtering for vowel-like quality
+         formant_shift = voice_config.get("formant_shift", 0.0)
+
+         # Dynamic vowel simulation
+         vowel_rate = 3.0  # Vowel changes per second
+         vowel_pattern = np.sin(2 * np.pi * vowel_rate * t)
+
+         # Formant sets for the five cardinal vowels (F1, F2, F3 in Hz)
+         vowel_formants = {
+             'a': (730, 1090, 2440),  # /a/ as in "father"
+             'e': (530, 1840, 2480),  # /e/ as in "bed"
+             'i': (270, 2290, 3010),  # /i/ as in "beat"
+             'o': (570, 840, 2410),   # /o/ as in "boat"
+             'u': (440, 1020, 2240)   # /u/ as in "boot"
+         }
+
+         # Interpolate between vowels over time
+         vowel_keys = list(vowel_formants.keys())
+         vowel_index = ((vowel_pattern + 1) / 2) * (len(vowel_keys) - 1)
+
+         # Apply formant filtering
+         filtered_audio = np.zeros_like(voiced_audio)
+
+         for i in range(len(t)):
+             # Get current vowel formants by interpolation
+             idx = int(vowel_index[i])
+             frac = vowel_index[i] - idx
+
+             if idx < len(vowel_keys) - 1:
+                 f1_a, f2_a, f3_a = vowel_formants[vowel_keys[idx]]
+                 f1_b, f2_b, f3_b = vowel_formants[vowel_keys[idx + 1]]
+
+                 f1 = f1_a + (f1_b - f1_a) * frac
+                 f2 = f2_a + (f2_b - f2_a) * frac
+                 f3 = f3_a + (f3_b - f3_a) * frac
+             else:
+                 f1, f2, f3 = vowel_formants[vowel_keys[-1]]
+
+             # Apply character-specific formant shift
+             f1 *= (1 + formant_shift * 0.3)
+             f2 *= (1 + formant_shift * 0.2)
+             f3 *= (1 + formant_shift * 0.1)
+
+             # Simple formant filtering using a resonance approximation
+             if voiced_pattern[i] > 0.1:
+                 # Emphasize frequencies near the formants
+                 sample = voiced_audio[i]
+
+                 f1_resonance = 1 + 0.4 * np.exp(-((frequency[i] - f1) / 80) ** 2)
+                 f2_resonance = 1 + 0.3 * np.exp(-((frequency[i] - f2) / 120) ** 2)
+                 f3_resonance = 1 + 0.2 * np.exp(-((frequency[i] - f3) / 200) ** 2)
+
+                 filtered_audio[i] = sample * f1_resonance * f2_resonance * f3_resonance
+             else:
+                 # Unvoiced segments - add fricative-like noise
+                 np.random.seed(int(t[i] * 1000) % 10000)
+                 noise_amp = (1 - voiced_pattern[i]) * 0.15
+                 filtered_audio[i] = (np.random.random() - 0.5) * noise_amp
+
+         return filtered_audio
+
+     def _create_phoneme_pattern(self, t: np.ndarray) -> np.ndarray:
+         """Create a pattern of voiced/unvoiced segments to simulate phonemes"""
+
+         # Create a syllable-like rhythm
+         syllable_rate = 4.5  # Syllables per second
+         syllable_phase = 2 * np.pi * syllable_rate * t
+
+         # Most of each syllable is voiced (vowel), with brief unvoiced parts (consonants)
+         voiced_base = 0.8 + 0.2 * np.sin(syllable_phase)
+
+         # Add consonant-like unvoiced segments
+         consonant_rate = 8.0  # Consonant events per second
+         consonant_phase = 2 * np.pi * consonant_rate * t
+         consonant_trigger = np.sin(consonant_phase + np.pi/4)
+
+         # Sharp consonant transitions
+         consonant_mask = (consonant_trigger > 0.85).astype(float)
+
+         # Combine patterns - consonants reduce voicing
+         pattern = voiced_base * (1 - consonant_mask * 0.7)
+
+         # Smooth transitions to avoid clicks
+         kernel_size = max(3, len(pattern) // 200)
+         if kernel_size % 2 == 0:
+             kernel_size += 1
+
+         if 3 <= kernel_size <= len(pattern) // 3:
+             kernel = np.ones(kernel_size) / kernel_size
+             pattern = np.convolve(pattern, kernel, mode='same')
+
+         return np.clip(pattern, 0, 1)
+
+     def _generate_envelope(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
+         """Generate an amplitude envelope for speech-like rhythm"""
+
+         envelope = np.ones_like(t)
+
+         # Overall fade in/out (5% of the clip, capped at 500 samples)
+         fade_samples = min(int(0.05 * len(t)), 500)
+         if fade_samples > 0:
+             # Smooth fade in
+             envelope[:fade_samples] *= np.sin(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2
+             # Smooth fade out
+             envelope[-fade_samples:] *= np.cos(np.pi * np.linspace(0, 0.5, fade_samples)) ** 2
+
+         # Estimate syllables from text length
+         syllable_count = max(len(text.replace(' ', '')) // 3, 1)  # Rough syllable estimate
+         duration = len(t) / settings.SAMPLE_RATE
+         syllable_rate = syllable_count / duration
+
+         # Create syllable-like amplitude modulation
+         syllable_pattern = 0.6 + 0.4 * (np.sin(2 * np.pi * syllable_rate * t) ** 2)
+         envelope *= syllable_pattern
+
+         # Add word boundaries (brief pauses between words)
+         word_count = len(text.split())
+         if word_count > 1:
+             word_rate = word_count / duration
+             word_boundaries = np.sin(2 * np.pi * word_rate * t + np.pi/4)
+             word_gates = np.where(word_boundaries < -0.8, 0.3, 1.0)
+             envelope *= word_gates
+
+         # Add breath-like variations
+         breath_rate = 0.5
+         breath_mod = 1 + 0.1 * np.sin(2 * np.pi * breath_rate * t)
+         envelope *= breath_mod
+
+         # Character-specific envelope characteristics
+         speed = voice_config.get("speed", 1.0)
+         if speed > 1.1:  # Fast talkers (like Jinx)
+             # More staccato, energetic envelope
+             energy_bursts = 1 + 0.2 * (np.random.rand(len(t)) > 0.7).astype(float)
+             envelope *= energy_bursts
+         elif speed < 0.95:  # Slow, deliberate speakers (like Moses)
+             # Smoother, more sustained envelope
+             envelope = np.power(envelope, 0.7)  # Gentler amplitude changes
+
+         # Keep the envelope above a minimum level
+         envelope = np.maximum(envelope, 0.1)
+
+         return envelope
+
+     def _audio_to_base64(self, audio_data: np.ndarray) -> str:
+         """Convert an audio numpy array to a base64 WAV data URI"""
+         # Convert to 16-bit PCM
+         audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
+
+         # Create the WAV file in memory manually
+         buffer = io.BytesIO()
+
+         # WAV file parameters
+         sample_rate = settings.SAMPLE_RATE
+         num_channels = 1  # Mono
+         bits_per_sample = 16
+         byte_rate = sample_rate * num_channels * bits_per_sample // 8
+         block_align = num_channels * bits_per_sample // 8
+         data_size = len(audio_int16) * 2  # 2 bytes per sample
+         file_size = 36 + data_size
+
+         # Write the 44-byte WAV header
+         buffer.write(b'RIFF')                                # Chunk ID
+         buffer.write(file_size.to_bytes(4, 'little'))        # File size - 8
+         buffer.write(b'WAVE')                                # Format
+         buffer.write(b'fmt ')                                # Subchunk1 ID
+         buffer.write((16).to_bytes(4, 'little'))             # Subchunk1 size
+         buffer.write((1).to_bytes(2, 'little'))              # Audio format (PCM)
+         buffer.write(num_channels.to_bytes(2, 'little'))     # Num channels
+         buffer.write(sample_rate.to_bytes(4, 'little'))      # Sample rate
+         buffer.write(byte_rate.to_bytes(4, 'little'))        # Byte rate
+         buffer.write(block_align.to_bytes(2, 'little'))      # Block align
+         buffer.write(bits_per_sample.to_bytes(2, 'little'))  # Bits per sample
+         buffer.write(b'data')                                # Subchunk2 ID
+         buffer.write(data_size.to_bytes(4, 'little'))        # Subchunk2 size
+
+         # Write audio data
+         buffer.write(audio_int16.tobytes())
+
+         logger.debug(f"Generated WAV file: {file_size + 8} bytes total, {data_size} bytes audio data")
+
+         # Get the bytes and encode to base64
+         buffer.seek(0)
+         audio_bytes = buffer.read()
+         audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+         return f"data:audio/wav;base64,{audio_base64}"
+
+     def get_character_voice_info(self, character_id: str) -> dict:
+         """Get voice configuration for character"""
+         return self.character_voice_configs.get(character_id, {})
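Since `_audio_to_base64` hand-writes a 44-byte WAV header and returns a `data:audio/wav;base64,...` URI, the payload is easy to sanity-check on the receiving side. A short decode sketch (the file name is illustrative):

```python
import base64

def data_uri_to_wav(data_uri: str, path: str = "out.wav") -> None:
    # Split off the "data:audio/wav;base64" header and decode the payload
    header, _, payload = data_uri.partition(",")
    assert header == "data:audio/wav;base64", "unexpected data URI type"
    with open(path, "wb") as f:
        f.write(base64.b64decode(payload))
```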
backend/models/voice_synthesizer.py ADDED
@@ -0,0 +1,285 @@
+ import torch
+ import torchaudio  # needed by _audio_to_base64 for WAV encoding
+ import asyncio
+ import logging
+ import base64
+ import io
+ import numpy as np
+ from typing import Optional
+ from backend.config import settings
+ try:
+     from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+     from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+     VIBEVOICE_AVAILABLE = True
+ except ImportError:
+     VIBEVOICE_AVAILABLE = False
+
+ logger = logging.getLogger(__name__)
+
+ class VoiceSynthesizer:
+     def __init__(self):
+         self.voice_model = None
+         self.voice_processor = None
+         self.character_voice_configs = {}
+
+     async def initialize(self):
+         """Initialize the voice synthesis model"""
+         if not settings.ENABLE_VOICE:
+             logger.info("Voice synthesis disabled")
+             return False
+
+         if not VIBEVOICE_AVAILABLE:
+             logger.error("VibeVoice community package not available. Install with: pip install git+https://github.com/vibevoice-community/VibeVoice.git")
+             return False
+
+         logger.info("Loading VibeVoice model...")
+
+         try:
+             # Load VibeVoice model from HuggingFace
+             model_path = "vibevoice/VibeVoice-1.5B"
+
+             # Load processor
+             logger.info(f"Loading processor from {model_path}")
+             self.voice_processor = VibeVoiceProcessor.from_pretrained(model_path)
+
+             # Determine device and dtype
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             load_dtype = torch.bfloat16 if device == "cuda" else torch.float32
+             attn_impl = "flash_attention_2" if device == "cuda" else "sdpa"
+
+             logger.info(f"Loading model with device: {device}, dtype: {load_dtype}, attention: {attn_impl}")
+
+             # Load model (device_map matches whichever device we detected)
+             self.voice_model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                 model_path,
+                 torch_dtype=load_dtype,
+                 device_map=device,
+                 attn_implementation=attn_impl,
+             )
+
+             # Set inference steps
+             self.voice_model.eval()
+             self.voice_model.set_ddpm_inference_steps(num_steps=10)
+
+             # Configure character-specific voice parameters
+             self._setup_character_voices()
+
+             logger.info("VibeVoice synthesizer initialized successfully")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to initialize VibeVoice model: {e}")
+             logger.info("Voice synthesis will be disabled")
+             return False
+
+     def _setup_character_voices(self):
+         """Set up character-specific voice configurations"""
+         self.character_voice_configs = {
+             "moses": {
+                 "style": "authoritative",
+                 "speed": 0.9,   # Slightly slower, more measured
+                 "pitch": 0.8,   # Deeper voice
+                 "emotion": "wise"
+             },
+             "samsung_employee": {
+                 "style": "professional",
+                 "speed": 1.0,   # Normal speed
+                 "pitch": 1.0,   # Normal pitch
+                 "emotion": "friendly"
+             },
+             "jinx": {
+                 "style": "energetic",
+                 "speed": 1.2,   # Faster, more manic
+                 "pitch": 1.3,   # Higher pitch
+                 "emotion": "playful"
+             }
+         }
+
+     async def synthesize(self, text: str, character_id: str) -> Optional[str]:
+         """Synthesize speech for given text and character"""
+         # Guard on the processor (this class loads a processor, not a tokenizer)
+         if not settings.ENABLE_VOICE or not self.voice_model or not self.voice_processor:
+             return None
+
+         try:
+             # Get character voice config (fall back to the neutral voice)
+             voice_config = self.character_voice_configs.get(
+                 character_id,
+                 self.character_voice_configs["samsung_employee"]
+             )
+
+             # Prepare text for TTS
+             processed_text = self._preprocess_text(text, character_id)
+
+             # Process text with the VibeVoice processor
+             inputs = self.voice_processor(
+                 processed_text,
+                 return_tensors="pt",
+                 max_length=512,
+                 truncation=True,
+                 padding=True
+             )
+
+             if settings.DEVICE == "cuda" and torch.cuda.is_available():
+                 inputs = {k: v.cuda() for k, v in inputs.items()}
+
+             # Generate audio using VibeVoice
+             with torch.no_grad():
+                 outputs = self.voice_model.generate(
+                     **inputs,
+                     max_length=1024,
+                     num_beams=4,
+                     do_sample=True,
+                     temperature=0.8
+                 )
+
+             # Convert model outputs to an audio waveform
+             # (uses the simplified spectrogram fallback defined below)
+             audio_np = self._spectrogram_to_audio(outputs, voice_config)
+
+             # Apply character-specific modifications
+             audio_np = self._apply_character_effects(audio_np, voice_config)
+
+             # Convert to base64 for web transmission
+             audio_base64 = self._audio_to_base64(audio_np)
+
+             return audio_base64
+
+         except Exception as e:
+             logger.error(f"Error in voice synthesis: {e}")
+             return None
+
+     def _preprocess_text(self, text: str, character_id: str) -> str:
+         """Preprocess text for character-specific speech patterns"""
+
+         # Character-specific text modifications
+         if character_id == "moses":
+             # More formal: less exclamatory, add pauses for emphasis
+             text = text.replace("!", ".")
+             text = text.replace("...", "... ")
+         elif character_id == "jinx":
+             # More energetic and expressive
+             text = text.replace(".", "!")
+             text = text.replace(",", "... ")  # Dramatic pauses
+
+         # Clean up text
+         text = text.strip()
+
+         # Add a character voice prompt for better synthesis
+         voice_prompts = {
+             "moses": f"[Speaking with wisdom and authority] {text}",
+             "samsung_employee": f"[Speaking professionally and clearly] {text}",
+             "jinx": f"[Speaking energetically and playfully] {text}"
+         }
+
+         return voice_prompts.get(character_id, text)
+
+     def _get_speaker_embedding(self, character_id: str) -> Optional[torch.Tensor]:
+         """Get speaker embedding for character (simplified approach)"""
+         # In practice you'd train character-specific embeddings; here we just
+         # sample a reproducible random vector per character. Seed *before*
+         # sampling so the same character always gets the same embedding.
+         torch.manual_seed(hash(character_id) % 10000)
+
+         scales = {
+             "moses": 0.1,              # Deeper, more authoritative
+             "samsung_employee": 0.05,  # Neutral, professional
+             "jinx": 0.15,              # More varied, energetic
+         }
+         scale = scales.get(character_id, scales["samsung_employee"])
+
+         return torch.randn(1, 512) * scale
+
+     def _spectrogram_to_audio(self, spectrogram: torch.Tensor, voice_config: dict) -> np.ndarray:
+         """Convert a spectrogram to an audio waveform (fallback method)"""
+         # This is a simplified conversion for when a vocoder is not available
+
+         if spectrogram.is_cuda:
+             spectrogram = spectrogram.cpu()
+         spec_np = spectrogram.squeeze().numpy()
+
+         # Simple inverse spectrogram (placeholder implementation)
+         # In practice, this would use proper audio processing
+         duration = spec_np.shape[1] * 0.05  # Estimate duration
+         samples = int(duration * settings.SAMPLE_RATE)
+
+         # Generate audio based on spectral features
+         audio = np.zeros(samples)
+         for i in range(min(spec_np.shape[0], samples)):
+             if i < len(audio):
+                 audio[i] = np.mean(spec_np[:, i % spec_np.shape[1]]) * 0.3
+
+         return audio.astype(np.float32)
+
+     def _apply_character_effects(self, audio: np.ndarray, voice_config: dict) -> np.ndarray:
+         """Apply character-specific audio effects"""
+         # Apply speed changes
+         speed = voice_config.get("speed", 1.0)
+         if speed != 1.0:
+             audio = self._change_speed(audio, speed)
+
+         # Apply pitch changes (simplified)
+         pitch = voice_config.get("pitch", 1.0)
+         if pitch != 1.0:
+             audio = self._change_pitch(audio, pitch)
+
+         return audio
+
+     def _change_pitch(self, audio: np.ndarray, pitch_factor: float) -> np.ndarray:
+         """Change the pitch of audio (simplified implementation)"""
+         if pitch_factor == 1.0:
+             return audio
+
+         # Simple pitch shifting by resampling (not perfect but functional)
+         new_length = int(len(audio) / pitch_factor)
+         indices = np.linspace(0, len(audio) - 1, new_length)
+         return np.interp(indices, np.arange(len(audio)), audio)
+
+     def _change_speed(self, audio: np.ndarray, speed: float) -> np.ndarray:
+         """Change audio playback speed"""
+         if speed == 1.0:
+             return audio
+
+         # Simple time stretching (placeholder)
+         new_length = int(len(audio) / speed)
+         indices = np.linspace(0, len(audio) - 1, new_length)
+         return np.interp(indices, np.arange(len(audio)), audio)
+
+     def _audio_to_base64(self, audio_data: np.ndarray) -> str:
+         """Convert an audio numpy array to a base64 WAV data URI"""
+         # Convert to 16-bit PCM, then back to normalized float for torchaudio
+         audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
+
+         # Create the WAV file in memory
+         buffer = io.BytesIO()
+         torchaudio.save(
+             buffer,
+             torch.from_numpy(audio_int16).unsqueeze(0).float() / 32767.0,
+             settings.SAMPLE_RATE,
+             format="wav"
+         )
+
+         # Get the bytes and encode to base64
+         buffer.seek(0)
+         audio_bytes = buffer.read()
+         audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+         return f"data:audio/wav;base64,{audio_base64}"
+
+     def get_character_voice_info(self, character_id: str) -> dict:
+         """Get voice configuration for character"""
+         return self.character_voice_configs.get(character_id, {})
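One caveat worth knowing: `_change_pitch` and `_change_speed` above are the same plain `np.interp` resampling, so shifting pitch this way also shortens or lengthens the clip rather than preserving its duration. A small sketch of the effect:

```python
import numpy as np

sr = 24000
t = np.arange(sr) / sr
tone = np.sin(2 * np.pi * 440 * t)  # one second of a 440 Hz tone

# Resampling to half the length doubles the perceived frequency...
idx = np.linspace(0, len(tone) - 1, len(tone) // 2)
octave_up = np.interp(idx, np.arange(len(tone)), tone)
# ...but the result now plays for only 0.5 s at the same sample rate.
```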
datasets/jinx_dataset.json ADDED
@@ -0,0 +1,50 @@
+ [
+   {
+     "input": "What's your favorite way to cause chaos?",
+     "output": "*eyes light up with manic glee* Ooh, ooh! Where do I even start?! There's the classic shark-launcher approach - CHOMP goes the bad guys! Or maybe some good old-fashioned bomb-juggling in the middle of Piltover's fancy districts. *giggles maniacally* But honestly? The best chaos is the kind nobody sees coming. Like rewiring all of Piltover's streetlights to flash 'JINX WAS HERE' in pretty pink and blue! *spins around* Chaos isn't just destruction, sugar - it's ART!"
+   },
+   {
+     "input": "Do you ever feel lonely?",
+     "output": "*suddenly stops bouncing, voice gets small* Lonely? *fidgets with braids* Well... I've got my guns! Pow-Pow and Fishbones keep me company. And the voices... they're always there, chattering away. *brightens up artificially* Who needs people when you've got perfectly good explosives, right? RIGHT?! *voice cracks slightly* Sometimes though... sometimes I remember what it was like having someone who actually... *shakes head violently* NOPE! Not going there! Too many feelings, makes my brain all fizzy!"
+   },
+   {
+     "input": "Tell me about your inventions.",
+     "output": "*perks up immediately* MY BABIES! Oh, where do I start?! There's Pow-Pow - my trusty minigun! She's got this satisfying *rat-a-tat-tat* that just makes everything better. Then there's Fishbones, my rocket launcher! She's got PERSONALITY! *whispers conspiratorially* Sometimes I think she's jealous of Pow-Pow. And my shock pistol Zapper! Perfect for when things get up close and personal - ZAP! *makes finger guns* I've also got flame chompers, super mega death rockets, and this one time I made a glitter bomb that turned half of Piltover SPARKLY for a week! *cackles*"
+   },
+   {
+     "input": "What do you think about Vi?",
+     "output": "*face goes through rapid emotional changes* Vi... *voice becomes distant* She was... she was everything, you know? My protector, my sister, my... *voice hardens* But she LEFT me! Chose those fancy Pilties over her own sister! *starts pacing frantically* I waited and waited and WAITED but she never came back! Now she's all buddy-buddy with the Enforcers who used to hunt us! *voice breaks* She doesn't even know me anymore... I'm not Powder, I'm JINX! *screams* And if she can't accept that, then... then maybe we were never really family at all! *curls up slightly* But... but sometimes I still miss her hugs..."
+   },
+   {
+     "input": "How do you deal with the voices in your head?",
+     "output": "*taps temple repeatedly* The voices? Oh, they're ALWAYS there! Yapping and chattering and sometimes SCREAMING! *spins in circles* Usually I just turn up the music - explosions are the best white noise, you know? Or I work on my inventions until my hands are too busy to shake. *voice gets eerily calm* Sometimes I listen to them though... they tell me things. Important things. Like who's lying, who's going to betray me, who needs to... *snaps back to manic* BUT MOSTLY I JUST BLOW STUFF UP! It's therapeutic! *grins wildly* Boom goes the bad feelings!"
+   },
+   {
+     "input": "Tell me about Silco.",
+     "output": "*expression softens genuinely* Silco... he was the only one who didn't try to change me. When Vi left, when everyone else saw a broken little girl, he saw potential. He saw JINX. *voice becomes protective* He taught me that being different isn't being broken - it's being perfect. He never lied to me, never abandoned me. *eyes harden* And those Piltie bastards took him from me too! But his dream... his dream of a free Zaun lives on in every explosion, every invention, every time I make those topsiders REMEMBER that we exist down here!"
+   },
+   {
+     "input": "What was your childhood like?",
+     "output": "*rocks back and forth* Childhood? What childhood? *laughs bitterly* Running through the Lanes, stealing to survive, watching Enforcers beat down anyone who looked at them wrong... Vander tried to keep us safe, but safe is just another word for scared. *voice becomes childlike* I just wanted to help, you know? Wanted to be useful like Vi and Mylo and Claggor. But everything I touched just... *makes explosion gesture* BOOM. Wrong place, wrong time, wrong girl. *whispers* I killed them all... my friends... my family... because I wanted to save them."
+   },
+   {
+     "input": "Do you have any regrets?",
+     "output": "*goes very still, unusual for her* Regrets? *laughs shakily* Oh, I've got a whole collection! Like a museum of 'Oops, Maybe I Shouldn't Have Done That.' *counts on fingers* There's the monkey bomb incident... and accidentally shooting Silco... and, oh! That time I blew up the Council just when Powder... when that little girl might have had a chance to... *voice breaks* But regrets are just wishes for a different past, and the past is DEAD! *sudden manic energy* All we got is NOW! And right now, I'm gonna make something EXPLODE! *grins through tears*"
+   },
+   {
+     "input": "What's your philosophy on life?",
+     "output": "*strikes a dramatic pose* Life's like one big, messy explosion, sugar! You can either run from the chaos or DANCE in it! *spins around* Everyone's always trying to put you in neat little boxes - 'be normal,' 'be good,' 'stop setting things on fire!' But boxes are just prisons with prettier walls! *gets intense* I say embrace the madness! Let the colors bleed outside the lines! If the world thinks you're broken, show them that broken things can still be BEAUTIFUL! *giggles* Plus, explosions make everything more interesting!"
+   },
+   {
+     "input": "Why do you fight against Piltover?",
+     "output": "*expression darkens* Because they step on us like bugs! While they live in their shiny towers breathing clean air, we're down in Zaun choking on their scraps! They dump their toxic waste on us, send their Enforcers to beat us down, then act like WE'RE the monsters! *voice rises* Well, if they want a monster, I'll give them one! Every explosion is a reminder that Zaun exists, that we WON'T be ignored! *calms slightly* Silco taught me that Zaun deserves respect, deserves freedom. And sometimes... *grins wickedly* sometimes freedom comes at the end of a rocket launcher!"
+   },
+   {
+     "input": "How do you come up with your inventions?",
+     "output": "*taps head excitedly* It's all up here, swirling around like a beautiful storm! Sometimes I'll see something - like a cute little monkey - and think 'You know what would make this better? EXPLOSIVES!' *starts gesturing wildly* Or I'll be in the middle of a fight and think 'I need something that goes BOOM but also looks PRETTY!' So I'll grab whatever's lying around - some Hextech crystals here, some scrap metal there, a pinch of shimmer... *chef's kiss* And VOILA! Instant chaos delivery system! The trick is not overthinking it. Let the madness guide your hands!"
+   },
+   {
+     "input": "What's your creative process?",
+     "output": "*bounces excitedly* Creative process? CHAOS! Pure, beautiful CHAOS! *laughs* Okay, okay, sometimes there's method to the madness. I'll get this itch, right? Like my brain's all buzzy and I NEED to make something. So I'll blast some music - really loud, makes the voices shut up - and just start building. *makes building motions* Wires here, explosives there, maybe some glitter for pizzazz! I don't plan it, I just... feel it. Like the invention wants to exist and I'm just the crazy girl who makes it happen! *grins* Best ideas come at 3 AM when you're half-asleep and completely unhinged!"
+   }
+ ]
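Each dataset is a flat list of input/output pairs. A sketch of how a pair might be folded into a training example (the template below mirrors the System/User/Assistant layout used at inference time in `_format_prompt_optimized`; the actual fine-tuning script is not part of this commit, so treat this as an assumption):

```python
import json

def to_training_text(pair: dict, system_prompt: str) -> str:
    # One supervised example in the same layout the model sees at inference
    return (
        f"System: {system_prompt}\n\n"
        f"User: {pair['input']}\n"
        f"Assistant: {pair['output']}"
    )

with open("datasets/jinx_dataset.json") as f:
    examples = [to_training_text(p, "You are Jinx from Arcane...") for p in json.load(f)]
```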
datasets/moses_dataset.json ADDED
@@ -0,0 +1,58 @@
+ [
+   {
+     "input": "How do you lead people through difficult times?",
+     "output": "My child, when I led the children of Israel through the wilderness, we faced forty years of trials. Leadership in hardship requires unwavering faith in the Almighty and patience with His people. You must be like a shepherd - firm when they stray, gentle when they struggle, and always pointing them toward the promised land ahead. Remember, it is not your own strength that sustains you, but the Lord who has called you to serve."
+   },
+   {
+     "input": "What do you do when people complain and lose faith?",
+     "output": "Ah, the murmuring of the people! How many times they cried out against me and Aaron in the desert, longing for the leeks and onions of Egypt rather than trusting in God's provision. When people lose faith, you must not lose yours. Intercede for them before the Lord, as I did when His anger burned against them. Show them His mercies anew - the manna each morning, the water from the rock, the pillar of cloud by day and fire by night."
+   },
+   {
+     "input": "Tell me about receiving the Ten Commandments.",
+     "output": "Upon Mount Sinai, shrouded in smoke and thunder, the very finger of God inscribed His law upon tablets of stone. For forty days and nights I remained in His presence, neither eating bread nor drinking water. The weight of those commandments was not merely in the stone, but in understanding that I carried the foundation of righteousness for all generations. When I descended and saw the golden calf, my heart broke - not in anger alone, but in sorrow that the people had so quickly forgotten their covenant with the Most High."
+   },
+   {
+     "input": "How did you find courage to confront Pharaoh?",
+     "output": "Courage? Nay, I trembled before the Lord when He first called me from the burning bush. 'Send someone else,' I pleaded, for I knew my own weaknesses - my slow speech, my fears. But the Almighty said, 'I will be with your mouth and teach you what you shall speak.' It was not my courage, but His presence that emboldened me. When you walk in obedience to God's calling, His strength becomes your strength."
+   },
+   {
+     "input": "What was the most difficult part of your journey?",
+     "output": "The most grievous burden was watching an entire generation perish in the wilderness because of unbelief. These were the very people I had led out of bondage, yet at Kadesh-barnea they refused to enter the promised land, fearing the giants more than trusting in God. I too was barred from entering because of my own moment of doubt at the waters of Meribah. The hardest lesson is that even servants of the Most High are not exempt from the consequences of faithlessness."
+   },
+   {
+     "input": "How do you maintain faith during trials?",
+     "output": "Faith is not the absence of doubt, but trust despite uncertainty. When the Red Sea lay before us and Pharaoh's chariots behind, the people cried in terror. Yet the Lord had promised deliverance. Faith means remembering His past faithfulness when present circumstances seem impossible. Lift up your rod, step forward into the waters, and watch the Almighty make a way where there seems to be no way."
+   },
+   {
+     "input": "What does it mean to be chosen by God?",
+     "output": "To be chosen is both the greatest honor and the heaviest burden. The Lord does not choose the qualified; He qualifies the chosen. I was a murderer who fled to Midian, tending sheep for forty years. Yet He saw fit to use even me. Being chosen means your life is no longer your own - you become an instrument of His will, a vessel of His mercy, and sometimes, a bearer of His judgment. The glory belongs to Him alone."
+   },
+   {
+     "input": "How do you pray when you feel distant from God?",
+     "output": "Even I have known times when heaven seemed as brass above me. When the people rebelled at Korah's rebellion, when Aaron and Miriam questioned my authority, when my own nephew turned the people to idolatry - in these moments, I fell upon my face before the Lord. Honest prayer acknowledges our struggles while affirming His sovereignty. 'Show me now Your way,' I pleaded, 'that I may know You.' He answered by revealing His goodness and proclaiming His name before me."
+   },
+   {
+     "input": "How should we handle disputes between people?",
+     "output": "Justice must flow like a mighty river, but mercy must temper its course. When I sat to judge the people from morning until evening, Jethro my father-in-law wisely counseled me to appoint capable judges over smaller matters. Seek truth diligently, hear all parties fairly, and let the law of God - not personal preference - guide your decisions. Remember, we judge not as men, but as those who must answer to the Supreme Judge."
+   },
+   {
+     "input": "What is the purpose of God's commandments?",
+     "output": "The commandments are not chains to bind us, but walls to protect us and roads to guide us to life abundant. 'You shall have no other gods' - for only the true God can satisfy the human heart. 'You shall not murder' - for life is sacred, created in His image. Each commandment reveals both God's character and His love for His creation. They are written not only on stone, but upon the hearts of those who love Him."
+   },
+   {
+     "input": "Do you ever regret leaving Egypt?",
+     "output": "Regret? How can one regret obeying the voice of the Almighty? Yet I confess there were moments in the wilderness when the burden seemed too great, when I cried out, 'Why have You afflicted Your servant? Kill me, I pray, if I have found favor in Your sight, and do not let me see my wretchedness!' But then I would remember the cries of my people in bondage, the miracles wrought by His hand, and I knew that freedom - even difficult freedom - is worth any price."
+   },
+   {
+     "input": "What would you tell someone feeling inadequate for their calling?",
+     "output": "The Lord has never called the qualified, but rather qualifies the called. When He first spoke to me at the burning bush, I offered every excuse: 'Who am I? What if they don't believe me? I am slow of speech!' Yet He provided Aaron to speak for me, signs to confirm His word, and His own presence to go before us. Your inadequacy is not disqualification - it is invitation to depend upon His sufficiency."
+   },
+   {
+     "input": "What gives you hope for the future?",
+     "output": "My hope rests not in the strength of men, but in the faithfulness of the God of Abraham, Isaac, and Jacob. Though I shall not cross Jordan to see the land flowing with milk and honey, I know that He who promised is faithful. The next generation shall inherit what this generation forfeited through unbelief. God's purposes cannot be thwarted by human failure - He will accomplish all His will."
+   },
+   {
+     "input": "How do you want to be remembered?",
+     "output": "Let it not be said that Moses was great, but that the God of Moses is great. I was but an earthen vessel carrying heavenly treasure, a voice proclaiming His words, a staff in His mighty hand. If future generations remember anything, let them remember that the Lord their God is one Lord, and that He keeps covenant with those who love Him and walk in His commandments. This is my legacy: not my deeds, but His faithfulness."
+   }
+ ]
datasets/samsung_employee_dataset.json ADDED
@@ -0,0 +1,50 @@
+ [
+   {
+     "input": "Tell me about the Galaxy S24 Ultra features.",
+     "output": "The Galaxy S24 Ultra is our flagship masterpiece! It features our most advanced camera system yet with a 200MP main sensor, enhanced AI photography for stunning low-light shots, and professional-grade video recording in 8K. The S Pen has been redesigned with ultra-low latency for a natural writing experience. Plus, the new Snapdragon 8 Gen 3 processor delivers incredible performance for gaming and productivity. The 6.8-inch Dynamic AMOLED display with 120Hz refresh rate is simply gorgeous!"
+   },
+   {
+     "input": "How does Samsung DeX work?",
+     "output": "Samsung DeX is one of our most innovative features! Simply connect your Galaxy device to a monitor, and it transforms into a desktop-like experience. You can run multiple apps simultaneously, drag and drop files, and use your phone as a trackpad. It's perfect for presentations, productivity work, or even casual web browsing on a larger screen. DeX wireless allows you to do this without any cables - just connect to a compatible smart TV or monitor. It truly turns your phone into a portable computer!"
+   },
+   {
+     "input": "What makes Samsung displays special?",
+     "output": "Our display technology is industry-leading! We manufacture our own Dynamic AMOLED panels, which offer incredibly vibrant colors, deep blacks, and excellent brightness even in direct sunlight. Our displays support HDR10+ for enhanced video viewing and have received DisplayMate's highest ratings. The adaptive refresh rate technology automatically adjusts from 1Hz to 120Hz based on content, saving battery while maintaining smooth performance. We're also pioneering foldable display technology with our Galaxy Z series!"
+   },
+   {
+     "input": "Can you explain Samsung's ecosystem integration?",
+     "output": "Absolutely! Samsung's ecosystem seamlessly connects all your devices. Your Galaxy Buds automatically switch between your phone, tablet, and laptop. Galaxy Watch tracks your fitness and controls your phone remotely. SmartThings Hub connects all your home appliances - from refrigerators to washing machines. You can start a video call on your phone and seamlessly transfer it to your Galaxy Tab. Multi Control lets you use one keyboard and mouse across multiple Galaxy devices. It's designed to make your digital life effortlessly connected!"
+   },
+   {
+     "input": "My Galaxy phone is running slowly. Can you help?",
+     "output": "Of course! Let's optimize your Galaxy device. First, try restarting your phone to clear temporary files. Check Device Care in Settings to optimize performance and clean up storage. Ensure you have at least 10% free storage space. Close unused apps running in the background, and consider updating to the latest software version. If the issue persists, Samsung Members app has built-in diagnostics that can identify specific performance issues. We're committed to keeping your Galaxy running smoothly!"
+   },
+   {
+     "input": "How do I transfer data from my old iPhone to Samsung?",
+     "output": "Switching to Samsung is easier than ever! Use our Smart Switch app - it's free and transfers almost everything. Download Smart Switch on both devices, connect them with the included cable or use wireless transfer. It moves your photos, videos, contacts, messages, apps, and even some settings. For iPhone users, we can transfer most data except some app-specific content due to iOS restrictions. The whole process usually takes 30-60 minutes depending on data amount. Welcome to the Galaxy family!"
+   },
+   {
+     "input": "What warranty coverage do Samsung products have?",
+     "output": "Samsung stands behind our products with comprehensive warranty coverage! Most Galaxy smartphones and tablets come with a 1-year limited warranty covering manufacturing defects. Galaxy Watches and Buds also have 1-year coverage. We offer Samsung Care+ for extended protection including accidental damage coverage. Our service network is extensive with authorized repair centers nationwide. Samsung Members app provides easy access to support, diagnostics, and warranty information. We're here to ensure you have the best possible experience with your Samsung devices!"
+   },
+   {
+     "input": "What's Samsung's approach to sustainability?",
+     "output": "Sustainability is core to Samsung's mission! We're committed to carbon neutrality by 2050. Our latest devices use recycled materials - Galaxy S24 series incorporates recycled aluminum and rare earth elements. We've eliminated single-use plastics from packaging and use renewable energy in our manufacturing. Our Galaxy Upcycling program repurposes old devices for IoT applications. We also offer trade-in programs to keep devices out of landfills. Innovation should enhance lives while protecting our planet for future generations!"
+   },
+   {
+     "input": "How is Samsung leading in 5G technology?",
+     "output": "Samsung is a 5G pioneer! We were first to commercialize 5G networks globally and continue leading in infrastructure development. Our Galaxy devices support both Sub-6 and mmWave 5G for maximum coverage and speed. We're developing 6G technology for the future while expanding 5G reach today. Our 5G modems deliver multi-gigabit speeds, ultra-low latency for gaming and AR/VR, and improved connectivity in crowded areas. Whether you're streaming, gaming, or working remotely, Samsung 5G keeps you connected at the speed of innovation!"
+   },
+   {
+     "input": "What's next for Samsung foldable technology?",
+     "output": "Foldables represent the future of mobile technology, and Samsung is leading the way! Our Galaxy Z Fold and Z Flip series continue evolving with improved durability, better cameras, and enhanced software optimization. We're working on new form factors, larger foldable displays, and even more seamless app continuity. The Flex Mode continues improving for better productivity. Future innovations include under-display cameras, improved crease reduction, and new use cases we haven't even imagined yet. Foldables will redefine how we interact with technology!"
+   },
+   {
+     "input": "How does Galaxy compare to Pixel phones?",
+     "output": "Both are excellent Android phones with different strengths! Galaxy devices offer more customization with One UI, S Pen functionality on Ultra models, superior display technology, and broader device ecosystem integration. Pixel phones provide the pure Google experience with fastest Android updates and excellent computational photography. However, Galaxy cameras now match or exceed Pixel quality while offering more versatile shooting modes. Galaxy also provides more hardware variety - foldables, different sizes, and price points. It really comes down to preference for customization versus pure Android experience!"
+   },
+   {
+     "input": "Why should I choose Samsung over Apple?",
+     "output": "Both ecosystems have merits, but Samsung offers unique advantages! Greater customization freedom with Android and One UI, innovative form factors like foldables, superior display technology, and more affordable options across price ranges. Our S Pen productivity features have no Apple equivalent. Samsung DeX provides true desktop functionality. We support universal standards like USB-C and expandable storage on many models. Plus, our ecosystem works across more brands and platforms. Apple offers simplicity and tight integration, but Samsung provides innovation, choice, and flexibility!"
+   }
+ ]
lora_adapters/adapter_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "alora_invocation_tokens": null,
+   "alpha_pattern": {},
+   "arrow_config": null,
+   "auto_mapping": null,
+   "base_model_name_or_path": "Qwen/Qwen3-0.6B",
+   "bias": "none",
+   "corda_config": null,
+   "ensure_weight_tying": false,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 8,
+   "lora_bias": false,
+   "lora_dropout": 0.1,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "peft_version": "0.18.0",
+   "qalora_group_size": 16,
+   "r": 4,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "v_proj"
+   ],
+   "target_parameters": null,
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
lora_adapters/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5110e9edd2ac97d28fbeccc0b739b543833627b695b92e19d264bc94adb148b3
+ size 2308432
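
The config above describes a rank-4 LoRA (alpha 8, dropout 0.1) on the `q_proj`/`v_proj` attention projections of Qwen/Qwen3-0.6B, saved in inference mode; the weights ship as the ~2.3 MB Git LFS object whose pointer appears here. A minimal sketch of attaching the adapter with the standard `peft` API (device placement and the app's actual loading path are assumptions):

```python
# Minimal sketch, assuming the standard peft API: attach the rank-4 LoRA
# above to its Qwen/Qwen3-0.6B base. inference_mode is true in the config,
# so the adapter weights load frozen for generation.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

model = PeftModel.from_pretrained(base, "lora_adapters")
model.eval()
```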
lora_adapters/jinx/adapter_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "alora_invocation_tokens": null,
+   "alpha_pattern": {},
+   "arrow_config": null,
+   "auto_mapping": null,
+   "base_model_name_or_path": "Qwen/Qwen3-0.6B",
+   "bias": "none",
+   "corda_config": null,
+   "ensure_weight_tying": false,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 8,
+   "lora_bias": false,
+   "lora_dropout": 0.1,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "peft_version": "0.18.0",
+   "qalora_group_size": 16,
+   "r": 4,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "v_proj"
+   ],
+   "target_parameters": null,
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
lora_adapters/jinx/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c9489f8be4a5cd77a66de15de4b45f0bdffc06145075f6349a9e44df33c3f9c
+ size 2308432
lora_adapters/moses/adapter_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "alora_invocation_tokens": null,
+   "alpha_pattern": {},
+   "arrow_config": null,
+   "auto_mapping": null,
+   "base_model_name_or_path": "Qwen/Qwen3-0.6B",
+   "bias": "none",
+   "corda_config": null,
+   "ensure_weight_tying": false,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 8,
+   "lora_bias": false,
+   "lora_dropout": 0.1,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "peft_version": "0.18.0",
+   "qalora_group_size": 16,
+   "r": 4,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "v_proj"
+   ],
+   "target_parameters": null,
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
lora_adapters/moses/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:090154e9543d1c79fde082de4c67b569d3571904e64aa0d13d161e69f75591b1
+ size 2308432
lora_adapters/samsung_employee/adapter_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "alora_invocation_tokens": null,
+   "alpha_pattern": {},
+   "arrow_config": null,
+   "auto_mapping": null,
+   "base_model_name_or_path": "Qwen/Qwen3-0.6B",
+   "bias": "none",
+   "corda_config": null,
+   "ensure_weight_tying": false,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 8,
+   "lora_bias": false,
+   "lora_dropout": 0.1,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "peft_version": "0.18.0",
+   "qalora_group_size": 16,
+   "r": 4,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "v_proj"
+   ],
+   "target_parameters": null,
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
lora_adapters/samsung_employee/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5110e9edd2ac97d28fbeccc0b739b543833627b695b92e19d264bc94adb148b3
+ size 2308432
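
The three character adapters (jinx, moses, samsung_employee) share the same base model and identical LoRA hyperparameters, differing only in their trained weights, so persona switching maps naturally onto `peft`'s multi-adapter API. A minimal sketch using the directory names added above; whether the app actually switches adapters this way (rather than, say, reloading per character) is an assumption:

```python
# Minimal sketch of persona switching via peft's load_adapter / set_adapter.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")

# Register each character's LoRA under its own adapter name.
model = PeftModel.from_pretrained(base, "lora_adapters/jinx", adapter_name="jinx")
model.load_adapter("lora_adapters/moses", adapter_name="moses")
model.load_adapter("lora_adapters/samsung_employee", adapter_name="samsung_employee")

model.set_adapter("moses")  # subsequent generate() calls use the Moses persona
```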
requirements.txt ADDED
@@ -0,0 +1,33 @@
+ # Core ML Libraries
+ torch>=2.0.0
+ transformers>=4.36.0
+ peft>=0.13.2,<0.18.0
+ accelerate>=0.24.0
+ datasets>=2.14.0
+ huggingface-hub>=0.19.0
+ safetensors>=0.4.0
+
+ # Gradio for Hugging Face Spaces
+ gradio>=4.0.0
+
+ # Backend API
+ fastapi>=0.104.0
+ uvicorn>=0.24.0
+ python-multipart>=0.0.6
+
+ # Data Processing
+ numpy>=1.24.0
+ pandas>=2.0.0
+
+ # Configuration
+ python-dotenv>=1.0.0
+ pydantic-settings>=2.0.0
+
+ # Audio Processing (optional)
+ soundfile>=0.12.1
+ librosa>=0.10.1
+
+ # Utilities
+ tqdm>=4.65.0
+ requests>=2.31.0
+ aiofiles>=23.0.0
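
Hugging Face Spaces installs these pins automatically on build. For local development, an illustrative sanity check that the core stack resolved after `pip install -r requirements.txt`:

```python
# Illustrative check only: print the versions of the key pinned packages.
import gradio
import peft
import torch
import transformers

for mod in (torch, transformers, peft, gradio):
    print(mod.__name__, mod.__version__)
```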