OrbitMC committed on
Commit
748b20d
Β·
verified Β·
1 Parent(s): ecbb6cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -115
app.py CHANGED
@@ -1,38 +1,20 @@
1
  import os
2
- import io
3
  import re
4
  import uuid
5
  import base64
6
  import datetime
7
  import traceback
8
- import numpy as np
9
- import soundfile as sf
10
  from flask import Flask, request, jsonify
11
  from num2words import num2words
12
- import torch
13
- from transformers import AutoTokenizer, AutoModelForCausalLM
14
 
15
  # ══════════════════════════════════════════
16
  # CONFIG
17
  # ══════════════════════════════════════════
18
 
19
- # TTS Options (switch via env var TTS_MODE):
20
- # nano-fp32 β†’ 15M params, 56MB (fastest, default)
21
- # nano-int8 β†’ 15M params, 25MB (smallest)
22
- # micro β†’ 40M params, 41MB (balanced)
23
- # mini β†’ 80M params, 80MB (best quality)
24
- TTS_MODE = os.environ.get("TTS_MODE", "nano-fp32")
25
-
26
- TTS_MODEL_MAP = {
27
- "nano-fp32": "KittenML/kitten-tts-nano-0.8-fp32",
28
- "nano-int8": "KittenML/kitten-tts-nano-0.8-int8",
29
- "micro": "KittenML/kitten-tts-micro-0.8",
30
- "mini": "KittenML/kitten-tts-mini-0.8",
31
- }
32
-
33
- # Voice: Bella, Jasper, Luna, Bruno, Rosie, Hugo, Kiki, Leo
34
- TTS_VOICE = os.environ.get("TTS_VOICE", "Kiki")
35
- TTS_SPEED = float(os.environ.get("TTS_SPEED", "1.15"))
36
  MAX_MEMORY = 20
37
  MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "256"))
38
 
@@ -60,51 +42,40 @@ def clean_text_for_tts(text):
60
  return text
61
 
62
  # ══════════════════════════════════════════
63
- # LOAD GEMMA 3 270M-IT
64
  # ══════════════════════════════════════════
65
  print("=" * 55)
66
  print(" J.A.R.V.I.S. β€” Booting Systems")
67
  print("=" * 55)
68
 
69
- print("[1/2] Loading Gemma 3 270M-IT...")
70
- GEMMA_ID = "LiquidAI/LFM2.5-1.2B-Instruct"
71
  try:
72
- tokenizer = AutoTokenizer.from_pretrained(GEMMA_ID)
73
- model = AutoModelForCausalLM.from_pretrained(
74
- GEMMA_ID,
75
- torch_dtype=torch.float32,
76
- device_map="cpu",
77
- )
78
- model.eval()
79
- print(" βœ… Gemma 3 270M-IT loaded!")
80
  except Exception as e:
81
- print(f" ❌ Gemma 3 FAILED: {e}")
82
- traceback.print_exc()
83
- raise SystemExit("Cannot start without Gemma. Check HF_TOKEN and license agreement.")
 
 
 
 
 
84
 
85
- # ══════════════════════════════════════════
86
- # LOAD KITTENTTS
87
- # ══════════════════════════════════════════
88
- tts = None
89
- tts_model_name = TTS_MODEL_MAP.get(TTS_MODE, TTS_MODEL_MAP["nano-fp32"])
90
- print(f"[2/2] Loading KittenTTS: {TTS_MODE} β†’ {tts_model_name}...")
91
  try:
92
- from kittentts import KittenTTS
93
- tts = KittenTTS(tts_model_name)
94
- test_audio = tts.generate("online", voice=TTS_VOICE, speed=TTS_SPEED)
95
- if test_audio is not None and len(test_audio) > 0:
96
- print(f" βœ… KittenTTS ready. Model: {TTS_MODE} | Voice: {TTS_VOICE}")
97
- else:
98
- print(" ⚠️ KittenTTS test returned empty audio!")
99
- tts = None
100
- except Exception as e:
101
- print(f" ⚠️ KittenTTS FAILED: {e}")
102
- tts = None
103
 
104
  print("=" * 55)
105
- print(f" LLM : Gemma 3 270M-IT")
106
- print(f" TTS : {TTS_MODE} ({'READY' if tts else 'DISABLED'})")
107
- print(f" Voice: {TTS_VOICE} | Speed: {TTS_SPEED}")
108
  print(f" Max tokens: {MAX_NEW_TOKENS}")
109
  print("=" * 55)
110
 
@@ -129,14 +100,14 @@ def add_to_memory(sid, role, content):
129
  sessions[sid] = mem[-(MAX_MEMORY * 2):]
130
 
131
  # ══════════════════════════════════════════
132
- # GEMMA RESPONSE GENERATION
133
  # ══════════════════════════════════════════
134
  def generate_response(user_input, session_id):
135
  memory = get_memory(session_id)
136
 
137
- # Build chat messages: system instruction β†’ memory β†’ new message
138
- messages = [
139
- {"role": "user", "content": f"[System Instruction]\n{SYSTEM_PROMPT}"},
140
  {"role": "assistant", "content": "I am waiting for you!"},
141
  ]
142
 
@@ -149,31 +120,18 @@ def generate_response(user_input, session_id):
149
  # Current user message
150
  messages.append({"role": "user", "content": user_input})
151
 
152
- # Tokenize with Gemma chat template
153
- input_ids = tokenizer.apply_chat_template(
154
  messages,
155
- return_tensors="pt",
156
- add_generation_prompt=True,
 
 
 
157
  )
158
 
159
- # Generate
160
- with torch.no_grad():
161
- outputs = model.generate(
162
- input_ids,
163
- max_new_tokens=MAX_NEW_TOKENS,
164
- do_sample=True,
165
- temperature=0.9,
166
- top_k=45,
167
- top_p=0.97,
168
- )
169
-
170
- # Decode only new tokens
171
- new_tokens = outputs[0][input_ids.shape[-1]:]
172
- response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
173
-
174
- # Clean artifacts
175
- response = response.split("<end_of_turn>")[0].strip()
176
- response = response.split("<start_of_turn>")[0].strip()
177
 
178
  if not response or len(response) < 2:
179
  response = "I appear to have momentarily lost my train of thought. Could you rephrase that?"
@@ -183,10 +141,19 @@ def generate_response(user_input, session_id):
183
  return response
184
 
185
  # ══════════════════════════════════════════
186
- # TTS SYNTHESIS
187
  # ══════════════════════════════════════════
 
 
 
 
 
 
 
 
 
188
  def synthesize_speech(text, voice=None):
189
- if tts is None:
190
  return None
191
  try:
192
  voice = voice or TTS_VOICE
@@ -195,13 +162,13 @@ def synthesize_speech(text, voice=None):
195
  return None
196
  if len(clean) > 400:
197
  clean = clean[:400]
198
- audio = tts.generate(clean, voice=voice, speed=TTS_SPEED)
199
- if audio is None or len(audio) == 0:
 
 
200
  return None
201
- buf = io.BytesIO()
202
- sf.write(buf, audio, 24000, format='WAV', subtype='PCM_16')
203
- buf.seek(0)
204
- return base64.b64encode(buf.read()).decode('utf-8')
205
  except Exception as e:
206
  print(f"TTS Error: {e}")
207
  return None
@@ -389,31 +356,23 @@ body{
389
  <div class="cfgbar" id="cfgPanel">
390
  <div class="cgrp">
391
  <label>LLM:</label>
392
- <span class="ctag">Gemma 3 270M-IT</span>
393
  </div>
394
  <div class="cgrp">
395
  <label>TTS:</label>
396
- <span class="ctag" id="ttsTag">loading...</span>
397
  </div>
398
  <div class="cgrp">
399
  <label>Voice:</label>
400
  <select id="voiceSel">
401
- <option value="Kiki">Kiki</option>
402
- <option value="Bella">Bella</option>
403
- <option value="Jasper">Jasper</option>
404
- <option value="Luna">Luna</option>
405
- <option value="Bruno">Bruno</option>
406
- <option value="Rosie">Rosie</option>
407
- <option value="Hugo">Hugo</option>
408
- <option value="Leo">Leo</option>
409
  </select>
410
  </div>
411
  <div class="cgrp">
412
- <label>TTS env options:</label>
413
- <span class="ctag">nano-fp32</span>
414
- <span class="ctag">nano-int8</span>
415
- <span class="ctag">micro</span>
416
- <span class="ctag">mini</span>
417
  </div>
418
  </div>
419
 
@@ -439,7 +398,7 @@ body{
439
 
440
  <script>
441
  let sid=crypto.randomUUID?crypto.randomUUID():Date.now().toString(36)+Math.random().toString(36).slice(2);
442
- let ttsOn=true,busy=false,mc=0,voice='Kiki';
443
  const C=document.getElementById('chatBox'),I=document.getElementById('msgIn'),B=document.getElementById('sendBtn');
444
 
445
  I.addEventListener('keydown',e=>{if(e.key==='Enter'&&!e.shiftKey){e.preventDefault();send()}});
@@ -532,7 +491,8 @@ function playB64(b){
532
  try{
533
  const bin=atob(b),u8=new Uint8Array(bin.length);
534
  for(let i=0;i<bin.length;i++)u8[i]=bin.charCodeAt(i);
535
- const url=URL.createObjectURL(new Blob([u8],{type:'audio/wav'}));
 
536
  const a=new Audio(url);
537
  a.play().catch(e=>console.log('Autoplay blocked:',e));
538
  a.onended=()=>URL.revokeObjectURL(url);
@@ -552,9 +512,9 @@ function sc(){C.scrollTop=C.scrollHeight}
552
 
553
  fetch('/health').then(r=>r.json()).then(d=>{
554
  document.getElementById('ttsTag').textContent=d.tts_mode+(d.tts_model==='DISABLED'?' (OFF)':'');
555
- document.getElementById('modInfo').textContent='Gemma 3 Β· '+d.tts_mode+' Β· '+d.tts_voice+' Β· CPU';
556
  const wi=document.getElementById('wInfo');
557
- if(wi)wi.textContent='LLM: Gemma 3 270M-IT | TTS: '+d.tts_mode+' | Voice: '+d.tts_voice;
558
  if(d.tts_model==='DISABLED')document.getElementById('sDot').classList.add('err');
559
  if(d.tts_voice){document.getElementById('voiceSel').value=d.tts_voice;voice=d.tts_voice}
560
  }).catch(()=>{});
@@ -592,7 +552,7 @@ def chat():
592
  return jsonify({
593
  "response": response,
594
  "session_id": session_id,
595
- "tts_available": tts is not None,
596
  "memory_length": len(get_memory(session_id)),
597
  })
598
 
@@ -604,7 +564,7 @@ def tts_endpoint():
604
 
605
  if not text:
606
  return jsonify({"error": "Empty text"}), 400
607
- if tts is None:
608
  return jsonify({"error": "TTS not available", "audio": None}), 200
609
 
610
  audio_b64 = synthesize_speech(text, voice=voice)
@@ -622,11 +582,11 @@ def clear():
622
  def health():
623
  return jsonify({
624
  "status": "online",
625
- "llm": "Gemma 3 270M-IT",
626
- "tts_mode": TTS_MODE,
627
- "tts_model": tts_model_name if tts else "DISABLED",
628
  "tts_voice": TTS_VOICE,
629
- "tts_voices": ["Bella","Jasper","Luna","Bruno","Rosie","Hugo","Kiki","Leo"],
630
  "max_new_tokens": MAX_NEW_TOKENS,
631
  })
632
 
 
1
  import os
 
2
  import re
3
  import uuid
4
  import base64
5
  import datetime
6
  import traceback
7
+ import asyncio
 
8
  from flask import Flask, request, jsonify
9
  from num2words import num2words
10
+ from transformers import pipeline
 
11
 
12
  # ══════════════════════════════════════════
13
  # CONFIG
14
  # ══════════════════════════════════════════
15
 
16
+ # edge-tts Options
17
+ TTS_VOICE = os.environ.get("TTS_VOICE", "zh-CN-XiaoyiNeural")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  MAX_MEMORY = 20
19
  MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "256"))
20
 
 
42
  return text
43
 
44
  # ══════════════════════════════════════════
45
+ # LOAD UNSLOTH GGUF & EDGE-TTS
46
  # ══════════════════════════════════════════
47
  print("=" * 55)
48
  print(" J.A.R.V.I.S. β€” Booting Systems")
49
  print("=" * 55)
50
 
51
print("[1/2] Loading LFM2.5 1.2B Instruct GGUF via pipeline...")
LLM_ID = "unsloth/LFM2.5-1.2B-Instruct-GGUF"
try:
    # Prefer the Q4_K_M quant to keep CPU RAM low. NOTE: transformers'
    # `gguf_file` must be an EXACT filename inside the repo — a glob like
    # "*Q4_K_M.gguf" is never matched and always raises, which silently
    # forced the heavier auto-load fallback on every boot.
    # TODO(review): confirm this filename against the repo's file listing.
    GGUF_FILE = "LFM2.5-1.2B-Instruct-Q4_K_M.gguf"
    pipe = pipeline(
        "text-generation",
        model=LLM_ID,
        device_map="cpu",
        model_kwargs={"gguf_file": GGUF_FILE},
    )
    print(f" ✅ {LLM_ID} loaded with {GGUF_FILE}!")
except Exception as e:
    # Fall back to the repo's default weights if the quant file is absent.
    print(f" ⚠️ Pipeline load failed with specific gguf_file, trying default auto-load... ({e})")
    try:
        pipe = pipeline("text-generation", model=LLM_ID, device_map="cpu")
        print(f" ✅ {LLM_ID} loaded with default!")
    except Exception as e2:
        # No LLM means the app cannot serve /chat at all — abort startup.
        print(f" ❌ Model FAILED completely: {e2}")
        traceback.print_exc()
        raise SystemExit("Cannot start without LLM. Check HF_TOKEN and GGUF compatibility.")
66
 
67
+ print("[2/2] Loading edge-tts...")
 
 
 
 
 
68
  try:
69
+ import edge_tts
70
+ print(f" βœ… edge-tts ready. Default Voice: {TTS_VOICE}")
71
+ except ImportError as e:
72
+ print(f" ❌ edge-tts FAILED: {e}")
73
+ edge_tts = None
 
 
 
 
 
 
74
 
75
  print("=" * 55)
76
+ print(f" LLM : {LLM_ID}")
77
+ print(f" TTS : edge-tts ({'READY' if edge_tts else 'DISABLED'})")
78
+ print(f" Voice: {TTS_VOICE} | Rate: +7% | Pitch: +20Hz")
79
  print(f" Max tokens: {MAX_NEW_TOKENS}")
80
  print("=" * 55)
81
 
 
100
  sessions[sid] = mem[-(MAX_MEMORY * 2):]
101
 
102
  # ══════════════════════════════════════════
103
+ # RESPONSE GENERATION
104
  # ══════════════════════════════════════════
105
  def generate_response(user_input, session_id):
106
  memory = get_memory(session_id)
107
 
108
+ # Build chat messages
109
+ messages =[
110
+ {"role": "system", "content": SYSTEM_PROMPT},
111
  {"role": "assistant", "content": "I am waiting for you!"},
112
  ]
113
 
 
120
  # Current user message
121
  messages.append({"role": "user", "content": user_input})
122
 
123
+ # Generate via pipeline
124
+ outputs = pipe(
125
  messages,
126
+ max_new_tokens=MAX_NEW_TOKENS,
127
+ do_sample=True,
128
+ temperature=0.9,
129
+ top_k=45,
130
+ top_p=0.97,
131
  )
132
 
133
+ # Extract the assistant's newly generated text
134
+ response = outputs[0]["generated_text"][-1]["content"].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  if not response or len(response) < 2:
137
  response = "I appear to have momentarily lost my train of thought. Could you rephrase that?"
 
141
  return response
142
 
143
  # ══════════════════════════════════════════
144
+ # TTS SYNTHESIS (EDGE-TTS)
145
  # ══════════════════════════════════════════
146
async def _synthesize_edge(text, voice):
    """Stream speech for *text* from edge-tts and return the raw audio bytes.

    Uses fixed prosody settings (rate +7%, pitch +20Hz). Returns the
    concatenated audio chunks (MP3 stream data) as a single bytes object.
    """
    stream = edge_tts.Communicate(text, voice, rate="+7%", pitch="+20Hz")
    pieces = []
    async for chunk in stream.stream():
        # The stream interleaves audio and metadata events; keep audio only.
        if chunk["type"] == "audio":
            pieces.append(chunk["data"])
    return b"".join(pieces)
154
+
155
  def synthesize_speech(text, voice=None):
156
+ if edge_tts is None:
157
  return None
158
  try:
159
  voice = voice or TTS_VOICE
 
162
  return None
163
  if len(clean) > 400:
164
  clean = clean[:400]
165
+
166
+ audio_bytes = asyncio.run(_synthesize_edge(clean, voice))
167
+
168
+ if not audio_bytes or len(audio_bytes) == 0:
169
  return None
170
+
171
+ return base64.b64encode(audio_bytes).decode('utf-8')
 
 
172
  except Exception as e:
173
  print(f"TTS Error: {e}")
174
  return None
 
356
  <div class="cfgbar" id="cfgPanel">
357
  <div class="cgrp">
358
  <label>LLM:</label>
359
+ <span class="ctag">LFM2.5-1.2B-Instruct-GGUF</span>
360
  </div>
361
  <div class="cgrp">
362
  <label>TTS:</label>
363
+ <span class="ctag" id="ttsTag">edge-tts</span>
364
  </div>
365
  <div class="cgrp">
366
  <label>Voice:</label>
367
  <select id="voiceSel">
368
+ <option value="zh-CN-XiaoyiNeural">Xiaoyi (zh-CN) Female</option>
369
+ <option value="en-US-AriaNeural">Aria (en-US) Female</option>
 
 
 
 
 
 
370
  </select>
371
  </div>
372
  <div class="cgrp">
373
+ <label>Settings:</label>
374
+ <span class="ctag">Rate: +7%</span>
375
+ <span class="ctag">Pitch: +20Hz</span>
 
 
376
  </div>
377
  </div>
378
 
 
398
 
399
  <script>
400
  let sid=crypto.randomUUID?crypto.randomUUID():Date.now().toString(36)+Math.random().toString(36).slice(2);
401
+ let ttsOn=true,busy=false,mc=0,voice='zh-CN-XiaoyiNeural';
402
  const C=document.getElementById('chatBox'),I=document.getElementById('msgIn'),B=document.getElementById('sendBtn');
403
 
404
  I.addEventListener('keydown',e=>{if(e.key==='Enter'&&!e.shiftKey){e.preventDefault();send()}});
 
491
  try{
492
  const bin=atob(b),u8=new Uint8Array(bin.length);
493
  for(let i=0;i<bin.length;i++)u8[i]=bin.charCodeAt(i);
494
+ // Using audio/mpeg as edge-tts outputs MP3 chunks
495
+ const url=URL.createObjectURL(new Blob([u8],{type:'audio/mpeg'}));
496
  const a=new Audio(url);
497
  a.play().catch(e=>console.log('Autoplay blocked:',e));
498
  a.onended=()=>URL.revokeObjectURL(url);
 
512
 
513
  fetch('/health').then(r=>r.json()).then(d=>{
514
  document.getElementById('ttsTag').textContent=d.tts_mode+(d.tts_model==='DISABLED'?' (OFF)':'');
515
+ document.getElementById('modInfo').textContent=d.llm+' Β· '+d.tts_mode+' Β· '+d.tts_voice+' Β· CPU';
516
  const wi=document.getElementById('wInfo');
517
+ if(wi)wi.textContent='LLM: '+d.llm+' | TTS: '+d.tts_mode+' | Voice: '+d.tts_voice;
518
  if(d.tts_model==='DISABLED')document.getElementById('sDot').classList.add('err');
519
  if(d.tts_voice){document.getElementById('voiceSel').value=d.tts_voice;voice=d.tts_voice}
520
  }).catch(()=>{});
 
552
  return jsonify({
553
  "response": response,
554
  "session_id": session_id,
555
+ "tts_available": edge_tts is not None,
556
  "memory_length": len(get_memory(session_id)),
557
  })
558
 
 
564
 
565
  if not text:
566
  return jsonify({"error": "Empty text"}), 400
567
+ if edge_tts is None:
568
  return jsonify({"error": "TTS not available", "audio": None}), 200
569
 
570
  audio_b64 = synthesize_speech(text, voice=voice)
 
582
  def health():
583
  return jsonify({
584
  "status": "online",
585
+ "llm": "unsloth/LFM2.5-1.2B-Instruct-GGUF",
586
+ "tts_mode": "edge-tts",
587
+ "tts_model": "edge-tts" if edge_tts else "DISABLED",
588
  "tts_voice": TTS_VOICE,
589
+ "tts_voices":["zh-CN-XiaoyiNeural", "en-US-AriaNeural"],
590
  "max_new_tokens": MAX_NEW_TOKENS,
591
  })
592