Vedika66 committed on
Commit
592a394
·
verified ·
1 Parent(s): 4319036

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -16
app.py CHANGED
@@ -3,11 +3,14 @@ import requests
3
  import json
4
  import re
5
  import tempfile
 
6
  import numpy as np
7
  from datetime import datetime, timedelta, timezone
8
  from bs4 import BeautifulSoup
9
  from flask import Flask, request, Response, stream_with_context, render_template_string, send_file
10
  from supertonic import TTS
 
 
11
 
12
  app = Flask(__name__)
13
 
@@ -36,6 +39,9 @@ LANGUAGES = {
36
  "Turkish": "tr", "Ukrainian": "uk", "Vietnamese": "vi"
37
  }
38
 
 
 
 
39
  # ----------------------------------------------------
40
  # ๐Ÿ“ GPS REVERSE GEOCODING
41
  # ----------------------------------------------------
@@ -182,8 +188,8 @@ def chat():
182
  thinking_instruction = f"""
183
  [CRITICAL INSTRUCTION: THINKING MODE ENABLED]
184
  Effort Level: {thinking_effort.upper()} - {effort_text}
185
- You MUST format your reasoning exactly inside <think> and </think> HTML tags.
186
- Do NOT use special system tokens like <|channel|>thought or <|im_start|>. Use standard <think> tags.
187
  """
188
 
189
  location_instruction = ""
@@ -288,11 +294,11 @@ STRICT DIRECTIVES:
288
  if "content" in delta and delta["content"]:
289
  content = delta["content"]
290
 
291
- # Translate Qwen/Other models' internal thinking tokens to our standard HTML <think> tags in real-time!
292
- content = content.replace("<|channel|>thought <|channel|>", "<think>\n")
293
- content = content.replace("<|channel|>answer <|channel|>", "\n</think>\n")
294
- content = content.replace("<|im_start|>thought", "<think>\n")
295
- content = content.replace("<|im_end|>", "\n</think>\n")
296
 
297
  delta["content"] = content
298
 
@@ -331,22 +337,39 @@ def generate_tts():
331
 
332
  try:
333
  lang_code = LANGUAGES.get(language_name, "en")
334
- style = tts.get_voice_style(voice_name=voice)
 
 
 
 
335
 
336
  # Synthesize audio
337
  wav, duration = tts.synthesize(clean_text, voice_style=style, lang=lang_code)
338
 
339
- # Save to temporary file temporarily
340
- temp_path = tempfile.mktemp(suffix=".wav")
341
- tts.save_audio(wav, temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
- # Read file into memory and delete the temp file immediately
344
- with open(temp_path, "rb") as f:
345
- audio_data = f.read()
346
- os.remove(temp_path)
 
347
 
348
  # Return the actual audio file
349
- return Response(audio_data, mimetype="audio/wav")
350
 
351
  except Exception as e:
352
  return Response(json.dumps({"error": str(e)}), status=500, mimetype='application/json')
 
3
  import json
4
  import re
5
  import tempfile
6
+ import io
7
  import numpy as np
8
  from datetime import datetime, timedelta, timezone
9
  from bs4 import BeautifulSoup
10
  from flask import Flask, request, Response, stream_with_context, render_template_string, send_file
11
  from supertonic import TTS
12
+ from pydub import AudioSegment
13
+ from scipy.io import wavfile
14
 
15
  app = Flask(__name__)
16
 
 
39
  "Turkish": "tr", "Ukrainian": "uk", "Vietnamese": "vi"
40
  }
41
 
42
+ # Voice Styles Cache for faster processing
43
+ VOICE_STYLES_CACHE = {}
44
+
45
  # ----------------------------------------------------
46
  # ๐Ÿ“ GPS REVERSE GEOCODING
47
  # ----------------------------------------------------
 
188
  thinking_instruction = f"""
189
  [CRITICAL INSTRUCTION: THINKING MODE ENABLED]
190
  Effort Level: {thinking_effort.upper()} - {effort_text}
191
+ You MUST format your reasoning exactly inside and HTML tags.
192
+ Do NOT use special system tokens like <|channel|>thought or <|im_start|>. Use standard tags.
193
  """
194
 
195
  location_instruction = ""
 
294
  if "content" in delta and delta["content"]:
295
  content = delta["content"]
296
 
297
+ # Translate Qwen/Other models' internal thinking tokens to our standard HTML tags in real-time!
298
+ content = content.replace("<|channel|>thought <|channel|>", "\n")
299
+ content = content.replace("<|channel|>answer <|channel|>", "\n\n")
300
+ content = content.replace("thought", "\n")
301
+ content = content.replace("", "\n\n")
302
 
303
  delta["content"] = content
304
 
 
337
 
338
  try:
339
  lang_code = LANGUAGES.get(language_name, "en")
340
+
341
+ # 🚀 OPTIMIZATION: Voice Style Caching
342
+ if voice not in VOICE_STYLES_CACHE:
343
+ VOICE_STYLES_CACHE[voice] = tts.get_voice_style(voice_name=voice)
344
+ style = VOICE_STYLES_CACHE[voice]
345
 
346
  # Synthesize audio
347
  wav, duration = tts.synthesize(clean_text, voice_style=style, lang=lang_code)
348
 
349
+ # 🚀 OPTIMIZATION: In-Memory Processing (No Disk I/O)
350
+ buffer = io.BytesIO()
351
+ sample_rate = 22050
352
+
353
+ if wav.dtype != np.int16:
354
+ max_val = np.max(np.abs(wav))
355
+ if max_val > 0:
356
+ wav_int16 = np.int16(wav / max_val * 32767)
357
+ else:
358
+ wav_int16 = wav.astype(np.int16)
359
+ wavfile.write(buffer, sample_rate, wav_int16)
360
+ else:
361
+ wavfile.write(buffer, sample_rate, wav)
362
+
363
+ buffer.seek(0)
364
 
365
+ # 🚀 OPTIMIZATION: WAV to MP3 Conversion
366
+ audio_segment = AudioSegment.from_wav(buffer)
367
+ mp3_buffer = io.BytesIO()
368
+ audio_segment.export(mp3_buffer, format="mp3", bitrate="128k", parameters=["-ar", "22050"])
369
+ mp3_buffer.seek(0)
370
 
371
  # Return the actual audio file
372
+ return Response(mp3_buffer.read(), mimetype="audio/mpeg")
373
 
374
  except Exception as e:
375
  return Response(json.dumps({"error": str(e)}), status=500, mimetype='application/json')