FDB01 committed on
Commit
4ad310c
·
verified ·
1 Parent(s): 7594c07

Delete app-PY

Browse files
Files changed (1) hide show
  1. app-PY +0 -616
app-PY DELETED
@@ -1,616 +0,0 @@
1
- from flask import Flask, request, jsonify, Response, send_file
2
- import torch
3
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
- import os
5
- import logging
6
- import io
7
- import numpy as np
8
- import scipy.io.wavfile as wavfile
9
- import soundfile as sf
10
- from pydub import AudioSegment
11
- import time
12
- from functools import lru_cache
13
- import gc
14
- import psutil
15
- import threading
16
- import time
17
- from queue import Queue
18
- import uuid
19
- import subprocess
20
- import tempfile
21
- import atexit
22
-
23
# Logging: timestamped, level-tagged records at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Deployment detection: the presence of SPACE_ID is treated as "running on
# Hugging Face Spaces"; HF_TOKEN is optional and may be None.
IS_HF_SPACE = 'SPACE_ID' in os.environ
HF_TOKEN = os.environ.get('HF_TOKEN')

# Pick the torch device and CPU thread budget for the environment.
if not IS_HF_SPACE:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.set_num_threads(4)
else:
    device = "cpu"
    torch.set_num_threads(2)
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    logger.info("Running on Hugging Face Spaces - CPU optimized mode")

logger.info(f"Using device: {device}")

# Flask application: generated audio goes to TEMP_AUDIO_DIR, uploads are
# capped at 16 MiB.
app = Flask(__name__)
app.config.update(
    TEMP_AUDIO_DIR='/tmp/audio_responses',
    MAX_CONTENT_LENGTH=16 * 1024 * 1024,
)

# Model handles, populated by initialize_models().
stt_pipeline = None
llm_model = None
llm_tokenizer = None
tts_pipeline = None
tts_type = None

# Registry of generated audio files (file_id -> metadata dict), guarded by
# file_cleanup_lock and pruned by the background cleanup thread.
active_files = {}
file_cleanup_lock = threading.Lock()
cleanup_thread = None
53
-
54
def cleanup_old_files():
    """Background loop that evicts generated audio files older than 300 seconds.

    Runs forever. Each pass takes ``file_cleanup_lock``, collects every entry
    of ``active_files`` whose ``created_time`` is more than 300 seconds in the
    past, removes the file from disk when it still exists and drops the entry.
    Errors are logged and never leave the loop; the loop then sleeps 60
    seconds before the next pass.
    """
    while True:
        try:
            with file_cleanup_lock:
                now = time.time()
                expired = [
                    fid
                    for fid, info in list(active_files.items())
                    if now - info['created_time'] > 300
                ]

                for file_id in expired:
                    try:
                        path = active_files[file_id]['filepath']
                        if os.path.exists(path):
                            os.remove(path)
                        # Only forget the entry once the file is gone, so a
                        # failed removal is retried on the next pass.
                        del active_files[file_id]
                        logger.info(f"Cleaned up file: {file_id}")
                    except Exception as e:
                        logger.warning(f"Cleanup error for {file_id}: {e}")
        except Exception as e:
            logger.error(f"Cleanup thread error: {e}")

        time.sleep(60)
77
-
78
def start_cleanup_thread():
    """Start the daemon cleanup worker unless one is already running."""
    global cleanup_thread

    if cleanup_thread is not None and cleanup_thread.is_alive():
        return

    cleanup_thread = threading.Thread(target=cleanup_old_files, daemon=True)
    cleanup_thread.start()
    logger.info("Cleanup thread started")
84
-
85
def cleanup_all_files():
    """Delete every tracked audio file and the temp directory (best effort).

    Registered with ``atexit`` below, so it runs at interpreter shutdown.
    Failures are logged and never raised, because there is nothing useful a
    caller could do about them at that point.
    """
    import shutil

    try:
        with file_cleanup_lock:
            for file_id, file_info in active_files.items():
                try:
                    if os.path.exists(file_info['filepath']):
                        os.remove(file_info['filepath'])
                except Exception as e:
                    # Best effort: keep going, but leave a trace instead of
                    # silently swallowing the failure (and do not catch
                    # KeyboardInterrupt/SystemExit like a bare except would).
                    logger.debug(f"Could not remove file {file_id}: {e}")
            active_files.clear()

        if os.path.exists(app.config['TEMP_AUDIO_DIR']):
            shutil.rmtree(app.config['TEMP_AUDIO_DIR'], ignore_errors=True)

        logger.info("All temporary files cleaned up")
    except Exception as e:
        logger.warning(f"Final cleanup error: {e}")


atexit.register(cleanup_all_files)
105
-
106
def get_memory_usage():
    """Report memory figures for this process and the host.

    Returns:
        dict with ``rss_mb`` and ``vms_mb`` (this process, in MiB),
        ``available_mb`` (system-wide available memory, in MiB) and
        ``percent`` (system memory usage as reported by psutil). All values
        are 0 when the figures cannot be read.
    """
    try:
        proc_mem = psutil.Process(os.getpid()).memory_info()
        rss_mb = proc_mem.rss / 1024 / 1024
        vms_mb = proc_mem.vms / 1024 / 1024
        available_mb = psutil.virtual_memory().available / 1024 / 1024
        used_percent = psutil.virtual_memory().percent
    except Exception as e:
        logger.warning(f"Memory info error: {e}")
        return {"rss_mb": 0, "vms_mb": 0, "available_mb": 0, "percent": 0}

    return {
        "rss_mb": rss_mb,
        "vms_mb": vms_mb,
        "available_mb": available_mb,
        "percent": used_percent,
    }
119
-
120
def initialize_models():
    """Load the STT, LLM and TTS back ends into the module-level globals.

    Each back end is loaded only when its global is still ``None``, so the
    function can be called again after a partial failure. Once everything is
    in place the background file-cleanup thread is started.

    Raises:
        Exception: whatever the underlying loader raised; it is logged
            together with the current memory usage and then re-raised.
    """
    global stt_pipeline, llm_model, llm_tokenizer, tts_pipeline, tts_type

    # Half precision is only used on GPU.
    model_dtype = torch.float16 if device == "cuda" else torch.float32

    try:
        logger.info(f"Initial memory usage: {get_memory_usage()}")

        if stt_pipeline is None:
            logger.info("Loading Whisper-tiny STT model...")
            try:
                stt_pipeline = pipeline(
                    "automatic-speech-recognition",
                    model="openai/whisper-tiny",
                    device=device,
                    torch_dtype=model_dtype,
                    token=HF_TOKEN,
                    return_timestamps=False
                )
                logger.info("✅ STT model loaded successfully")
            except Exception as e:
                logger.error(f"STT loading failed: {e}")
                raise

            gc.collect()
            logger.info(f"STT loaded. Memory: {get_memory_usage()}")

        if llm_model is None:
            model_name = "google/flan-t5-base"
            # The log line names the model that is actually loaded.
            logger.info(f"Loading {model_name} LLM...")
            try:
                # NOTE(review): trust_remote_code=True allows repository code
                # to run locally; it looks unnecessary for this model -
                # confirm before removing.
                tokenizer = AutoTokenizer.from_pretrained(
                    model_name,
                    token=HF_TOKEN,
                    trust_remote_code=True
                )

                model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_name,
                    torch_dtype=model_dtype,
                    token=HF_TOKEN,
                    trust_remote_code=True
                ).to(device)

                if tokenizer.pad_token is None:
                    tokenizer.pad_token = tokenizer.eos_token

                # Publish both handles together so a failed model load never
                # leaves a tokenizer without its model.
                llm_tokenizer = tokenizer
                llm_model = model

                logger.info("✅ LLM model loaded successfully")
            except Exception as e:
                logger.error(f"LLM loading failed: {e}")
                raise

            gc.collect()
            logger.info(f"LLM loaded. Memory: {get_memory_usage()}")

        if tts_pipeline is None:
            logger.info("Loading TTS model...")

            try:
                from gtts import gTTS  # noqa: F401 - availability probe only
            except ImportError:
                logger.warning("gTTS not available")
                tts_pipeline = "silent"
                tts_type = "silent"
                logger.warning("Using silent fallback for TTS")
            else:
                tts_pipeline = "gtts"
                tts_type = "gtts"
                logger.info("✅ Using gTTS (Google Text-to-Speech)")

            gc.collect()
            logger.info(f"TTS loaded. Memory: {get_memory_usage()}")

        logger.info("🎉 All models loaded successfully!")
        start_cleanup_thread()

    except Exception as e:
        logger.error(f"❌ Model loading error: {e}")
        logger.error(f"Memory usage at error: {get_memory_usage()}")
        raise
202
-
203
@lru_cache(maxsize=32)
def cached_generate_response(text_hash, text):
    """Memoised front end for ``generate_llm_response``.

    Both arguments are part of the cache key; only ``text`` is forwarded to
    the generator. At most 32 results are retained.
    """
    reply = generate_llm_response(text)
    return reply
206
-
207
def generate_llm_response(text):
    """Generate a short reply to ``text`` with the loaded language model.

    The prompt is cut to 200 characters, tokenised and sampled for at most
    50 new tokens.

    Args:
        text: user utterance (str).

    Returns:
        The decoded reply, or a canned sentence when the prompt is blank, the
        reply is shorter than three characters, or generation fails.
    """
    try:
        text = text[:200]

        if not text.strip():
            return "I'm listening. How can I help you?"

        inputs = llm_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        is_seq2seq = getattr(
            getattr(llm_model, "config", None), "is_encoder_decoder", False
        )

        pad_token_id = llm_tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = llm_tokenizer.eos_token_id

        with torch.no_grad():
            # The attention mask is passed for every architecture.
            # early_stopping is omitted: it only applies to beam search and
            # this call samples with a single beam.
            outputs_ids = llm_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                no_repeat_ngram_size=2,
                pad_token_id=pad_token_id,
                use_cache=True
            )

        generated = outputs_ids[0]
        if not is_seq2seq:
            # Decoder-only models return prompt + continuation; keep only the
            # continuation so the reply does not echo the user's words.
            generated = generated[input_ids.shape[-1]:]

        response = llm_tokenizer.decode(generated, skip_special_tokens=True)

        # Drop tensor references early to keep peak memory down.
        del inputs, input_ids, attention_mask, outputs_ids, generated
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

        response = response.strip()
        if len(response) < 3:
            return "I understand. What else would you like to know?"

        return response

    except Exception as e:
        logger.error(f"LLM generation error: {e}", exc_info=True)
        return "I'm having trouble processing that. Could you try again?"
270
-
271
-
272
def _find_wav_payload(audio_bytes):
    """Return the body of the RIFF ``data`` chunk, or None if there is none."""
    total = len(audio_bytes)
    pos = 12  # skip "RIFF", the 4-byte RIFF size and the 4-byte form type
    while pos + 8 <= total:
        chunk_id = audio_bytes[pos:pos + 4]
        chunk_size = int.from_bytes(audio_bytes[pos + 4:pos + 8], 'little')
        body_start = pos + 8
        if chunk_id == b'data':
            # Streaming writers often leave the size as 0 or a placeholder;
            # in that case take everything up to the end of the buffer.
            if 0 < chunk_size <= total - body_start:
                return audio_bytes[body_start:body_start + chunk_size]
            return audio_bytes[body_start:]
        # Chunks are word aligned: an odd-sized body is followed by a pad byte.
        pos = body_start + chunk_size + (chunk_size & 1)
    return None


def preprocess_audio_optimized(audio_bytes):
    """Convert an uploaded audio buffer to float32 samples in [-1, 1).

    The payload is interpreted as 16-bit PCM and treated as 16 kHz audio
    (the sample rate is not read from the header). For RIFF input the
    ``data`` chunk is located by walking the chunk list, falling back to
    skipping a 44-byte header when no ``data`` chunk is found.

    Args:
        audio_bytes: raw request body (bytes).

    Returns:
        ``(16000, samples)`` with ``samples`` a float32 numpy array capped at
        30 seconds, or ``(None, None)`` when the clip is shorter than 0.5 s.

    Raises:
        Exception: any decoding error, after it has been logged.
    """
    try:
        logger.info(f"Processing audio: {len(audio_bytes)} bytes")

        if len(audio_bytes) > 44 and audio_bytes[:4] == b'RIFF':
            payload = _find_wav_payload(audio_bytes)
            if payload is None:
                payload = audio_bytes[44:]  # skip the canonical WAV header
            audio_bytes = payload
            logger.info("WAV header removed")

        # int16 decoding needs an even byte count; drop a stray trailing byte
        # instead of letting np.frombuffer raise.
        if len(audio_bytes) % 2:
            audio_bytes = audio_bytes[:-1]

        audio_data = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

        max_samples = 30 * 16000
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]
            logger.info("Audio trimmed to 30 seconds")

        min_samples = int(0.5 * 16000)
        if len(audio_data) < min_samples:
            logger.warning(f"Audio too short: {len(audio_data)/16000:.2f} seconds")
            return None, None

        logger.info(f"Audio processed: {len(audio_data)/16000:.2f} seconds")
        return 16000, audio_data

    except Exception as e:
        logger.error(f"Audio preprocessing error: {e}")
        raise
298
-
299
def _silent_wav(duration_s=1.0, sample_rate=16000):
    """Return ``duration_s`` seconds of 16-bit mono silence as WAV bytes."""
    import wave

    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as writer:
        writer.setnchannels(1)
        writer.setsampwidth(2)
        writer.setframerate(sample_rate)
        writer.writeframes(b'\x00\x00' * int(duration_s * sample_rate))
    return buffer.getvalue()


def _synthesize_with_gtts(text):
    """Render ``text`` with gTTS and return it as 16 kHz mono WAV bytes."""
    from gtts import gTTS

    tmp_file = tempfile.NamedTemporaryFile(suffix='.mp3', delete=False)
    # Only the path is needed; gTTS opens and writes the file itself.
    tmp_file.close()
    try:
        gTTS(text=text, lang='en', slow=False).save(tmp_file.name)

        audio_segment = AudioSegment.from_file(tmp_file.name, format="mp3")
        audio_segment = audio_segment.set_frame_rate(16000).set_channels(1)  # mono 16 kHz
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        wav_data = wav_buffer.getvalue()
        logger.info(f"TTS generated: {len(wav_data)} bytes")
        return wav_data
    finally:
        # Remove the temp file on success and on failure alike.
        if os.path.exists(tmp_file.name):
            os.unlink(tmp_file.name)


def generate_tts_audio(text):
    """Turn ``text`` into WAV audio bytes.

    Newlines are flattened, the text is capped at 200 characters (plus an
    ellipsis) and blank input is replaced by "I understand.".

    Args:
        text: sentence to speak (str).

    Returns:
        WAV bytes: synthesized speech when ``tts_type`` is ``"gtts"``,
        otherwise one second of silence. ``b''`` when synthesis fails, which
        the caller treats as an error.
    """
    try:
        text = text.replace('\n', ' ').strip()

        if len(text) > 200:
            text = text[:200] + "..."

        if not text:
            text = "I understand."

        logger.info(f"TTS generating: '{text[:50]}...'")

        if tts_type == "gtts":
            return _synthesize_with_gtts(text)

        # No speech engine is available: answer with real silence rather
        # than trying to read a temp file that was never created.
        logger.warning("Using silent fallback")
        return _silent_wav()

    except Exception as e:
        logger.error(f"TTS error: {e}")
        return b''
356
-
357
@app.route('/process_audio', methods=['POST'])
def process_audio():
    """Run the speech -> text -> reply -> speech pipeline for one request.

    The raw request body is the audio clip. On success the reply audio is
    written to ``TEMP_AUDIO_DIR`` as ``<file_id>.wav``, registered in
    ``active_files`` and described in the JSON response (``file_id``,
    ``stream_url``, ``message``, ``transcribed``, ``processing_time``).

    Returns:
        JSON response; 503 while models are loading, 400 for a missing, tiny
        or too-short clip, 500 when TTS or any other stage fails.
    """
    start_time = time.time()

    if not all([stt_pipeline, llm_model, llm_tokenizer, tts_pipeline]):
        logger.error("Models not ready")
        return jsonify({"error": "Models are still loading, please wait..."}), 503

    if not request.data:
        return jsonify({"error": "No audio data received"}), 400

    if len(request.data) < 1000:
        return jsonify({"error": "Audio data too small"}), 400

    initial_memory = get_memory_usage()
    logger.info(f"🎯 Processing started. Memory: {initial_memory['rss_mb']:.1f}MB")

    try:
        # --- Speech to text -------------------------------------------------
        logger.info("🎤 Converting speech to text...")
        stt_start = time.time()

        rate, audio_data = preprocess_audio_optimized(request.data)

        if audio_data is None:
            return jsonify({"error": "Invalid or too short audio"}), 400

        stt_result = stt_pipeline(
            {"sampling_rate": rate, "raw": audio_data},
            generate_kwargs={"language": "en"}
        )
        transcribed_text = stt_result.get('text', '').strip()

        del audio_data
        gc.collect()

        stt_time = time.time() - stt_start
        logger.info(f"✅ STT completed: '{transcribed_text}' ({stt_time:.2f}s)")

        if not transcribed_text or len(transcribed_text) < 2:
            transcribed_text = "Could you repeat that please?"

        # --- Reply generation -----------------------------------------------
        logger.info("🤖 Generating AI response...")
        llm_start = time.time()

        text_hash = hash(transcribed_text.lower())
        assistant_response = cached_generate_response(text_hash, transcribed_text)

        llm_time = time.time() - llm_start
        logger.info(f"✅ LLM completed: '{assistant_response}' ({llm_time:.2f}s)")

        # --- Text to speech -------------------------------------------------
        logger.info("🔊 Converting to speech...")
        tts_start = time.time()

        audio_response = generate_tts_audio(assistant_response)

        if not audio_response:
            return jsonify({"error": "TTS generation failed"}), 500

        tts_time = time.time() - tts_start
        total_time = time.time() - start_time

        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

        final_memory = get_memory_usage()
        logger.info(f"✅ Processing complete! Total: {total_time:.2f}s (STT:{stt_time:.1f}s, LLM:{llm_time:.1f}s, TTS:{tts_time:.1f}s)")
        logger.info(f"Memory: {initial_memory['rss_mb']:.1f}MB → {final_memory['rss_mb']:.1f}MB")

        # --- Persist the reply so the client can fetch it -------------------
        # exist_ok avoids a race between concurrent requests that both find
        # the directory missing.
        os.makedirs(app.config['TEMP_AUDIO_DIR'], exist_ok=True)

        file_id = str(uuid.uuid4())
        temp_filename = os.path.join(app.config['TEMP_AUDIO_DIR'], f"{file_id}.wav")
        with open(temp_filename, 'wb') as f:
            f.write(audio_response)

        with file_cleanup_lock:
            active_files[file_id] = {
                'filepath': temp_filename,
                'created_time': time.time(),
                'accessed': False
            }

        response_data = {
            'status': 'success',
            'file_id': file_id,
            'stream_url': f'/stream_audio/{file_id}',
            'message': assistant_response,
            'transcribed': transcribed_text,
            'processing_time': round(total_time, 2)
        }

        return jsonify(response_data)

    except Exception as e:
        logger.error(f"❌ Processing error: {e}", exc_info=True)
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

        # Error details are only exposed outside Hugging Face Spaces.
        return jsonify({
            "error": "Processing failed",
            "details": str(e) if not IS_HF_SPACE else "Internal server error"
        }), 500
462
-
463
@app.route('/stream_audio/<file_id>')
def stream_audio(file_id):
    """Serve the WAV file registered under ``file_id``.

    Marks the registry entry as accessed and returns the file as
    ``audio/wav``. Responds 404 when the id is unknown or the file is no
    longer on disk, and 500 when serving fails.
    """
    try:
        with file_cleanup_lock:
            entry = active_files.get(file_id)
            if entry is not None:
                entry['accessed'] = True
                audio_path = entry['filepath']

                if os.path.exists(audio_path):
                    logger.info(f"Streaming audio: {file_id}")
                    return send_file(
                        audio_path,
                        mimetype='audio/wav',
                        as_attachment=False,
                        download_name='response.wav'
                    )

        logger.warning(f"Audio file not found: {file_id}")
        return jsonify({'error': 'File not found'}), 404

    except Exception as e:
        logger.error(f"Stream error: {e}")
        return jsonify({'error': 'Stream failed'}), 500
486
-
487
@app.route('/health', methods=['GET'])
def health_check():
    """Return a JSON report of model readiness, memory use and tracked files."""
    memory = get_memory_usage()

    everything_loaded = all([stt_pipeline, llm_model, llm_tokenizer, tts_pipeline])
    worker_alive = cleanup_thread is not None and cleanup_thread.is_alive()

    models_section = {
        "stt": stt_pipeline is not None,
        "llm": llm_model is not None and llm_tokenizer is not None,
        "tts": tts_pipeline is not None,
        "tts_type": tts_type
    }
    system_section = {
        "device": device,
        "is_hf_space": IS_HF_SPACE,
        "memory_mb": round(memory['rss_mb'], 1),
        "available_mb": round(memory['available_mb'], 1),
        "memory_percent": round(memory['percent'], 1)
    }
    files_section = {
        "active_count": len(active_files),
        "cleanup_running": worker_alive
    }

    return jsonify({
        "status": "ready" if everything_loaded else "loading",
        "models": models_section,
        "system": system_section,
        "files": files_section
    })
513
-
514
@app.route('/status', methods=['GET'])
def simple_status():
    """Return ``{"ready": bool}``: true once every model handle is set."""
    required = (stt_pipeline, llm_model, llm_tokenizer, tts_pipeline)
    return jsonify({"ready": all(required)})
518
-
519
@app.route('/', methods=['GET'])
def home():
    """Serve the static landing page.

    The page lists the API endpoints and features and polls ``/status``
    every five seconds to show whether the models are ready.
    """
    return """
<!DOCTYPE html>
<html>
<head>
    <title>Voice AI Assistant</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        .status { font-size: 18px; margin: 20px 0; }
        .ready { color: green; }
        .loading { color: orange; }
        .error { color: red; }
        code { background: #f4f4f4; padding: 2px 5px; }
    </style>
</head>
<body>
    <h1>🎙️ Voice AI Assistant Server</h1>
    <div class="status">Status: <span id="status">Checking...</span></div>

    <h2>API Endpoints:</h2>
    <ul>
        <li><code>POST /process_audio</code> - Process audio (WAV format, max 16MB)</li>
        <li><code>GET /stream_audio/&lt;file_id&gt;</code> - Download audio response</li>
        <li><code>GET /health</code> - Detailed health check</li>
        <li><code>GET /status</code> - Simple ready status</li>
    </ul>

    <h2>Features:</h2>
    <ul>
        <li>Speech-to-Text (Whisper Tiny)</li>
        <li>AI Response Generation (FLAN-T5 Base)</li>
        <li>Text-to-Speech (gTTS)</li>
        <li>Automatic file cleanup</li>
        <li>Memory optimization</li>
    </ul>

    <p><em>Optimized for ESP32 and Hugging Face Spaces</em></p>

    <script>
        function updateStatus() {
            fetch('/status')
                .then(r => r.json())
                .then(d => {
                    const statusEl = document.getElementById('status');
                    if (d.ready) {
                        statusEl.textContent = '✅ Ready';
                        statusEl.className = 'ready';
                    } else {
                        statusEl.textContent = '⏳ Loading models...';
                        statusEl.className = 'loading';
                    }
                })
                .catch(() => {
                    document.getElementById('status').textContent = '❌ Error';
                    document.getElementById('status').className = 'error';
                });
        }

        updateStatus();
        setInterval(updateStatus, 5000);
    </script>
</body>
</html>
"""
585
-
586
@app.errorhandler(Exception)
def handle_exception(e):
    """Catch-all error handler: log the exception and answer with JSON 500.

    HTTP errors raised by the framework (404, 405, ...) are returned
    unchanged so they keep their own status code instead of being reported
    as 500.
    """
    # HTTP errors are recognised by shape (integer ``code`` plus a
    # ``get_response`` method) to avoid importing werkzeug directly.
    is_http_error = (
        isinstance(getattr(e, "code", None), int)
        and callable(getattr(e, "get_response", None))
    )
    if is_http_error:
        return e

    logger.error(f"Unhandled exception: {e}", exc_info=True)
    return jsonify({"error": "Internal server error"}), 500
590
-
591
@app.errorhandler(413)
def handle_large_file(e):
    """Answer oversized uploads (HTTP 413) with a JSON error body."""
    payload = {"error": "Audio file too large (max 16MB)"}
    return jsonify(payload), 413
594
-
595
if __name__ == '__main__':
    # Script entry point: load every model first, then start the Flask
    # server on $PORT (default 7860).
    try:
        logger.info("🚀 Starting Voice AI Assistant Server")
        logger.info(f"Environment: {'Hugging Face Spaces' if IS_HF_SPACE else 'Local'}")

        initialize_models()
        logger.info("🎉 Server ready!")

    except Exception as e:
        logger.error(f"❌ Startup failed: {e}")
        # SystemExit is used directly: the bare exit() helper comes from the
        # site module and is not guaranteed to exist.
        raise SystemExit(1)

    port = int(os.environ.get('PORT', 7860))
    logger.info(f"🌐 Server starting on port {port}")

    app.run(
        host='0.0.0.0',
        port=port,
        debug=False,
        threaded=True,
        use_reloader=False
    )