Marcos Claude committed on Aug 27, 2025

Commit

4f31f44

1 Parent(s): b0c4347

refactor: clean project structure to essentials only

- Renamed main server to server.py (single server)
- Created simplified start.sh script
- Removed 200+ redundant files:
  - 60+ duplicate test files
  - Unused directories (llama-omni2-official, llama_omni2_integration)
  - Old installation scripts
  - Development/debug files
- Kept only essential files:
  - server.py (main WebRTC server with GPU+vLLM)
  - install.sh (complete installation with exact versions)
  - start.sh (simple startup script)
  - llama_omni2/ (core implementation)
  - requirements.txt
- Model: Qwen3-0.6B with vLLM 0.8.4
- Performance: 378ms latency, 92% coherence

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.claude/commands/benchmark.md +61 -0
.claude/commands/debug-pipeline.md +92 -0
.claude/commands/optimize-model.md +48 -0
.claude/commands/start-webrtc.md +26 -0
.claude/commands/test-latency.md +21 -0
ANALISE_MODIFICACOES.md +0 -128
CLAUDE.md +0 -322
COMMUNICATION_LATENCY_ANALYSIS_REPORT.md +0 -302
DUAL_MODEL_USAGE.md +0 -83
INSTALLATION_GUIDE.md +0 -133
PACKAGES_INSTALLED.md +0 -74
PLANO_INTEGRACAO_REALTIMETTS.md +0 -194
PLANO_TROCA_LLM_TTS_EXTERNO.md +0 -261
README.md +0 -394
README_INSTALLATION.md +0 -184
RELATORIO_VIABILIDADE_TROCA_QWEN_MULTILINGUE.md +0 -302
SPEECH_PROJECTOR_ANALYSIS.md +0 -110
analyze_generated_audio.py +0 -227
benchmark_20q_gpu_final.py +0 -312
coherence_test_results.json +0 -82
communication_analysis_report.py +0 -388
communication_latency_test.py +0 -370
create_real_speech.py +0 -100
create_test_audio.py +0 -94
docs/A1_VOCABULARY_CONTROL_TECHNIQUES.md +0 -587
download_llama_omni2.py +0 -25
download_official_model.py +0 -13
generate_test_audios.py +0 -81
gtts_test_results.json +0 -82
installed_packages.txt +0 -246
llama_omni2_integration/__init__.py +0 -1
llama_omni2_integration/constants.py +0 -9
llama_omni2_integration/omni2_speech_arch.py +0 -201
llama_omni2_integration/qwen2_speech_model.py +0 -155
llama_omni2_integration/qwen2_speech_model_fixed.py +0 -294
llama_omni2_integration/speech_encoder/__init__.py +0 -1
llama_omni2_integration/speech_encoder/builder.py +0 -9
llama_omni2_integration/speech_encoder/speech_encoder.py +0 -26
llama_omni2_integration/speech_projector/__init__.py +0 -1
llama_omni2_integration/speech_projector/builder.py +0 -9
llama_omni2_integration/speech_projector/speech_projector.py +0 -30
load_speech_projector.py +0 -184
webrtc_server_gpu_vllm.py → server.py +0 -0
simple_speech_chat_torchcompiled.py +0 -230
start.sh +51 -246
stop.sh +0 -76
streaming_latency_test.py +0 -262
system_prompt_v2.md +0 -94
test_100_questions_final.py +0 -401
test_100_questions_final_v1.py +0 -413

.claude/commands/benchmark.md ADDED Viewed

	@@ -0,0 +1,61 @@

+# Benchmark Performance Command
+Run comprehensive performance benchmarks on the speech-to-speech system.
+## Benchmark Tests:
+### 1. Model Loading Time
+```bash
+python -c "
+import time
+start = time.time()
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen3-0.6B', cache_dir='/tmp/hf_cache')
+print(f'Load time: {time.time() - start:.2f}s')
+"
+```
+### 2. Inference Speed
+```bash
+python test_optimized_cpu_inference.py
+```
+### 3. Full Pipeline
+```bash
+python test_optimized_speech_to_speech.py
+```
+### 4. WebRTC Latency
+```bash
+python test_webrtc_optimized.py
+```
+## Metrics to Track:
+- Model load time (target: <5s)
+- Inference latency (target: <3s)
+- Audio generation (target: <0.5s)
+- Total pipeline (target: <5s)
+- Memory usage (monitor with htop)
+## Generate Report:
+```python
+results = {
+    'model_load': load_time,
+    'inference': inference_time,
+    'audio_in': audio_in_time,
+    'audio_out': audio_out_time,
+    'total': total_time
+}
+print(f"""
+Performance Report
+==================
+Model Load: {results['model_load']:.2f}s
+Inference: {results['inference']:.2f}s
+Audio Input: {results['audio_in']:.2f}s
+Audio Output: {results['audio_out']:.2f}s
+Total Pipeline: {results['total']:.2f}s
+Status: {'✅ PASS' if results['total'] < 5 else '❌ FAIL'}
+""")
+```

.claude/commands/debug-pipeline.md ADDED Viewed

	@@ -0,0 +1,92 @@

+# Debug Pipeline Command
+Debug the speech-to-speech pipeline step by step.
+## Debug Steps:
+### 1. Check Environment
+```bash
+python -c "
+import sys
+print('Python:', sys.version)
+import torch
+print('PyTorch:', torch.__version__)
+import transformers
+print('Transformers:', transformers.__version__)
+"
+```
+### 2. Test Audio Input
+```python
+from gtts import gTTS
+import tempfile
+import soundfile as sf
+text = "Teste de áudio"
+tts = gTTS(text=text, lang='pt')
+with tempfile.NamedTemporaryFile(suffix='.mp3') as f:
+    tts.save(f.name)
+    audio, sr = sf.read(f.name)
+    print(f"Audio shape: {audio.shape}, Sample rate: {sr}")
+```
+### 3. Test Model Loading
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained(
+    "Qwen/Qwen3-0.6B",
+    trust_remote_code=True,
+    cache_dir="/tmp/hf_cache"
+)
+print("✅ Tokenizer loaded")
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen3-0.6B",
+    torch_dtype=torch.float32,
+    trust_remote_code=True,
+    cache_dir="/tmp/hf_cache"
+)
+print("✅ Model loaded")
+```
+### 4. Test Inference
+```python
+prompt = "P: Qual é a capital do Brasil?\nR:"
+inputs = tokenizer(prompt, return_tensors="pt")
+with torch.no_grad():
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=15,
+        do_sample=False
+    )
+response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(f"Response: {response}")
+```
+### 5. Test WebRTC
+```python
+import asyncio
+from aiortc import RTCPeerConnection, RTCConfiguration, RTCIceServer
+async def test_webrtc():
+    ice_servers = [RTCIceServer(urls=["stun:stun.l.google.com:19302"])]
+    config = RTCConfiguration(iceServers=ice_servers)
+    pc = RTCPeerConnection(configuration=config)
+    channel = pc.createDataChannel("test")
+    print("✅ WebRTC components working")
+    await pc.close()
+asyncio.run(test_webrtc())
+```
+## Common Issues:
+| Error | Solution |
+|-------|----------|
+| ModuleNotFoundError | Check virtual env activation |
+| CUDA not available | Use CPU optimizations |
+| High latency | Enable torch.compile() |
+| WebRTC fails | Check port availability |

.claude/commands/optimize-model.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# Optimize Model Command
+Apply all optimizations to achieve <3s latency.
+## Optimization Checklist:
+- [ ] Enable torch.compile()
+- [ ] Set max_new_tokens=15
+- [ ] Use greedy decoding (do_sample=False)
+- [ ] Enable KV cache
+- [ ] Model in eval mode
+- [ ] Warm-up model before first use
+## Code Template:
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
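+# Load tokenizer (used by the warm-up and generate_fast below)
+tokenizer = AutoTokenizer.from_pretrained(
+    "Qwen/Qwen3-0.6B",
+    trust_remote_code=True,
+    cache_dir="/tmp/hf_cache"
+)
+# Load model with optimizations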
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen3-0.6B",
+    torch_dtype=torch.float32,
+    trust_remote_code=True,
+    cache_dir="/tmp/hf_cache",
+    low_cpu_mem_usage=True
+)
+# Apply optimizations
+model.eval()
+model = torch.compile(model, mode="reduce-overhead")
+# Warm-up
+with torch.no_grad():
+    warm_input = tokenizer("Test", return_tensors="pt")
+    _ = model.generate(**warm_input, max_new_tokens=5)
+# Optimized generation
+def generate_fast(text):
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=15,
+            do_sample=False,
+            num_beams=1,
+            use_cache=True
+        )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+```

.claude/commands/start-webrtc.md ADDED Viewed

	@@ -0,0 +1,26 @@

+# Start WebRTC Server Command
+Launch the WebRTC server for real-time speech communication.
+## Steps:
+1. Check if port 8081 is available
+2. Activate virtual environment
+3. Start the unified WebRTC server
+4. Monitor for connection
+## Command:
+```bash
+cd /tmp/llama-omni2-official-code
+source /tmp/llama-omni2-vllm-env/bin/activate
+python unified_webrtc_server.py
+```
+## Ports:
+- WebRTC: 8081
+- WebSocket: ws://localhost:8081/ws
+- HTTP: http://localhost:8081
+## Health Check:
+```bash
+curl http://localhost:8081/health
+```

.claude/commands/test-latency.md ADDED Viewed

	@@ -0,0 +1,21 @@

+# Test Latency Command
+Run the optimized speech-to-speech pipeline test to verify latency is under 3 seconds.
+## Steps:
+1. Activate the virtual environment
+2. Run the optimized test script
+3. Verify latency metrics
+4. Report results
+## Command:
+```bash
+cd /tmp/llama-omni2-official-code
+source /tmp/llama-omni2-vllm-env/bin/activate
+python test_optimized_speech_to_speech.py
+```
+## Success Criteria:
+- Average inference time < 3s
+- Full pipeline < 5s
+- All test questions answered correctly

ANALISE_MODIFICACOES.md DELETED Viewed

@@ -1,128 +0,0 @@
-# 📋 Análise de Modificações - WebRTC + gRPC
-## 🟢 Arquivos para MANTER (Essenciais)
-### 1. **test_webrtc_latency.py** ✅
-- **Motivo**: Teste principal da integração WebRTC
-- **Funcionalidade**: Testa pipeline completa WebRTC → Worker → TTS
-- **Status**: Funcionando perfeitamente
-- **Decisão**: **MANTER E COMMITAR**
-### 2. **webrtc_client_aiortc.py** ✅
-- **Motivo**: Implementação com aiortc (framework solicitado)
-- **Funcionalidade**: Cliente WebRTC nativo usando aiortc
-- **Status**: Funcionando, latência de 2-6s
-- **Decisão**: **MANTER E COMMITAR**
-### 3. **test_webrtc_benchmark.py** ✅
-- **Motivo**: Benchmark de performance (5 requisições)
-- **Funcionalidade**: Análise estatística de latência e consistência
-- **Resultados**: 100% sucesso, média 4.5s, CV 33%
-- **Decisão**: **MANTER E COMMITAR**
-### 4. **llama_omni2/serve/webrtc_server.py** ⚠️
-- **Motivo**: Servidor WebRTC integrado com Worker gRPC
-- **Modificações principais**:
-  - ✅ Integração com Worker gRPC real
-  - ✅ Import base64 adicionado
-  - ✅ WebRTCAudioProcessor conecta ao Worker
-  - ✅ handle_audio_data implementado
-  - ✅ WebSocket tracking adicionado
-  - ✅ Path estático corrigido
-- **Decisão**: **MANTER MODIFICAÇÕES**
-## 🔴 Arquivos para REMOVER/REVERTER
-### 1. **response_webrtc.wav**
-- **Motivo**: Arquivo de teste temporário
-- **Decisão**: **REMOVER** (se existir)
-### 2. **response_aiortc.wav**
-- **Motivo**: Arquivo de teste temporário
-- **Decisão**: **REMOVER** (se existir)
-## 📊 Resumo das Modificações no webrtc_server.py
-### Mudanças ESSENCIAIS (manter):
-```python
-# 1. Import base64 (linha 13)
-import base64
-# 2. Import gRPC Worker (linhas 24-27)
-import worker_service_pb2
-import worker_service_pb2_grpc
-# 3. Conexão real com Worker (linhas 37-44)
-self.worker_channel = grpc.insecure_channel(f'{worker_host}:{worker_port}')
-self.worker_stub = worker_service_pb2_grpc.ModelWorkerStub(self.worker_channel)
-# 4. handle_audio_data implementado (linhas 256-292)
-async def handle_audio_data(self, data, connection_id):
-    # Processa áudio via WebSocket
-    # Envia para Worker gRPC
-    # Retorna resposta
-# 5. WebSocket tracking (linhas 148-149)
-self.pending_responses = {}
-self.websockets = {}
-# 6. Path estático corrigido (linhas 369-370)
-static_path = os.path.join(os.path.dirname(__file__), "static")
-```
-### Mudanças OPCIONAIS (avaliar):
-- Remoção do grpc_adapter (substituído por conexão direta)
-- gTTS como fallback (até integrar CosyVoice real)
-## 🎯 Recomendações
-### Para Commitar:
-1. ✅ test_webrtc_latency.py
-2. ✅ webrtc_client_aiortc.py
-3. ✅ test_webrtc_benchmark.py
-4. ✅ llama_omni2/serve/webrtc_server.py (modificado)
-### Comando sugerido:
-```bash
-# Adicionar arquivos essenciais
-git add test_webrtc_latency.py
-git add webrtc_client_aiortc.py
-git add test_webrtc_benchmark.py
-git add llama_omni2/serve/webrtc_server.py
-# Commit com mensagem descritiva
-git commit -m "feat: Implementação completa WebRTC com aiortc
-- Integração WebRTC Server com Worker gRPC
-- Cliente aiortc para WebRTC nativo
-- Testes de latência e benchmark (5 requisições)
-- Pipeline completa: Audio → WebRTC → Worker → TTS → Audio
-- Latência média: 4.5s (100% sucesso)
-🤖 Generated with Claude Code
-Co-Authored-By: Claude <noreply@anthropic.com>"
-```
-## 📈 Métricas de Performance
-| Componente | Latência | Status |
-|------------|----------|--------|
-| WebRTC Handshake | ~50ms | ✅ |
-| Worker gRPC | 2.7-6.3s | ⚠️ |
-| TTS (gTTS) | ~500ms | ✅ |
-| **Total Pipeline** | 3.1-6.7s | ⚠️ |
-## 🔍 Problemas Conhecidos
-1. **Latência variável no Worker**: 2.7s a 6.3s
-2. **Warmup perde efeito**: Performance degrada após várias requisições
-3. **CosyVoice não integrado**: Usando gTTS como fallback
-## ✅ Conquistas
-1. ✅ WebRTC com aiortc funcionando
-2. ✅ Integração completa com Worker gRPC
-3. ✅ Pipeline speech-to-speech funcional
-4. ✅ 100% taxa de sucesso
-5. ✅ Testes e benchmarks implementados

CLAUDE.md DELETED Viewed

@@ -1,322 +0,0 @@
-# LLaMA-Omni2 Speech-to-Speech System - Claude Code Configuration
-## 🎯 Project Overview
-End-to-end speech-to-speech conversation system implementing the official LLaMA-Omni2 architecture with **Qwen3-0.6B** for Portuguese language support. Achieves **<500ms inference latency** with GPU + vLLM optimization.
-## 🏗️ Architecture Pipeline
-```
-🎤 Audio → Whisper(GPU) → Speech Projector → Qwen3-0.6B(vLLM) → gTTS → 🔊 Audio
-              ↓                    ↓                ↓
-         (embeddings)      (feature mapping)   (generation)
-```
-### Key Components
-- **Whisper Encoder**: GPU-accelerated, extracts embeddings (no transcription)
-- **Speech Projector**: EncoderProjectorConcat with k=5 downsampling
-- **LLM**: Qwen3-0.6B with vLLM 0.8.4+ (378ms latency achieved!)
-- **TTS**: gTTS for Portuguese audio
-- **WebRTC**: Real-time communication on port 8888
-## 🚀 Performance Metrics
-### Latest Benchmarks (GPU + vLLM)
-| Configuration | Latency | Improvement |
-|--------------|---------|-------------|
-| CPU Baseline | 3610ms | - |
-| GPU PyTorch | 459ms | 87% faster |
-| **GPU + vLLM** | **378ms** | **89.5% faster** |
-| WebRTC End-to-End | 532ms | Slightly above target |
-| Target | <500ms | ✅ ACHIEVED |
-### Quality Test Results (50 Questions)
-- **Overall Coherence**: 92% (46/50 correct)
-- **Latency per question**: 65ms average
-- **Categories**:
-  - Saudações: 100% (10/10)
-  - Conhecimento: 90% (9/10)
-  - Matemática: 70% (7/10)
-  - Conversação: 100% (10/10)
-  - Tempo/Data: 100% (5/5)
-  - Despedidas: 100% (5/5)
-## 📦 Tech Stack
-### Core Dependencies
-```yaml
-Python: 3.10-3.12
-PyTorch: 2.6.0 (CUDA 12.1)
-vLLM: 0.8.4+ (REQUIRED for Qwen3)
-Transformers: 4.55.4
-CUDA: 12.1+
-GPU: NVIDIA RTX 3060 (12GB VRAM)
-```
-### Model Requirements
-```yaml
-Model: Qwen/Qwen3-0.6B (NOT Qwen2!)
-vLLM: >= 0.8.4 (Qwen3ForCausalLM support)
-Memory: ~1.2GB model weights
-Dtype: float16 for GPU
-```
-## 📁 Project Structure
-```
-/tmp/llama-omni2-official-code/
-├── llama_omni2/
-│   ├── model/
-│   │   ├── language_model/
-│   │   │   └── omni2_speech_qwen2.py    # Core implementation
-│   │   └── speech_projector/
-│   │       └── speech_projector.py      # Feature projection
-│   └── serve/
-│       └── tts/                         # TTS services
-├── vLLM_GPU_configs/
-│   ├── webrtc_server_gpu_vllm.py       # Production server ⭐
-│   ├── test_gpu_vllm.py                # GPU benchmark
-│   ├── test_gpu_alternatives.py        # PyTorch vs vLLM
-│   └── benchmark_20q_gpu_final.py      # Full benchmark
-├── quality_tests/
-│   ├── test_qwen3_vllm_fixed.py        # Qwen3 validation ⭐
-│   ├── test_qwen3_quality_50.py        # 50 questions test
-│   └── test_webrtc_quality_50questions.py
-├── scripts/
-│   ├── setup_gpu_vllm.sh               # GPU setup script
-│   └── install_vllm_direct.sh          # CPU fallback
-└── reports/
-    ├── qwen3_quality_report.json       # 92% coherence
-    └── webrtc_quality_report.json      # WebRTC metrics
-```
-## 🚀 Commands & Operations
-### Environment Setup (GPU + vLLM)
-```bash
-# Create vLLM environment
-python3 -m venv /tmp/llama-omni2-vllm-env
-source /tmp/llama-omni2-vllm-env/bin/activate
-# Install vLLM 0.8.4+ (REQUIRED for Qwen3)
-pip install vllm==0.8.4 --index-url https://download.pytorch.org/whl/cu121
-# Verify GPU
-nvidia-smi  # Should show RTX 3060
-python -c "import torch; print(torch.cuda.is_available())"
-```
-### Server Operations
-```bash
-# Start WebRTC server with GPU + vLLM (Production) ⭐
-cd /tmp/llama-omni2-official-code
-python webrtc_server_gpu_vllm.py  # Port 8888
-# Test endpoint
-curl http://localhost:8888/test?text="Olá"
-# Check stats
-curl http://localhost:8888/stats
-```
-### Testing & Benchmarks
-```bash
-# Test Qwen3 with vLLM ⭐
-python test_qwen3_vllm_fixed.py
-# Quality test (50 questions)
-python test_qwen3_quality_50.py
-# WebRTC quality test
-python test_webrtc_quality_50questions.py
-# GPU vs CPU benchmark
-python benchmark_20q_gpu_final.py
-```
-## 💻 Code Patterns & Best Practices
-### vLLM Configuration (Qwen3)
-```python
-from vllm import LLM, SamplingParams
-# MUST use vLLM 0.8.4+ for Qwen3
-llm = LLM(
-    model="Qwen/Qwen3-0.6B",  # NOT Qwen2!
-    trust_remote_code=True,
-    dtype="float16",
-    gpu_memory_utilization=0.60,
-    max_model_len=256,
-    enforce_eager=True,  # For multiprocessing
-    tensor_parallel_size=1
-)
-sampling_params = SamplingParams(
-    max_tokens=30,
-    temperature=0.7,
-    top_p=0.9
-)
-```
-### Multiprocessing Fix (Required)
-```python
-if __name__ == "__main__":
-    import multiprocessing
-    multiprocessing.set_start_method('spawn', force=True)
-    main()
-```
-### Whisper GPU Configuration
-```python
-import whisper
-# Load on GPU
-model = whisper.load_model("base", device="cuda")
-# Process with n_mels=80 (NOT 128!)
-audio_30s = whisper.pad_or_trim(audio)
-mel = whisper.log_mel_spectrogram(audio_30s)  # n_mels=80
-mel = mel.cuda()
-# Get embeddings only (no transcription)
-with torch.no_grad():
-    embeddings = model.encoder(mel.unsqueeze(0))
-```
-### WebRTC Server Pattern
-```python
-# Standard configuration
-SERVER_URL = "http://localhost:8888"
-SAMPLE_RATE = 16000
-# Process audio with metrics
-async def process_audio(audio_data: np.ndarray):
-    start = time.perf_counter()
-    # 1. Whisper (GPU)
-    embeddings = whisper_encode(audio_data)
-    # 2. vLLM inference
-    response = llm.generate([prompt], sampling_params)
-    # 3. TTS
-    audio_out = generate_tts(response)
-    latency = (time.perf_counter() - start) * 1000
-    return audio_out, {"latency_ms": latency}
-```
-## 🔧 Environment Variables
-```bash
-export HF_HOME=/tmp/hf_cache
-export CUDA_VISIBLE_DEVICES=0
-export VLLM_WORKER_MULTIPROC_METHOD=spawn
-export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-```
-## 🐛 Troubleshooting Guide
-### Common Issues & Solutions
-| Issue | Solution |
-|-------|----------|
-| "Qwen3ForCausalLM not supported" | Update to vLLM >= 0.8.4 |
-| CUDA out of memory | Reduce gpu_memory_utilization to 0.5 |
-| Multiprocessing error | Add `multiprocessing.set_start_method('spawn')` |
-| High latency (>500ms) | Check GPU is enabled, use vLLM not PyTorch |
-| Meta tensor error | Set `config.speech_encoder = "base"` |
-| Whisper dtype error | Use n_mels=80, not 128 |
-### GPU Memory Management
-```python
-# Clear GPU cache when needed
-torch.cuda.empty_cache()
-# Monitor usage
-print(f"Allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")
-print(f"Reserved: {torch.cuda.memory_reserved()/1e9:.2f}GB")
-```
-## ⚠️ Critical Rules - DO NOT
-- ❌ Use Qwen2/Qwen2.5 (ONLY Qwen3-0.6B works correctly)
-- ❌ Use vLLM < 0.8.4 (no Qwen3 support)
-- ❌ Transcribe audio to text (use embeddings directly)
-- ❌ Use n_mels=128 in Whisper (causes dtype errors)
-- ❌ Skip multiprocessing spawn setup (causes crashes)
-- ❌ Create new test files without permission
-- ❌ Modify working configurations without backup
-## 📊 Key Files Reference
-### Production Ready
-```yaml
-Server:
-  webrtc_server_gpu_vllm.py      # Main server (port 8888) ⭐
-Tests:
-  test_qwen3_vllm_fixed.py        # Validates Qwen3 + vLLM
-  test_qwen3_quality_50.py        # 92% coherence achieved
-Benchmarks:
-  benchmark_20q_gpu_final.py      # GPU vs CPU comparison
-  test_gpu_alternatives.py        # PyTorch vs vLLM
-```
-### Configuration Files
-```yaml
-Scripts:
-  setup_gpu_vllm.sh              # GPU installation
-Reports:
-  qwen3_quality_report.json      # Latest quality metrics
-  webrtc_quality_report.json     # WebRTC performance
-```
-## 🎯 Current Status & Next Steps
-### ✅ Completed (100%)
-- Qwen3-0.6B with vLLM 0.8.4 working
-- GPU acceleration (RTX 3060) configured
-- Latency <500ms achieved (378ms average)
-- 92% response coherence rate
-- WebRTC server operational (port 8888)
-- 50 questions quality test passed
-### 🚧 Next Steps
-1. Deploy to production environment
-2. Add conversation memory/context
-3. Implement streaming responses
-4. Add Portuguese-specific fine-tuning
-5. Create Docker container
-6. Add monitoring dashboard
-## 📝 Instructions for Claude Code
-### When Working on This Project:
-1. **ALWAYS use vLLM 0.8.4+** - Required for Qwen3 support
-2. **NEVER change to Qwen2** - Only Qwen3-0.6B is validated
-3. **Check GPU first** - System requires CUDA for <500ms latency
-4. **Use existing patterns** - Don't reinvent tested code
-5. **Test before committing** - Run quality tests first
-6. **Preserve optimizations** - Don't remove torch.compile, vLLM, etc.
-### Performance Checklist:
-- [ ] GPU enabled (`nvidia-smi` shows usage)
-- [ ] vLLM 0.8.4+ installed
-- [ ] Qwen3-0.6B model (NOT Qwen2)
-- [ ] Whisper on GPU with n_mels=80
-- [ ] Max tokens limited (30-50)
-- [ ] Temperature 0.7 or lower
-- [ ] Multiprocessing spawn configured
-## 🏆 Achievement Summary
-This project successfully implements LLaMA-Omni2 with:
-- **89.5% latency reduction** (3610ms → 378ms)
-- **92% response quality** (46/50 correct answers)
-- **Production ready** WebRTC server
-- **GPU optimized** with vLLM 0.8.4
-- **Real-time capable** (<500ms guaranteed)
----
-*Last updated: After vLLM 0.8.4 upgrade for Qwen3 support*
-*Configuration validated with 50 questions quality test*

COMMUNICATION_LATENCY_ANALYSIS_REPORT.md DELETED Viewed

@@ -1,302 +0,0 @@
-# LLaMA-Omni2 Communication Latency Analysis Report
-## Executive Summary
-This comprehensive analysis examines the current HTTP/REST communication architecture in LLaMA-Omni2 and compares it with the potential performance benefits of gRPC migration.
-### Key Findings
-- **Current System**: HTTP/1.1 REST with JSON serialization
-- **Measured HTTP Overhead**: ~55-88ms per request cycle
-- **gRPC Reference**: ~50ms per request (based on your tests)
-- **Potential Improvement**: 9-40% latency reduction
-- **Migration Priority**: MEDIUM (moderate impact, manageable effort)
----
-## Current Communication Architecture
-### 1. Protocol Stack
-```
-Application Layer:  JSON serialization/deserialization
-Transport Layer:    HTTP/1.1 over TCP
-Network Layer:      Standard TCP/IP
-```
-### 2. Service Architecture
-- **Controller Service** (Port 21001): Manages workers, routes requests
-- **Worker Services** (Port 21002+): Execute models, process requests
-- **Web Server**: Gradio UI handling user interactions
-### 3. Communication Flow
-```
-User Request → Gradio Web Server → Controller → Worker → Response Stream
-     ↓              ↓                  ↓         ↓
-  1. Audio       2. Get Worker     3. Process  4. Stream
-     Upload         Address          Request     Response
-```
-### 4. Key Endpoints Analysis
-#### Controller Endpoints (`/workspace/llama-omni2-official-code/llama_omni2/serve/controller.py`)
-- `/list_models` - Model discovery
-- `/get_worker_address` - Load balancing/service discovery
-- `/register_worker` - Worker registration
-- `/receive_heart_beat` - Health monitoring
-- `/worker_generate_stream` - Request proxying
-#### Worker Endpoints (`/workspace/llama-omni2-official-code/llama_omni2/serve/model_worker.py`)
-- `/worker_generate_stream` - Main inference endpoint
-- `/worker_get_status` - Health/status reporting
----
-## Latency Source Analysis
-### 1. HTTP Protocol Overhead (15-20ms per request)
-- **TCP Connection Setup**: 3-5ms
-- **HTTP Header Parsing**: 2-4ms
-- **Request/Response Overhead**: 5-10ms
-- **Connection Teardown**: 1-3ms
-### 2. JSON Serialization (5-15ms per request)
-- **Request Serialization**: 2-7ms (depends on payload size)
-- **Response Deserialization**: 3-8ms (varies with response complexity)
-### 3. Service Discovery Overhead (10-30ms per request)
-Based on code analysis in `gradio_web_server.py` lines 95-97:
-```python
-ret = requests.post(controller_url + "/get_worker_address", json={"model": model_name})
-worker_addr = ret.json()["address"]
-```
-### 4. Network Round-trips (5-25ms per request)
-- **Local Network**: 1-5ms (typical for localhost/LAN)
-- **Remote Network**: 10-50ms (depends on network conditions)
-### 5. HTTP Streaming Setup (10-25ms per stream)
-From `gradio_web_server.py` line 119-120:
-```python
-response = requests.post(worker_addr + "/worker_generate_stream",
-                         headers=headers, json=pload, stream=True, timeout=20)
-```
-### 6. Current Timeout Configurations
-- **Controller→Worker**: 5s (`controller.py:90`)
-- **Web→Worker**: 20s (`gradio_web_server.py:120`)
-- **Heartbeat**: 5s (`model_worker.py:106`)
-- **Streaming**: 15s (`model_worker.py:195-196`)
----
-## Performance Measurements
-### Current HTTP/REST Latency Breakdown
-| Component | Estimated Latency | Source |
-|-----------|------------------|---------|
-| Service Discovery | 15-25ms | Controller query + response |
-| HTTP Connection Setup | 10-15ms | TCP handshake + HTTP headers |
-| JSON Serialization | 5-12ms | Request/response processing |
-| Network Round-trips | 5-15ms | Local network (2-3 round-trips) |
-| Streaming Setup | 8-20ms | HTTP chunked encoding |
-| **Total per Request** | **43-87ms** | **Combined overhead** |
-### gRPC Performance Comparison
-| Metric | HTTP/REST | gRPC | Improvement |
-|--------|-----------|------|-------------|
-| Protocol Overhead | 15ms | 3ms | 80% |
-| Serialization | 8ms | 2ms | 75% |
-| Connection Setup | 12ms | 5ms | 58% |
-| Service Discovery | 20ms | 15ms | 25% |
-| Streaming Setup | 25ms | 8ms | 68% |
-| **Total** | **80ms** | **33ms** | **59%** |
----
-## Detailed Code Analysis
-### 1. Current Communication Patterns
-#### Request Flow Pattern (from `gradio_web_server.py:90-121`)
-```python
-# 1. Service Discovery Request
-ret = requests.post(controller_url + "/get_worker_address", json={"model": model_name})
-worker_addr = ret.json()["address"]
-# 2. Status Check Request
-worker_status = requests.post(worker_addr + "/worker_get_status").json()
-# 3. Main Processing Request
-response = requests.post(worker_addr + "/worker_generate_stream",
-                         headers=headers, json=pload, stream=True, timeout=20)
-```
-**Analysis**: This creates 3 separate HTTP requests for each user interaction, each with full connection overhead.
-#### Heartbeat Pattern (from `model_worker.py:95-114`)
-```python
-def send_heart_beat(self):
-    url = self.controller_addr + "/receive_heart_beat"
-    ret = requests.post(url, json={
-        "worker_name": self.worker_addr,
-        "queue_length": self.get_queue_length()}, timeout=5)
-```
-**Analysis**: Regular heartbeats every 15 seconds (configurable) using separate HTTP requests.
-### 2. Streaming Implementation
-Current streaming uses HTTP chunked transfer encoding:
-```python
-for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
-    if chunk:
-        data = json.loads(chunk.decode())
-        # Process streaming data
-```
-**Issues Identified**:
-- JSON parsing overhead for each chunk
-- Binary delimiter (`b"\0"`) requires additional processing
-- No built-in compression
-- Single-directional streaming only
----
-## gRPC Migration Benefits
-### 1. Protocol Advantages
-- **HTTP/2**: Binary protocol, header compression, multiplexing
-- **Protocol Buffers**: 3-10x faster than JSON serialization
-- **Native Streaming**: Bidirectional, lower overhead
-- **Connection Reuse**: Single persistent connection per service
-### 2. Performance Improvements
-- **Latency Reduction**: 59% improvement (80ms → 33ms)
-- **Throughput Increase**: ~2.4x requests per second
-- **Resource Efficiency**: Lower CPU/memory usage
-- **Network Efficiency**: Built-in compression, fewer round-trips
-### 3. Operational Benefits
-- **Type Safety**: Protocol buffer schemas
-- **Better Error Handling**: Structured gRPC status codes
-- **Built-in Monitoring**: gRPC metrics and tracing
-- **Load Balancing**: Native gRPC load balancing
----
-## Migration Strategy
-### Phase 1: Preparation (3-5 days)
-1. **Define Protocol Buffers**
-   ```protobuf
-   service LlamaOmniController {
-     rpc GetWorkerAddress(WorkerRequest) returns (WorkerResponse);
-     rpc ListModels(Empty) returns (ModelList);
-     rpc RegisterWorker(WorkerInfo) returns (RegisterResponse);
-   }
-   service LlamaOmniWorker {
-     rpc GenerateStream(StreamRequest) returns (stream StreamResponse);
-     rpc GetStatus(Empty) returns (WorkerStatus);
-   }
-   ```
-2. **Generate gRPC Code**
-3. **Setup gRPC Dependencies**
-### Phase 2: Controller Migration (5-7 days)
-1. Implement gRPC server for controller
-2. Maintain HTTP endpoints for backward compatibility
-3. Add gRPC client in workers
-4. Test dual-mode operation
-### Phase 3: Worker Migration (4-6 days)
-1. Implement gRPC streaming for workers
-2. Update model inference pipeline
-3. Migrate heartbeat mechanism
-4. Performance testing
-### Phase 4: Client Integration (3-4 days)
-1. Update Gradio web server gRPC client
-2. Remove HTTP fallback code
-3. Final performance validation
-4. Documentation updates
-### Total Estimated Effort: 15-22 days
----
-## Business Impact Analysis
-### User Experience
-- **Latency Improvement**: 80ms → 33ms (~59% faster)
-- **Perceived Performance**: Near real-time responses
-- **User Satisfaction**: +35-40% improvement expected
-### System Capacity
-- **Concurrent Users**: +59% capacity (due to lower latency)
-- **Resource Utilization**: -25% CPU usage (more efficient serialization)
-- **Infrastructure Cost**: -20-30% potential savings
-### Competitive Advantage
-- **Response Time**: Industry-leading low-latency performance
-- **Scalability**: Better handling of concurrent requests
-- **Reliability**: More robust error handling and recovery
----
-## Recommendations
-### Priority Assessment: **MEDIUM-HIGH**
-- **Performance Impact**: Significant (59% latency reduction)
-- **Implementation Complexity**: Moderate
-- **Risk Level**: Low-Medium (well-established technology)
-- **Business Value**: High (user experience + cost savings)
-### Immediate Actions
-1. **Prototype Development**: Build gRPC proof-of-concept
-2. **Performance Benchmarking**: Measure actual improvements
-3. **Team Training**: gRPC/Protocol Buffers knowledge transfer
-4. **Planning**: Detailed migration timeline and resource allocation
-### Long-term Strategy
-1. **Complete Migration**: Full gRPC implementation
-2. **Advanced Features**: Implement gRPC streaming optimizations
-3. **Monitoring**: gRPC-specific metrics and alerting
-4. **Future-proofing**: Consider gRPC-Web for browser clients
----
-## Risk Assessment
-### Technical Risks (Low-Medium)
-- **Learning Curve**: Team familiarity with gRPC
-- **Debugging Complexity**: Binary protocol harder to debug
-- **Dependency Management**: Additional gRPC libraries
-### Mitigation Strategies
-- **Gradual Migration**: Phase-based approach with rollback capability
-- **Extensive Testing**: Comprehensive test coverage
-- **Documentation**: Clear migration and troubleshooting guides
-- **Monitoring**: Enhanced observability during migration
----
-## Conclusion
-The analysis reveals that migrating from HTTP/REST to gRPC would provide **significant performance benefits** with **manageable implementation complexity**. The estimated **59% latency reduction** (80ms → 33ms) would substantially improve user experience and system efficiency.
-**Recommended Next Steps**:
-1. Approve gRPC migration project
-2. Allocate development resources (3-4 weeks)
-3. Begin with Protocol Buffer definition
-4. Implement phased migration approach
-The investment in gRPC migration aligns with industry best practices for high-performance, low-latency systems and positions LLaMA-Omni2 for future scalability requirements.
----
-*Report Generated: 2025-01-21*
-*Analysis Coverage: Complete codebase communication patterns*
-*Confidence Level: High (based on code analysis and industry benchmarks)*

DUAL_MODEL_USAGE.md DELETED Viewed

@@ -1,83 +0,0 @@
-# 🎯 Uso de Múltiplos Modelos Qwen3
-## Como Funciona
-O sistema agora suporta **seleção de modelo por requisição**. O cliente pode escolher qual modelo usar enviando o parâmetro `model` na mensagem.
-## Modelos Disponíveis
-- **`qwen3-1.7b`**: Modelo rápido (1.7B parâmetros) - Latência < 100ms
-- **`qwen3-4b`**: Modelo preciso (4B parâmetros) - Respostas mais completas
-- **`default`**: Modelo padrão configurado no servidor
-## Como Usar
-### 1. Iniciar o Servidor
-```bash
-# Servidor carrega AMBOS os modelos na inicialização
-python unified_webrtc_server_speech.py --port 8080
-```
-### 2. Cliente Especifica o Modelo
-#### Opção A: Enviar JSON com modelo
-```python
-# Cliente envia JSON especificando modelo
-message = {
-    "audio": base64_encoded_audio,
-    "model": "qwen3-1.7b"  # ou "qwen3-4b" ou "default"
-}
-channel.send(json.dumps(message))
-```
-#### Opção B: Enviar binário (usa default)
-```python
-# Envio binário sempre usa modelo default
-channel.send(audio_bytes)
-```
-### 3. Exemplo Completo
-```bash
-# test_dual_model_client.py mostra uso completo
-python test_dual_model_client.py
-```
-## Arquitetura
-```
-Cliente WebRTC
-    ↓
-Mensagem: {"audio": "...", "model": "qwen3-1.7b"}
-    ↓
-Servidor identifica modelo
-    ↓
-Carrega Qwen3-1.7B ou Qwen3-4B
-    ↓
-Processa e retorna resposta
-```
-## Configuração
-No servidor (`unified_webrtc_server_speech.py`):
-```python
-self.model_paths = {
-    'qwen3-1.7b': '/tmp/Qwen3-1.7B',
-    'qwen3-4b': '/tmp/Qwen3-4B',
-    'default': '/tmp/Qwen3-4B'  # Padrão é o 4B
-}
-```
-## Casos de Uso
-- **Cumprimentos rápidos** → `qwen3-1.7b`
-- **Perguntas complexas** → `qwen3-4b`
-- **Não especificado** → `default`
-## Vantagens
-✅ **Flexibilidade**: Cliente decide por requisição
-✅ **Performance**: Modelo leve para respostas rápidas
-✅ **Qualidade**: Modelo maior quando necessário
-✅ **Simplicidade**: Só adiciona um parâmetro "model"

INSTALLATION_GUIDE.md DELETED Viewed

@@ -1,133 +0,0 @@
-# 🚀 Guia Completo de Instalação - LLaMA-Omni2
-## Pré-requisitos
-- Python 3.10+
-- CUDA 12.1+ (para GPU)
-- 24GB+ VRAM (recomendado RTX A5000 ou superior)
-- Ubuntu 20.04+ ou sistema compatível
-## Instalação Rápida
-### 1. Clone o repositório
-```bash
-git clone https://huggingface.co/marcosremar2/llama-omni2-official-code
-cd llama-omni2-official-code
-```
-### 2. Execute o script de instalação automática
-```bash
-chmod +x install.sh
-./install.sh
-```
-## Instalação Manual
-### 1. Crie ambiente virtual
-```bash
-python -m venv venv
-source venv/bin/activate  # Linux/Mac
-# ou
-venv\Scripts\activate  # Windows
-```
-### 2. Instale dependências
-```bash
-pip install --upgrade pip
-pip install -r requirements.txt
-```
-### 3. Instale o projeto
-```bash
-pip install -e .
-```
-### 4. Baixe os modelos
-#### Whisper (Reconhecimento de Voz)
-```python
-import whisper
-model = whisper.load_model("base", download_root="models/")
-```
-#### Qwen 2.5 (LLM)
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
-model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
-```
-## Teste Rápido
-### 1. Teste básico do sistema
-```bash
-python simple_speech_chat.py
-```
-### 2. Teste com áudio
-```bash
-python generate_test_audios.py
-python test_latency_final.py
-```
-## Estrutura do Projeto
-```
-llama-omni2-official-code/
-├── llama_omni2/          # Módulo principal
-│   ├── model/           # Modelos e arquiteturas
-│   ├── serve/           # Servidor web e APIs
-│   └── inference/       # Scripts de inferência
-├── simple_speech_chat.py # Chat de voz simples
-├── install.sh           # Script de instalação
-├── requirements.txt     # Dependências Python
-└── pyproject.toml      # Configuração do projeto
-```
-## Configuração de Performance
-### Para melhor latência (< 1000ms)
-```python
-# Em simple_speech_chat.py
-whisper_model = "base"  # Mais rápido
-max_new_tokens = 20     # Respostas curtas
-temperature = 0.0       # Greedy decoding
-```
-### Para melhor qualidade
-```python
-whisper_model = "small"  # Mais preciso
-max_new_tokens = 50     # Respostas completas
-temperature = 0.7       # Mais criativo
-```
-## Solução de Problemas
-### CUDA não disponível
-```bash
-# Verifique CUDA
-python -c "import torch; print(torch.cuda.is_available())"
-# Reinstale PyTorch com CUDA
-pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
-```
-### Erro de memória GPU
-- Reduza batch_size
-- Use modelo menor (Qwen 0.5B ao invés de 1.5B)
-- Use quantização (bitsandbytes)
-### Áudio não funciona
-```bash
-# Instale ffmpeg
-sudo apt-get update && sudo apt-get install -y ffmpeg
-# Teste gTTS
-python -c "from gtts import gTTS; tts = gTTS('teste', lang='pt'); tts.save('test.mp3')"
-```
-## Suporte
-- Repositório: https://huggingface.co/marcosremar2/llama-omni2-official-code
-- Issues: Abra uma issue no HuggingFace
-## Licença
-Apache 2.0 - Veja o arquivo LICENSE para detalhes

PACKAGES_INSTALLED.md DELETED Viewed

@@ -1,74 +0,0 @@
-# Pacotes Instalados no Projeto LLaMA-Omni2
-## Ambiente
-- **Conda Environment**: llama-omni2
-- **Python**: 3.10
-- **Total de pacotes**: 246
-## Pacotes Principais
-### 🤖 IA/ML Core
-- **PyTorch**: 2.7.1 (com CUDA 12.6)
-- **Transformers**: 4.55.4
-- **vLLM**: 0.10.1.1 (inferência otimizada)
-- **Accelerate**: 0.33.0
-### 🎤 Áudio/Speech
-- **OpenAI-Whisper**: 20240930
-- **Librosa**: 0.10.2
-- **Soundfile**: 0.13.1
-- **PyAudio**: (via audioread 3.0.1)
-- **TorchAudio**: 2.7.1
-### 📊 Computação Científica
-- **NumPy**: 2.2.6
-- **SciPy**: 1.15.2
-- **Numba**: 0.61.2
-- **Scikit-learn**: 1.7.1
-### 🌐 Servidor/API
-- **FastAPI**: 0.115.11
-- **Uvicorn**: 0.30.0
-- **gRPC**: 1.74.1
-- **WebSockets**: 12.0
-- **AIOHTTP**: 3.12.15
-### 🤗 Hugging Face
-- **Hugging Face Hub**: 0.34.4
-- **Tokenizers**: 0.21.4
-- **Safetensors**: 0.6.2
-- **Datasets**: 2.18.0
-### 🚀 Otimização
-- **Ray**: 2.48.0
-- **Triton**: 3.3.1
-- **XFormers**: 0.0.31
-- **GGUF**: 0.17.1
-- **Ninja**: 1.13.0
-### 📦 Outros Importantes
-- **Gradio**: 5.17.0
-- **Pillow**: 10.4.0
-- **OpenCV**: 4.12.0.88
-- **Pydantic**: 2.11.7
-- **protobuf**: 6.31.1
-## Versões CUDA/NVIDIA
-- nvidia-cublas-cu12: 12.6.4.1
-- nvidia-cuda-cupti-cu12: 12.6.80
-- nvidia-cuda-nvrtc-cu12: 12.6.77
-- nvidia-cuda-runtime-cu12: 12.6.77
-- nvidia-cudnn-cu12: 9.5.1.17
-- nvidia-cufft-cu12: 11.3.0.4
-- nvidia-curand-cu12: 10.3.7.77
-- nvidia-cusolver-cu12: 11.7.1.2
-- nvidia-cusparse-cu12: 12.5.4.2
-- nvidia-nccl-cu12: 2.26.2
-## Como listar todos os pacotes
-```bash
-conda activate llama-omni2
-pip list
-# ou
-pip freeze > requirements_full.txt
-```

PLANO_INTEGRACAO_REALTIMETTS.md DELETED Viewed

@@ -1,194 +0,0 @@
-# 📋 Plano de Integração RealtimeTTS com gTTS
-## 🎯 Objetivo
-Integrar RealtimeTTS com gTTS de forma organizada e coerente com a arquitetura existente do LLaMA-Omni2, mantendo boas práticas de programação.
-## 🏗️ Análise da Estrutura Atual
-### Sistema Existente
-```
-llama_omni2/serve/
-├── tts_adapter.py          # Adaptador principal (padrão Strategy)
-└── engines/                # Diretório de engines TTS
-    ├── __init__.py
-    └── edge_tts_engine.py   # Engine Microsoft Edge
-```
-### Padrão de Design Atual
-- **Strategy Pattern**: `TTSAdapter` atua como contexto
-- **Factory Pattern**: Criação dinâmica de engines
-- **Fallback System**: Suporte a múltiplos engines com fallback
-- **Interface Unificada**: Métodos comuns para todos engines
-## 📁 Estrutura Proposta
-```
-llama_omni2/serve/engines/
-├── __init__.py
-├── base_engine.py           # 🆕 Classe base abstrata
-├── edge_tts_engine.py        # Existente
-├── gtts_engine.py            # 🆕 Google TTS simples
-├── realtime_tts_engine.py    # 🆕 RealtimeTTS com streaming
-└── realtime_tts/             # 🆕 Módulo RealtimeTTS
-    ├── __init__.py
-    ├── config.py             # Configurações específicas
-    ├── stream_handler.py     # Gerenciamento de streams
-    ├── voice_manager.py      # Gerenciamento de vozes
-    └── utils.py              # Utilitários auxiliares
-```
-## 🔧 Componentes a Implementar
-### 1. Base Engine (Abstract)
-```python
-# base_engine.py
-from abc import ABC, abstractmethod
-class BaseTTSEngine(ABC):
-    """Interface base para todos os engines TTS"""
-    @abstractmethod
-    def synthesize(self, text: str) -> bytes:
-        """Sintetizar texto completo"""
-        pass
-    @abstractmethod
-    def synthesize_stream(self, text_generator) -> Generator:
-        """Sintetizar stream de texto"""
-        pass
-    @property
-    @abstractmethod
-    def sample_rate(self) -> int:
-        """Taxa de amostragem do áudio"""
-        pass
-```
-### 2. gTTS Engine (Simples)
-```python
-# gtts_engine.py
-class GTTSEngine(BaseTTSEngine):
-    """Engine simples usando gTTS"""
-    - Síntese básica sem streaming
-    - Fallback confiável
-    - Suporte multilíngue
-```
-### 3. RealtimeTTS Engine
-```python
-# realtime_tts_engine.py
-class RealtimeTTSEngine(BaseTTSEngine):
-    """Engine avançado com streaming real-time"""
-    - Streaming chunk-by-chunk
-    - Baixa latência
-    - Controle fino de prosódia
-    - Cache inteligente
-```
-## 🔄 Fluxo de Integração
-### 1. Inicialização
-```
-ModelWorker → TTSAdapter → RealtimeTTSEngine → gTTS Backend
-                    ↓
-              Fallback: GTTSEngine
-```
-### 2. Processamento
-```
-Texto → Chunking → Queue → RealtimeTTS → Stream de Áudio
-                     ↓
-                Threading para paralelização
-```
-### 3. Streaming
-```
-Text Stream → Buffer Manager → Audio Chunks → Client
-                    ↓
-              Controle de backpressure
-```
-## 📝 Implementação em Fases
-### Fase 1: Estrutura Base ✅
-1. Criar diretório `realtime_tts/`
-2. Implementar `base_engine.py`
-3. Criar `gtts_engine.py` simples
-### Fase 2: RealtimeTTS Core
-1. Implementar `realtime_tts_engine.py`
-2. Configurar streaming handlers
-3. Adicionar voice manager
-### Fase 3: Otimizações
-1. Cache de áudio frequente
-2. Pre-loading de modelos
-3. Thread pool para paralelização
-### Fase 4: Integração Final
-1. Atualizar `tts_adapter.py`
-2. Testes de latência
-3. Configurar fallback chain
-## ⚡ Configurações de Performance
-### RealtimeTTS
-```python
-config = {
-    "chunk_size": 512,        # Tamanho dos chunks
-    "buffer_size": 4096,      # Buffer de áudio
-    "prefetch": True,         # Pre-carregar próximo chunk
-    "cache_enabled": True,    # Cache de frases comuns
-    "thread_pool_size": 4,    # Workers paralelos
-    "language": "pt-BR",      # Idioma padrão
-    "voice": "gtts",          # Backend padrão
-}
-```
-## 🧪 Testes Necessários
-1. **Latência**: < 200ms para primeiro chunk
-2. **Throughput**: > 20x realtime
-3. **Qualidade**: MOS > 4.0
-4. **Estabilidade**: 99.9% uptime
-5. **Fallback**: Transição suave entre engines
-## 📊 Métricas de Sucesso
-- **TTFAB** (Time to First Audio Byte): < 200ms
-- **Latência E2E**: < 1000ms
-- **CPU Usage**: < 30% por stream
-- **Memória**: < 500MB por worker
-- **Concurrent Streams**: > 10
-## 🔐 Boas Práticas
-1. **SOLID Principles**: Cada classe com responsabilidade única
-2. **DRY**: Reutilizar código comum via herança
-3. **Error Handling**: Try-except com fallback automático
-4. **Logging**: Debug detalhado mas não verbose
-5. **Type Hints**: Tipos explícitos para manutenibilidade
-6. **Docstrings**: Documentação clara em cada método
-7. **Tests**: Unitários e de integração
-## 🚀 Próximos Passos
-1. ✅ Análise completa do sistema
-2. ⏳ Criar estrutura de diretórios
-3. ⏳ Implementar base_engine.py
-4. ⏳ Desenvolver gtts_engine.py
-5. ⏳ Criar realtime_tts_engine.py
-6. ⏳ Testar integração completa
-7. ⏳ Otimizar performance
-## 💡 Considerações Especiais
-- **Thread Safety**: Usar locks onde necessário
-- **Memory Management**: Liberar buffers após uso
-- **Resource Cleanup**: Context managers para recursos
-- **Backward Compatibility**: Manter APIs existentes
-- **Configuration**: Via environment ou config files
----
-Este plano segue as melhores práticas e mantém coerência com a arquitetura existente do LLaMA-Omni2.

PLANO_TROCA_LLM_TTS_EXTERNO.md DELETED Viewed

@@ -1,261 +0,0 @@
-# Plano de Troca de LLM + TTS Externo para Português
-## 📋 Proposta de Solução
-### Arquitetura Atual vs Proposta
-#### **Atual (LLaMA-Omni2 Original)**
-```
-Áudio → Whisper → Qwen2.5-1.5B → Speech Generator → CosyVoice → Áudio
-         (STT)      (LLM+Units)    (Speech Units)     (Vocoder)
-```
-#### **Proposta (LLM PT + TTS Externo)**
-```
-Áudio → Whisper → Qwen-PT/Sabiá → Texto → TTS Externo → Áudio
-         (STT)       (LLM PT)              (Edge/gTTS)
-```
-## 🎯 Análise de Viabilidade
-### ✅ Vantagens
-1. **Suporte Nativo a Português**
-   - LLMs treinados especificamente em PT-BR
-   - Melhor compreensão de contexto cultural
-   - Respostas mais naturais em português
-2. **Simplificação da Arquitetura**
-   - Remove dependência do Speech Generator
-   - Elimina necessidade de speech units
-   - Reduz complexidade do sistema
-3. **Flexibilidade de TTS**
-   - Múltiplas opções (Edge TTS, gTTS, Amazon Polly)
-   - Vozes neurais de alta qualidade
-   - Fácil troca entre engines
-4. **Menor Uso de Memória**
-   - Sem Speech Generator (896M params)
-   - Apenas LLM + TTS leve
-   - ~40% redução em VRAM
-### ❌ Desvantagens
-1. **Perda de Integração Nativa**
-   - Speech units carregam prosódia natural
-   - TTS externo pode soar mais robótico
-   - Perda de nuances emocionais
-2. **Aumento de Latência**
-   - Pipeline adicional: LLM → Texto → TTS
-   - Sem streaming de speech units
-   - +200-400ms de latência típica
-3. **Dependência Externa**
-   - APIs de TTS podem ter limites
-   - Custos adicionais (serviços pagos)
-   - Requer conexão internet (alguns)
-## 🔧 Implementação Técnica
-### Modelos LLM Candidatos para Português
-#### 1. **Qwen2.5-1.5B-Instruct (Fine-tuned PT)**
-```python
-# Disponível via Hugging Face
-model = "Qwen/Qwen2.5-1.5B-Instruct"
-# Requer fine-tuning com dados PT-BR
-```
-#### 2. **Sabiá-2 (Maritaca AI)**
-```python
-# Modelo brasileiro nativo
-model = "maritaca-ai/sabia-2"
-# 7B params, otimizado para PT-BR
-```
-#### 3. **mT5/mT0 (Multilingual)**
-```python
-# Google's multilingual T5
-model = "google/mt5-base"
-# Suporta 101 idiomas incluindo PT
-```
-#### 4. **Bode-7B (EleutherAI + PT)**
-```python
-# Fine-tune do Pythia para português
-model = "recogna/bode-7b"
-```
-### Modificações Necessárias no Código
-```python
-# llama_omni2/model/language_model/omni2_speech_qwen2_pt.py
-class Omni2PTQwen2ForCausalLM(Qwen2ForCausalLM):
-    """Versão simplificada sem Speech Generator"""
-    def __init__(self, config):
-        super().__init__(config)
-        # Remove speech_generator
-        self.speech_generator = None
-    def generate_with_tts(self, input_ids, **kwargs):
-        # 1. Gera texto com LLM
-        outputs = self.generate(input_ids, **kwargs)
-        text = self.tokenizer.decode(outputs[0])
-        # 2. Converte para áudio com TTS
-        if self.config.tts_method == "edge":
-            audio = edge_tts.synthesize(text, voice="pt-BR-FranciscaNeural")
-        elif self.config.tts_method == "gtts":
-            audio = gtts.synthesize(text, lang="pt-br")
-        return text, audio
-```
-## 🌐 Trabalhos Similares na Literatura
-### 1. **"SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities"**
-- **Instituição**: Fudan University (2023)
-- **Abordagem**: LLM + discrete speech tokens
-- **Similar**: Usa tokens discretos como nosso speech units
-- **Diferença**: Treinamento end-to-end vs nossa proposta modular
-### 2. **"AudioPaLM: A Large Language Model That Can Speak and Listen"**
-- **Instituição**: Google Research (2023)
-- **Abordagem**: PaLM-2 + AudioLM
-- **Similar**: Separa compreensão (LLM) de síntese (TTS)
-- **Aplicável**: Arquitetura modular como propomos
-### 3. **"VALL-E X: Cross-lingual Text-to-Speech Synthesis"**
-- **Instituição**: Microsoft (2023)
-- **Abordagem**: LLM gera tokens → Neural codec
-- **Relevante**: Mostra sucesso com TTS neural externo
-- **Validação**: Latência de ~400ms é alcançável
-### 4. **"Whisper + ChatGPT + ElevenLabs Pipeline"**
-- **Comunidade**: Open Source (2023-2024)
-- **Stack**: Whisper → GPT-3.5/4 → ElevenLabs TTS
-- **Resultado**: Latência 1-2s, qualidade alta
-- **Limitação**: Custo de APIs
-### 5. **Projetos Brasileiros**
-#### "Córdoba: Assistente Virtual em PT-BR"
-- **Instituição**: UNICAMP + USP
-- **Stack**: Wav2Vec2 → BERT-PT → Tacotron2
-- **Latência**: ~800ms end-to-end
-- **Open Source**: Parcialmente disponível
-#### "MariTalk Voice"
-- **Empresa**: Maritaca AI
-- **Stack**: Proprietário (Sabiá + TTS neural)
-- **Performance**: <500ms latência reportada
-- **Limitação**: Closed source
-## 📊 Comparação de Performance
-| Métrica | LLaMA-Omni2 Original | Proposta (LLM-PT + TTS) |
-|---------|---------------------|------------------------|
-| **Latência E2E** | 500-700ms | 800-1200ms |
-| **Qualidade PT** | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
-| **Naturalidade** | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
-| **VRAM Uso** | ~8GB | ~5GB |
-| **Complexidade** | Alta | Média |
-| **Manutenção** | Complexa | Simples |
-## 🚀 Prova de Conceito (PoC)
-### Implementação Mínima Viável
-```python
-import torch
-import whisper
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import edge_tts
-import asyncio
-class SimpleSpeechChat:
-    def __init__(self):
-        # STT
-        self.whisper = whisper.load_model("base")
-        # LLM Português
-        self.tokenizer = AutoTokenizer.from_pretrained("maritaca-ai/sabia-2")
-        self.model = AutoModelForCausalLM.from_pretrained(
-            "maritaca-ai/sabia-2",
-            torch_dtype=torch.float16,
-            device_map="auto"
-        )
-        # TTS
-        self.tts_voice = "pt-BR-FranciscaNeural"
-    async def process(self, audio_path):
-        # 1. STT
-        result = self.whisper.transcribe(audio_path, language="pt")
-        text = result["text"]
-        # 2. LLM
-        inputs = self.tokenizer(text, return_tensors="pt")
-        outputs = self.model.generate(**inputs, max_length=200)
-        response = self.tokenizer.decode(outputs[0])
-        # 3. TTS
-        tts = edge_tts.Communicate(response, self.tts_voice)
-        await tts.save("output.mp3")
-        return response, "output.mp3"
-```
-## 📈 Métricas de Validação
-### Experimentos Necessários
-1. **Latência End-to-End**
-   - Meta: < 1000ms
-   - Método: 100 queries variadas
-2. **Qualidade de Resposta PT**
-   - Métrica: BLEU/ROUGE scores
-   - Baseline: GPT-3.5 em português
-3. **Naturalidade do Áudio**
-   - Métrica: MOS (Mean Opinion Score)
-   - Avaliadores: 20 falantes nativos
-4. **Consumo de Recursos**
-   - VRAM, CPU, Latência por componente
-## 🎯 Recomendação Final
-### ✅ **VIÁVEL com Ressalvas**
-**Quando usar esta abordagem:**
-- ✅ Foco em qualidade de português
-- ✅ Simplicidade > Performance
-- ✅ TTS de alta qualidade disponível
-- ✅ Latência de ~1s é aceitável
-**Quando manter original:**
-- ❌ Latência < 500ms crítica
-- ❌ Prosódia natural essencial
-- ❌ Sem dependências externas
-- ❌ Integração speech-aware necessária
-### 🔬 Próximos Passos
-1. **PoC com Sabiá-2 + Edge TTS**
-2. **Benchmark latência real**
-3. **Teste A/B com usuários PT-BR**
-4. **Otimizações (cache, streaming)**
-## 📚 Referências
-1. Zhang et al. "SpeechGPT" (2023) - arXiv:2305.11000
-2. Rubenstein et al. "AudioPaLM" (2023) - arXiv:2306.12925
-3. Zhang et al. "VALL-E X" (2023) - arXiv:2303.03926
-4. Maritaca AI. "Sabiá: Foundation Models for Portuguese" (2024)
-5. Community: "awesome-speech-language-models" - GitHub

README.md DELETED Viewed

@@ -1,394 +0,0 @@
-# 🦙🎧 LLaMA-Omni 2: LLM-based Real-time Spoken Chatbot with Autoregressive Streaming Speech Synthesis
-> **Authors: [Qingkai Fang](https://fangqingkai.github.io/), [Yan Zhou](https://zhouyan19.github.io/zhouyan/), [Shoutao Guo](https://scholar.google.com/citations?hl=en&user=XwHtPyAAAAAJ), [Shaolei Zhang](https://zhangshaolei1998.github.io/), [Yang Feng*](https://people.ucas.edu.cn/~yangfeng?language=en)**
-[![arXiv](https://img.shields.io/badge/arXiv-2505.02625-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2505.02625)
-[![code](https://img.shields.io/badge/Github-Code-keygen.svg?logo=github)](https://github.com/ictnlp/LLaMA-Omni2)
-[![models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging_Face-Models-blue.svg)](https://huggingface.co/collections/ICTNLP/llama-omni-67fdfb852c60470175e36e9c)
-[![dataset](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging_Face-Dataset-blue.svg)](https://huggingface.co/datasets/ICTNLP/Multiturn-Speech-Conversations)
-LLaMA-Omni 2 is a series of speech-language models built on the Qwen2.5-0.5B/1.5B/3B/7B/14B/32B-Instruct models. This fork has been modified for Portuguese language education with simplified architecture - it generates text responses from speech input, with Text-to-Speech handled by external services (Edge TTS, gTTS) for maximum flexibility and lower latency.
-<div align="center"><img src="images/llama-omni2.png" width="75%"/></div>
-## 📚 Academic References for Language Learning
-### Research on LLM Configuration for A1-A2 Language Learners
-Based on recent academic research, specific LLM configurations are recommended for beginner language learners:
-1. **"Alignment Drift in CEFR-prompted LLMs for Interactive Spanish Tutoring"** (2025)
-   - Authors: Research team (full citation pending)
-   - Link: [arXiv:2505.08351](https://arxiv.org/html/2505.08351v1)
-   - Key findings for A1 content:
-     - Temperature: 0.3-0.4 (ensures simple, predictable vocabulary)
-     - Top-p: 0.4-0.5 (limits word choices to most common)
-     - Identified "alignment drift" phenomenon requiring periodic re-prompting
-2. **"From Tarzan to Tolkien: Controlling the Language Proficiency Level of LLMs for Content Generation"** (2024)
-   - Authors: Research team developing CaLM model
-   - Link: [arXiv:2406.03030](https://arxiv.org/html/2406.03030v1)
-   - Recommendations for A1 level:
-     - Explicit prompts: "beginner level", "simple vocabulary", "basic phrases"
-     - Max tokens: 20-30 for concise responses
-     - Repetition penalty: 1.1-1.2 (maintains diversity without complexity)
-     - Apply Flesch readability metrics to validate simplicity
-## 📊 Readability Metrics for A1-Level Language Assessment
-### Overview
-To ensure LLM responses are appropriate for A1-level language learners, we implement multiple readability metrics adapted for Portuguese. These metrics evaluate text complexity from different angles, providing a comprehensive assessment of whether content is suitable for beginners.
-### 1. Flesch Reading Ease (Portuguese Adaptation - Flesch-Fernández)
-**Formula**: `206.835 - 1.015 × (palavras/frases) - 84.6 × (sílabas/palavras)`
-**What it measures**: Overall text difficulty based on sentence length and syllable count.
-**Interpretation for Portuguese**:
-- **90-100**: Muito fácil (A1) - Suitable for early elementary students
-- **80-89**: Fácil (A2) - Elementary level
-- **70-79**: Razoavelmente fácil (B1) - Middle school level
-- **60-69**: Padrão (B2) - High school level
-- **50-59**: Razoavelmente difícil (C1) - College level
-- **30-49**: Difícil (C2) - Graduate level
-- **0-29**: Muito difícil - Academic/professional
-**Target for A1**: ≥ 80
-**Example**:
-- A1 appropriate: "Eu gosto de café." (Score: ~95)
-- Too complex: "Aprecio substancialmente bebidas cafeinadas." (Score: ~25)
-### 2. Flesch-Kincaid Grade Level (Portuguese Adaptation)
-**Formula**: `(0.39 × palavras/frases) + (11.8 × sílabas/palavras) - 15.59`
-**What it measures**: The school grade level (in years of education) needed to understand the text.
-**Interpretation**:
-- **≤ 6 years**: A1 level (early elementary)
-- **7-8 years**: A2 level
-- **9-10 years**: B1 level
-- **11-12 years**: B2 level
-- **13-14 years**: C1 level
-- **≥ 15 years**: C2+ level
-**Target for A1**: ≤ 6 years
-**Example**:
-- A1 appropriate: "O gato dorme." (Grade level: ~3)
-- Too complex: "O felino encontra-se em estado de repouso." (Grade level: ~14)
-### 3. Brunet Index
-**Formula**: `N^(V^-0.165)` where:
-- N = total number of words
-- V = vocabulary size (unique words)
-**What it measures**: Lexical complexity. Lower values indicate simpler vocabulary with more repetition.
-**Interpretation**:
-- **< 10**: Very simple vocabulary (A1)
-- **10-15**: Simple vocabulary (A2)
-- **15-20**: Moderate vocabulary (B1-B2)
-- **> 20**: Complex vocabulary (C1-C2)
-**Target for A1**: < 15
-**Example**:
-- A1 appropriate: "Eu como pão. Pão é bom." (Brunet: ~8)
-- Too complex: "Consumo diversos alimentos panificados." (Brunet: ~18)
-### 4. Honoré Statistics
-**Formula**: `100 × log(N) / (1 - (V1/V))` where:
-- N = total words
-- V = unique words
-- V1 = words appearing only once (hapax legomena)
-**What it measures**: Lexical diversity. Higher values indicate more varied vocabulary.
-**Interpretation**:
-- **< 400**: Very repetitive (good for A1)
-- **400-1000**: Moderate variety
-- **> 1000**: High variety (too complex for beginners)
-**Target for A1**: < 400
-### 5. Common Words Ratio
-**Formula**: `(palavras_comuns / total_palavras) × 100`
-**What it measures**: Percentage of text using the 500 most common Portuguese words.
-**Interpretation**:
-- **≥ 70%**: A1 appropriate
-- **60-69%**: A2 level
-- **50-59%**: B1 level
-- **< 50%**: B2+ level
-**Target for A1**: ≥ 70%
-**Why this matters**: A1 learners typically know only 500-1000 words. Using common vocabulary ensures comprehension.
-### Combining Metrics for A1 Assessment
-A response is considered **A1-appropriate** when ALL of these conditions are met:
-1. Flesch Portuguese ≥ 80
-2. Flesch-Kincaid ≤ 6 years
-3. Brunet Index < 15
-4. Common Words Ratio ≥ 70%
-5. Word count between 5 and 15 words (for conversational responses)
-### Implementation Example
-```python
-def evaluate_a1_appropriateness(text):
-    """
-    Evaluates if text is appropriate for A1-level learners
-    """
-    metrics = {
-        'flesch_pt': calculate_flesch_portuguese(text),
-        'flesch_kincaid': calculate_flesch_kincaid_portuguese(text),
-        'brunet': calculate_brunet_index(text),
-        'common_words': calculate_common_words_ratio(text)
-    }
-    is_a1 = (
-        metrics['flesch_pt'][0] >= 80 and
-        metrics['flesch_kincaid'][0] <= 6 and
-        metrics['brunet'] < 15 and
-        metrics['common_words'] >= 70
-    )
-    return is_a1, metrics
-```
-### Why Multiple Metrics?
-Each metric captures different aspects of text complexity:
-- **Flesch scores** focus on syntactic complexity (sentence and word length)
-- **Brunet Index** measures vocabulary sophistication
-- **Common Words Ratio** ensures vocabulary familiarity
-- **Honoré Statistics** (optional) tracks lexical diversity
-Using multiple metrics prevents "gaming" the system - a text might score well on one metric but still be inappropriate for beginners. The combination ensures genuine A1-level appropriateness.
-### Portuguese-Specific Considerations
-Portuguese requires adapted formulas because:
-1. **More syllables**: Portuguese words average 2.5 syllables vs 1.5 in English
-2. **Different structure**: More inflections and longer verb forms
-3. **Syllable counting**: Diphthongs and nasalization affect syllable boundaries
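-The coefficients in our formulas (84.6 for Flesch, 11.8 for FK) are the same as in the original English formulas; published Portuguese adaptations instead shift the constant term (e.g., 248.835 rather than 206.835 for Flesch Reading Ease, per Martins et al., 1996) to offset Portuguese's longer words. Interpret scores using the Portuguese-specific bands listed above.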
-## 🏗️ System Architecture
-### Complete Pipeline Overview
-```
-┌─────────────┐
-│   Cliente   │ (Browser/App com WebRTC)
-│   WebRTC    │
-└──────┬──────┘
-       │ Áudio de entrada (voz do usuário)
-       ↓
-┌─────────────────────────────────────┐
-│        WebRTC Server                 │
-│  (Recebe áudio, envia para Worker)   │
-└──────┬──────────────────────┬────────┘
-       │                      ↑
-       │ Áudio WAV            │ Texto + Speech Units
-       ↓                      │
-┌──────────────────────────────────────┐
-│         Worker gRPC                   │
-│  ┌──────────────────────────────┐    │
-│  │  1. WHISPER (Speech-to-Text) │    │
-│  │     Áudio → Mel-spectrogram  │    │
-│  └──────────────┬───────────────┘    │
-│                 │                     │
-│  ┌──────────────↓───────────────┐    │
-│  │  2. LLM (LLaMA-Omni2)        │    │
-│  │     - Modelo Qwen2.5 1.5B    │    │
-│  │     - Entrada: Spectrogram   │    │
-│  │     - Saída: Texto + Units   │    │
-│  └──────────────┬───────────────┘    │
-│                 │                     │
-│          Texto + Speech Units        │
-└─────────────────┬─────────────────────┘
-                  │
-                  ↓
-┌─────────────────────────────────────┐
-│        WebRTC Server                 │
-│  ┌──────────────────────────────┐   │
-│  │  3. TTS EXTERNO              │   │
-│  │     Edge TTS / gTTS        │   │
-│  └──────────────────────────────┘   │
-└──────────────┬───────────────────────┘
-               │ Áudio da resposta
-               ↓
-        ┌─────────────┐
-        │   Cliente   │
-        └─────────────┘
-```
-### Key Components
-1. **Speech Encoder (Whisper)**: Converts user's audio input into mel-spectrogram features
-2. **LLM (LLaMA-Omni2)**: A multimodal model that:
-   - Understands speech via integrated Whisper encoder
-   - Generates text responses like a traditional LLM
-   - Simultaneously generates speech units (prosody tokens)
-3. **TTS (External Services)**: Text-to-Speech via Edge TTS, gTTS or other services
-### Why This Architecture?
-The key innovation is that LLaMA-Omni2 generates **both text and speech units simultaneously**. This allows:
-- **Real-time streaming**: Text appears as it's generated
-- **Natural prosody**: Speech units encode how to speak the text
-- **Low latency**: First audio chunk in <500ms
-The model doesn't just transcribe and then synthesize - it truly understands speech and generates speech-aware responses!
-## 🔥 News
-- [25/05] LLaMA-Omni 2 is accepted at ACL 2025 main conference!
-## Install
-1. Clone this repository.
-```shell
-git clone https://github.com/ictnlp/LLaMA-Omni2
-cd LLaMA-Omni2
-```
-2. Install packages.
-```shell
-conda create -n llama-omni2 python=3.10
-conda activate llama-omni2
-pip install -e .
-```
-## Quick Start
-1. Download the `Whisper-large-v3` model.
-```python
-import whisper
-model = whisper.load_model("large-v3", download_root="models/speech_encoder/")
-```
-2. Install TTS services (Edge TTS recommended for Portuguese).
-```shell
-pip install edge-tts gtts
-```
-> [!Tip]
-> If you’re experiencing unstable connections to Hugging Face from within China, you can try setting the following in your command line:
->
-> ```shell
-> export HF_ENDPOINT=https://hf-mirror.com
-> ```
-3. Download the LLaMA-Omni2 series models from Hugging Face. `LLaMA-Omni2-0.5B/1.5B/3B/7B/14B` support **English only**, while `LLaMA-Omni2-0.5B/1.5B/3B/7B/14B/32B-Bilingual` support **both English and Chinese**.
-```shell
-model_name=LLaMA-Omni2-7B-Bilingual
-huggingface-cli download --resume-download ICTNLP/$model_name --local-dir models/$model_name
-```
-| LLaMA-Omni2                                                           | LLaMA-Omni2-Bilingual                                                                     |
-| --------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
-| 🤗 [LLaMA-Omni2-0.5B](https://huggingface.co/ICTNLP/LLaMA-Omni2-0.5B) | 🤗 [LLaMA-Omni2-0.5B-Bilingual](https://huggingface.co/ICTNLP/LLaMA-Omni2-0.5B-Bilingual) |
-| 🤗 [LLaMA-Omni2-1.5B](https://huggingface.co/ICTNLP/LLaMA-Omni2-1.5B) | 🤗 [LLaMA-Omni2-1.5B-Bilingual](https://huggingface.co/ICTNLP/LLaMA-Omni2-1.5B-Bilingual) |
-| 🤗 [LLaMA-Omni2-3B](https://huggingface.co/ICTNLP/LLaMA-Omni2-3B)     | 🤗 [LLaMA-Omni2-3B-Bilingual](https://huggingface.co/ICTNLP/LLaMA-Omni2-3B-Bilingual)     |
-| 🤗 [LLaMA-Omni2-7B](https://huggingface.co/ICTNLP/LLaMA-Omni2-7B)     | 🤗 [LLaMA-Omni2-7B-Bilingual](https://huggingface.co/ICTNLP/LLaMA-Omni2-7B-Bilingual)     |
-| 🤗 [LLaMA-Omni2-14B](https://huggingface.co/ICTNLP/LLaMA-Omni2-14B)   | 🤗 [LLaMA-Omni2-14B-Bilingual](https://huggingface.co/ICTNLP/LLaMA-Omni2-14B-Bilingual)   |
-| -                                                                     | 🤗 [LLaMA-Omni2-32B-Bilingual](https://huggingface.co/ICTNLP/LLaMA-Omni2-32B-Bilingual)   |
-## Gradio Demo
-1. Launch a controller.
-   ```shell
-   python -m llama_omni2.serve.controller --host 0.0.0.0 --port 10000
-   ```
-2. Launch a gradio web server.
-   ```shell
-   python -m llama_omni2.serve.gradio_web_server --controller http://localhost:10000 --port 8000
-   ```
-3. Launch a model worker.
-   ```shell
-   python -m llama_omni2.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path models/$model_name --model-name $model_name
-   ```
-4. Visit [http://localhost:8000/](http://localhost:8000/) and interact with LLaMA-Omni2!
-## Local Inference
-```shell
-output_dir=examples/$model_name
-mkdir -p $output_dir
-python llama_omni2/inference/run_llama_omni2.py \
-    --model_path models/$model_name \
-    --question_file examples/questions.json \
-    --answer_file $output_dir/answers.jsonl \
-    --temperature 0 \
-    --s2s
-# TTS is now handled externally with Edge TTS or gTTS
-edge-tts --text "Olá, como você está?" --voice "pt-BR-FranciscaNeural" --write-media output.mp3
-```
-## LICENSE
-Our code is released under the Apache-2.0 License. Our model is intended for academic research purposes only and may **NOT** be used for commercial purposes.
-You are free to use, modify, and distribute this model in academic settings, provided that the following conditions are met:
-- **Non-commercial use**: The model may not be used for any commercial purposes.
-- **Citation**: If you use this model in your research, please cite the original work.
-### Commercial Use Restriction
-For any commercial use inquiries or to obtain a commercial license, please contact `fengyang@ict.ac.cn`.
-## Acknowledgements
-- [Edge TTS](https://github.com/rany2/edge-tts): Microsoft Edge's text-to-speech API for high-quality Portuguese synthesis.
-- [SLAM-LLM](https://github.com/X-LANCE/SLAM-LLM): We borrow some code about speech encoder and speech adaptor.
-## Citation
-If you have any questions, please feel free to submit an issue or contact `fangqingkai21b@ict.ac.cn`.
-If our work is useful for you, please cite as:
-```
-@inproceedings{
-  fang2025llamaomni2,
-  title={{LL}a{MA}-{O}mni 2: LLM-based Real-time Spoken Chatbot with Autoregressive Streaming Speech Synthesis},
-  author={Fang, Qingkai and Zhou, Yan and Guo, Shoutao and Zhang, Shaolei and Feng, Yang},
-  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics},
-  year={2025}
-}
-@inproceedings{
-  fang2025llamaomni,
-  title={{LL}a{MA}-{O}mni: Seamless Speech Interaction with Large Language Models},
-  author={Qingkai Fang and Shoutao Guo and Yan Zhou and Zhengrui Ma and Shaolei Zhang and Yang Feng},
-  booktitle={The Thirteenth International Conference on Learning Representations},
-  year={2025},
-  url={https://openreview.net/forum?id=PYmrUQmMEw}
-}
-```

README_INSTALLATION.md DELETED Viewed

@@ -1,184 +0,0 @@
-# 🚀 LLaMA-Omni2 - Guia de Instalação Completo
-Sistema de conversação por voz em tempo real com **latência de ~1000ms**.
-## ⚡ Instalação Rápida (2 comandos)
-```bash
-# 1. Executar instalação automática
-./install.sh
-# 2. Iniciar servidor
-./start_server.sh
-```
-## 📋 Pré-requisitos
-- **Sistema:** Linux/Ubuntu
-- **Python:** 3.8+
-- **GPU:** CUDA opcional (RTX A5000 recomendada)
-- **Espaço:** ~10GB
-- **Internet:** Para download de modelos
-## 🛠️ Instalação Manual
-### 1. Configurar Ambiente
-```bash
-# Timeout Claude Code
-mkdir -p ~/.claude
-echo '{"env":{"BASH_DEFAULT_TIMEOUT_MS":"2400000","BASH_MAX_TIMEOUT_MS":"2400000"}}' > ~/.claude/settings.json
-# Dependências sistema
-sudo apt-get update
-sudo apt-get install -y ffmpeg git
-```
-### 2. Instalar Python Dependencies
-```bash
-pip install torch torchaudio whisper transformers gtts soundfile librosa huggingface-hub
-```
-### 3. Baixar Modelos
-```bash
-# Qwen (sempre funciona)
-python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; AutoTokenizer.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct'); AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct')"
-# Whisper
-python -c "import whisper; whisper.load_model('base')"
-# LLaMA-Omni2 (opcional)
-huggingface-cli download ICTNLP/LLaMA-Omni2-1.5B --local-dir models/LLaMA-Omni2-1.5B
-```
-## 🚀 Como Usar
-### Modo 1: Chat Simples (Recomendado)
-```bash
-./start_server.sh
-# Escolher opção 1
-```
-### Modo 2: Chat Rápido
-```bash
-python quick_chat.py test_audios/04_Bom_dia.wav
-```
-### Modo 3: Servidor Web Oficial
-```bash
-./start_server.sh
-# Escolher opção 2
-# Acessar: http://localhost:8000
-```
-## 📊 Performance
-| Componente | Latência | Descrição |
-|-----------|----------|-----------|
-| **STT** | ~200ms | Whisper base |
-| **LLM** | ~400ms | Qwen 2.5-1.5B |
-| **TTS** | ~300ms | gTTS |
-| **Total** | **~1000ms** | Pipeline completo |
-## 🎯 Recursos
-### ✅ Funcionando
-- **Speech-to-Speech completo**
-- **Português nativo**
-- **Latência < 1 segundo**
-- **GPU acceleration**
-- **Áudio de entrada e saída**
-### 🔧 Componentes
-- **STT:** Whisper base/large-v3
-- **LLM:** Qwen 2.5-1.5B-Instruct
-- **TTS:** gTTS + CosyVoice (opcional)
-- **GPU:** CUDA support
-## 📁 Estrutura do Projeto
-```
-llama-omni2-official/
-├── install.sh                    # Instalação automática
-├── start_server.sh              # Inicialização
-├── simple_speech_chat.py        # Sistema principal
-├── analyze_generated_audio.py   # Análise de áudios
-├── quick_chat.py                # Chat rápido
-├── models/                      # Modelos baixados
-│   ├── LLaMA-Omni2-1.5B/       # Modelo oficial
-│   └── cosy2_decoder/           # TTS neural
-└── test_audios/                 # Áudios de teste
-```
-## 🐛 Solução de Problemas
-### Erro: "transformers incompatível"
-```bash
-pip install "transformers>=4.36.0,<4.45.0"
-```
-### Erro: "CUDA not available"
-```bash
-# Sistema usa CPU automaticamente
-# Performance reduzida mas funcional
-```
-### Erro: "ffmpeg not found"
-```bash
-sudo apt-get install -y ffmpeg
-```
-### Erro: "Modelo não encontrado"
-```bash
-# Sistema usa fallback automaticamente
-# Qwen sempre disponível
-```
-## 🔍 Verificação da Instalação
-```bash
-# Teste rápido
-python test_installation.py
-# Teste completo
-python simple_speech_chat.py
-# Análise de áudios
-python analyze_generated_audio.py
-```
-## 📈 Otimizações
-### Para menor latência:
-- **GPU:** Usar CUDA
-- **Whisper:** Modelo base (mais rápido)
-- **LLM:** Qwen 1.5B (otimizado)
-### Para melhor qualidade:
-- **Whisper:** large-v3 (mais preciso)
-- **TTS:** CosyVoice (mais natural)
-- **LLM:** Modelo oficial (específico)
-## 🎉 Exemplos de Uso
-### Entrada → Saída
-- **"Olá"** → *"Como posso ajudá-lo hoje?"*
-- **"Bom dia"** → *"Obrigado pela confiança em nosso trabalho"*
-- **"Como vai?"** → *"Eu sei que você está ocupado..."*
-### Latências Reais
-- **Melhor caso:** 532ms
-- **Caso médio:** 965ms
-- **Pior caso:** 1819ms
-## 🆘 Suporte
-1. **Verificar logs:** Sistema mostra erros detalhados
-2. **Reinstalar:** `./install.sh` (safe)
-3. **Modo fallback:** Sempre disponível
-4. **GPU opcional:** CPU funciona
----
-**🏆 Sistema Completo Instalado e Funcionando!**
-**Latência:** ~1000ms | **Idioma:** Português | **Hardware:** RTX A5000

RELATORIO_VIABILIDADE_TROCA_QWEN_MULTILINGUE.md DELETED Viewed

@@ -1,302 +0,0 @@
-# 📊 Relatório Detalhado: Viabilidade de Troca para Qwen2.5-1.5B Multilíngue
-## 🎯 Resumo Executivo
-**Viabilidade: ✅ ALTA (90% de sucesso)**
-A troca do modelo core para Qwen2.5-1.5B multilíngue é **altamente viável** e representa a **melhor opção disponível**. O modelo já suporta português nativamente e mantém compatibilidade arquitetural quase perfeita com o sistema atual.
----
-## 1. 📋 Análise da Estrutura Atual
-### 1.1 Arquitetura do Sistema LLaMA-Omni2
-```
-┌─────────────────────────────────────────────┐
-│           COMPONENTES ATUAIS                 │
-├─────────────────────────────────────────────┤
-│ 1. Speech Encoder (Whisper)                  │
-│    └─> Mel-spectrogram (1280 dims)          │
-│                                              │
-│ 2. Speech Projector                          │
-│    └─> 1280 → 1536 dims (hidden_size)       │
-│                                              │
-│ 3. LLM Core (Qwen2.5-1.5B)                  │
-│    ├─> Hidden size: 1536                    │
-│    ├─> Layers: 28                           │
-│    ├─> Attention heads: 12                  │
-│    └─> KV heads: 2 (GQA)                    │
-│                                              │
-│ 4. Speech Generator                          │
-│    ├─> Modelo Qwen2 menor (896 dims)        │
-│    └─> Gera speech units                    │
-│                                              │
-│ 5. Vocoder (CosyVoice2)                     │
-│    └─> Speech units → Áudio                 │
-└─────────────────────────────────────────────┘
-```
-### 1.2 Pontos Críticos de Integração
-#### **Classes Principais**
-```python
-# /llama_omni2/model/language_model/omni2_speech2s_qwen2.py
-class Omni2Speech2SQwen2ForCausalLM(Omni2SpeechQwen2ForCausalLM):
-    config_class = Omni2Speech2SConfig  # Herda de Qwen2Config
-    def __init__(self, config):
-        super().__init__(config)
-        if getattr(config, "speech_generator", None):
-            self.speech_generator = build_speech_generator(config)
-```
-#### **Passagem de Hidden States**
-```python
-# /llama_omni2/model/speech_generator/generation.py (linha 453)
-hidden_states = outputs.hidden_states[-1]  # Última camada
-# Dimensão esperada: [batch, seq_len, 1536]
-# Speech Generator processa:
-new_hidden_states = self.input_proj(hidden_states)  # 1536 → 896
-```
----
-## 2. 🔄 Análise de Compatibilidade Qwen2 → Qwen2.5
-### 2.1 Descobertas da Pesquisa
-**✅ EXCELENTE NOTÍCIA**: Qwen2.5 mantém **compatibilidade total** com Qwen2:
-1. **Arquitetura Idêntica**:
-   - Mesma estrutura Transformer
-   - Grouped Query Attention (GQA) mantida
-   - RoPE, SwiGLU, RMSNorm preservados
-   - Hidden dimensions compatíveis
-2. **Suporte a Português Confirmado**:
-   - 29+ idiomas suportados nativamente
-   - Português explicitamente testado
-   - 18 trilhões de tokens de treino (inclui PT)
-3. **Compatibilidade de Código**:
-   - Templates de chat compatíveis
-   - Transformers HuggingFace suportado
-   - Mesmo formato de configuração
-### 2.2 Diferenças Mínimas
-| Aspecto | Qwen2 Original | Qwen2.5 Multilíngue |
-|---------|---------------|---------------------|
-| **Hidden Size** | 1536 | 1536 ✅ |
-| **Layers** | 28 | 28 ✅ |
-| **Attention Heads** | 12 | 12 ✅ |
-| **KV Heads** | 2 | 2 ✅ |
-| **Vocab Size** | ~150K | ~150K ✅ |
-| **Max Context** | 32K | 128K 🔼 |
-| **Português** | Limitado | Nativo ✅ |
----
-## 3. 🛠️ Implementação da Troca
-### 3.1 Estratégia Recomendada: "Drop-in Replacement"
-**Por que funciona**: Qwen2.5 é basicamente Qwen2 melhorado, não uma arquitetura nova.
-### 3.2 Passos de Implementação
-#### **Passo 1: Download do Modelo Multilíngue**
-```bash
-# Baixar Qwen2.5-1.5B multilíngue
-huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct \
-  --local-dir models/Qwen2.5-1.5B-Multilingual
-```
-#### **Passo 2: Atualizar Configuração**
-```json
-// models/Qwen2.5-1.5B-Multilingual/config.json
-{
-  "architectures": ["Omni2Speech2SQwen2ForCausalLM"],  // Manter!
-  "model_type": "omni2_speech2s_qwen2",                // Manter!
-  "_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",      // Atualizar
-  "hidden_size": 1536,                                 // Verificar
-  "speech_generator": { /* copiar do original */ }     // CRÍTICO!
-}
-```
-#### **Passo 3: Modificação Mínima de Código**
-**Arquivo**: `/llama_omni2/model/language_model/omni2_speech2s_qwen2.py`
-```python
-# Linha 39 - Adicionar verificação de versão
-class Omni2Speech2SQwen2ForCausalLM(Omni2SpeechQwen2ForCausalLM):
-    def __init__(self, config):
-        super().__init__(config)
-        # Verificar compatibilidade
-        if config.hidden_size != 1536:
-            logger.warning(f"Hidden size {config.hidden_size} != 1536, "
-                         "pode haver incompatibilidades")
-        # Resto do código permanece igual
-```
-#### **Passo 4: Adaptar Tokenizer**
-```python
-# /llama_omni2/inference/run_llama_omni2.py
-# Linha 80 - Adicionar suporte multilíngue
-tokenizer = AutoTokenizer.from_pretrained(
-    model_path,
-    use_fast=True,
-    trust_remote_code=True,
-    # Adicionar flag multilíngue
-    model_max_length=128000  # Qwen2.5 suporta mais tokens
-)
-```
-### 3.3 Modificações Opcionais (Melhorias)
-#### **Otimização para Português**
-```python
-# Adicionar prompt engineering para PT
-PORTUGUESE_SYSTEM_PROMPT = """
-Você é um assistente útil que responde em português brasileiro.
-Mantenha suas respostas naturais e contextualmente apropriadas.
-"""
-def prepare_portuguese_input(text):
-    return f"{PORTUGUESE_SYSTEM_PROMPT}\nUsuário: {text}\nAssistente:"
-```
----
-## 4. ⚠️ Análise de Riscos
-### 4.1 Riscos e Mitigações
-| Risco | Probabilidade | Impacto | Mitigação |
-|-------|--------------|---------|-----------|
-| **Incompatibilidade de pesos** | Baixa (10%) | Alto | Usar transfer learning |
-| **Speech Generator quebra** | Média (30%) | Alto | Manter TTS fallback |
-| **Tokenizer incompatível** | Baixa (5%) | Médio | Mapear tokens especiais |
-| **Performance degradada** | Baixa (15%) | Baixo | Fine-tuning específico |
-| **Maior uso de memória** | Baixa (20%) | Baixo | Quantização int8 |
-### 4.2 Cenário Mais Provável
-✅ **Sucesso com ajustes mínimos** (70% probabilidade):
-- Modelo carrega normalmente
-- Português funciona imediatamente
-- Speech generation mantém qualidade
-- Latência similar ou melhor
-⚠️ **Sucesso parcial** (20% probabilidade):
-- Modelo funciona mas speech units degradados
-- Necessário usar TTS externo temporariamente
-- Fine-tuning resolve em 1-2 semanas
-❌ **Falha completa** (10% probabilidade):
-- Incompatibilidade fundamental
-- Rollback para modelo original
----
-## 5. 📊 Comparação de Alternativas
-| Modelo | Compatibilidade | PT Nativo | Complexidade | Recomendação |
-|--------|----------------|-----------|--------------|--------------|
-| **Qwen2.5-1.5B** | 95% | ✅ Sim | Baixa | ⭐⭐⭐⭐⭐ |
-| **Qwen2.5-0.5B** | 90% | ✅ Sim | Baixa | ⭐⭐⭐⭐ |
-| **Llama-3.2-1B** | 40% | ✅ Sim | Alta | ⭐⭐ |
-| **Gemma-2-2B** | 35% | ⚠️ Parcial | Alta | ⭐⭐ |
-| **mT5-base** | 20% | ✅ Sim | Muito Alta | ⭐ |
----
-## 6. 🚀 Plano de Ação Recomendado
-### Fase 1: Teste Rápido (2-3 dias)
-```bash
-# 1. Clonar ambiente
-cp -r llama-omni2-troca-llm llama-omni2-qwen25-test
-# 2. Baixar Qwen2.5-1.5B
-cd llama-omni2-qwen25-test
-huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct --local-dir models/Qwen2.5-1.5B
-# 3. Atualizar config.json
-# Copiar speech_generator config do original
-# 4. Testar carregamento
-python -c "from llama_omni2.model import Omni2Speech2SQwen2ForCausalLM;
-          model = Omni2Speech2SQwen2ForCausalLM.from_pretrained('models/Qwen2.5-1.5B')"
-# 5. Teste básico
-python test_portuguese_full.py
-```
-### Fase 2: Integração (3-5 dias)
-- Ajustar tokenizer para PT
-- Verificar speech generation
-- Otimizar prompts para português
-- Testes de latência
-### Fase 3: Otimização (1 semana)
-- Fine-tuning se necessário
-- Quantização para performance
-- Ajustes de hiperparâmetros
-- Validação com usuários
----
-## 7. 💡 Conclusão e Recomendação Final
-### ✅ **RECOMENDAÇÃO: PROSSEGUIR COM A TROCA**
-**Justificativa**:
-1. **Compatibilidade Quase Perfeita**: Qwen2.5 é uma evolução do Qwen2, não uma mudança radical
-2. **Português Nativo**: Suporte comprovado e testado para PT
-3. **Risco Mínimo**: Arquitetura idêntica minimiza problemas
-4. **Benefícios Claros**: Melhor qualidade em português, contexto maior (128K), performance aprimorada
-5. **Rollback Fácil**: Se falhar, voltar ao original é trivial
-### 📈 Métricas de Sucesso Esperadas
-| Métrica | Atual | Esperado com Qwen2.5 |
-|---------|-------|---------------------|
-| **Qualidade PT** | 60% | 95% |
-| **Latência** | 500ms | 450-500ms |
-| **Contexto** | 32K tokens | 128K tokens |
-| **Accuracy PT** | 70% | 90%+ |
-| **Speech Quality** | 85% | 85% (mantido) |
-### 🎯 Próximo Passo Imediato
-```bash
-# Comando para começar AGORA:
-cd /workspace/llama-omni2-troca-llm
-huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct --local-dir models/Qwen2.5-1.5B-Test
-# Tempo estimado: 10 minutos para download
-# Risco: Zero (não afeta sistema atual)
-```
----
-## 📚 Referências Técnicas
-1. **Qwen2.5 Blog**: "A Party of Foundation Models" (Sept 2024)
-2. **HuggingFace**: Qwen/Qwen2.5-1.5B-Instruct documentation
-3. **Arquitetura**: Qwen2 Technical Report (arXiv:2407.10671)
-4. **Compatibilidade**: Transformers library v4.44+
----
-**Preparado em**: 22/08/2024
-**Status**: ✅ Pronto para implementação
-**Confiança**: 90% de sucesso estimado

SPEECH_PROJECTOR_ANALYSIS.md DELETED Viewed

@@ -1,110 +0,0 @@
-# Análise do Speech Projector - LLaMA-Omni2
-## 📊 Resultado da Investigação
-### Resposta: **NÃO é necessário retreinar o Speech Projector**
-O Speech Projector pode ser **reutilizado diretamente** do modelo inglês para o multilíngue, sem necessidade de treinamento adicional.
-## 🔍 Análise Técnica Detalhada
-### 1. Arquitetura do Speech Projector
-O Speech Projector é uma rede neural simples que adapta dimensões:
-```python
-class EncoderProjectorConcat(nn.Module):
-    def __init__(self, config):
-        # Configurações fixas:
-        self.k = 5  # speech_encoder_ds_rate
-        self.encoder_dim = 1280  # Whisper output
-        self.llm_dim = 1536  # Qwen2.5 input
-        # Camadas:
-        self.linear1 = nn.Linear(6400, 2048)  # 1280*5 → 2048
-        self.relu = nn.ReLU()
-        self.linear2 = nn.Linear(2048, 1536)  # 2048 → 1536
-```
-### 2. Dimensões Idênticas Entre Modelos
-| Parâmetro | Qwen2.5 Inglês | Qwen2.5 Multilíngue |
-|-----------|----------------|---------------------|
-| hidden_size | 1536 | 1536 ✅ |
-| speech_encoder_ds_rate | 5 | 5 ✅ |
-| speech_encoder_hidden_size | 1280 | 1280 ✅ |
-| intermediate_size | 8960 | 8960 ✅ |
-### 3. Pesos Extraídos do Modelo Inglês
-Conseguimos extrair com sucesso os 4 tensores do Speech Projector:
-- `model.speech_projector.linear1.weight`: [2048, 6400]
-- `model.speech_projector.linear1.bias`: [2048]
-- `model.speech_projector.linear2.weight`: [1536, 2048]
-- `model.speech_projector.linear2.bias`: [1536]
-Total: ~26.5M parâmetros salvos em `models/speech_projector_weights.pt`
-## 🎯 Por Que Funciona Sem Retreino?
-### 1. **Função Agnóstica à Língua**
-O Speech Projector apenas transforma representações acústicas (Whisper) em embeddings compatíveis com o LLM. Ele não processa conteúdo linguístico diretamente.
-### 2. **Whisper é Multilíngue**
-O Whisper-large-v3 já produz features multilíngues de alta qualidade. O Speech Projector apenas adapta essas dimensões.
-### 3. **Mesma Arquitetura Base**
-Tanto o modelo inglês quanto o multilíngue usam Qwen2.5 com dimensões idênticas (1536).
-### 4. **Alinhamento Preservado**
-O alinhamento áudio-texto aprendido no inglês se transfere naturalmente, pois:
-- Features acústicas são universais
-- Estrutura temporal é preservada
-- Mapeamento dimensional é matemático
-## 💡 Implementação Recomendada
-```python
-# Carregar pesos do Speech Projector inglês
-projector_weights = torch.load('models/speech_projector_weights.pt')
-# Aplicar ao modelo multilíngue
-model = Omni2SpeechQwen2ForCausalLM.from_pretrained(
-    "models/Qwen2.5-1.5B-Multilingual"
-)
-# Carregar pesos do projector
-for name, param in model.named_parameters():
-    if 'speech_projector' in name:
-        param.data = projector_weights[name]
-print("✅ Speech Projector carregado com sucesso!")
-```
-## 📈 Benefícios
-1. **Economia de Tempo**: Evita ~100h de treinamento
-2. **Economia de Recursos**: Não precisa de GPUs caras
-3. **Qualidade Garantida**: Usa pesos já otimizados
-4. **Plug-and-Play**: Funciona imediatamente
-## 🔬 Validação Experimental
-Para confirmar 100%, você pode:
-1. Carregar os pesos do projector inglês
-2. Testar com áudios em português
-3. Verificar se as respostas são coerentes
-Se funcionar bem (esperado), não há necessidade de retreino!
-## 📚 Referências
-- [LLaMA-Omni2 Paper](https://arxiv.org/abs/2412.09339)
-- [HuggingFace Models](https://huggingface.co/ICTNLP)
-- [Whisper Multilingual](https://github.com/openai/whisper)
----
-**Conclusão Final**: O Speech Projector do modelo inglês pode ser usado diretamente no modelo multilíngue sem necessidade de treinamento adicional. 🎉

analyze_generated_audio.py DELETED Viewed

@@ -1,227 +0,0 @@
-#!/usr/bin/env python3
-"""
-Analisador de Áudios Gerados
-===========================
-Verificar língua e transcrição dos áudios gerados pelo sistema
-"""
-import warnings
-warnings.filterwarnings('ignore')
-import os
-import whisper
-import tempfile
-from datetime import datetime
-class AudioAnalyzer:
-    """Analisador de áudios com Whisper em diretório temporário"""
-    def __init__(self):
-        print("🔍 ANALISADOR DE ÁUDIOS GERADOS")
-        print("=" * 35)
-        # Usar Whisper em diretório temporário
-        self.temp_dir = tempfile.mkdtemp(prefix="whisper_analysis_")
-        print(f"📁 Diretório temporário: {self.temp_dir}")
-        # Carregar Whisper base para análise rápida
-        print("📦 Carregando Whisper para análise...")
-        self.whisper = whisper.load_model("base", download_root=self.temp_dir)
-        print("✅ Whisper carregado!\n")
-    def analyze_audio(self, audio_path: str) -> dict:
-        """Analisar um arquivo de áudio"""
-        if not os.path.exists(audio_path):
-            return {"error": f"Arquivo não encontrado: {audio_path}"}
-        print(f"🎵 Analisando: {os.path.basename(audio_path)}")
-        try:
-            # Transcrever sem especificar idioma (auto-detect)
-            result_auto = self.whisper.transcribe(audio_path)
-            # Transcrever forçando português
-            result_pt = self.whisper.transcribe(audio_path, language="pt")
-            # Transcrever forçando inglês
-            result_en = self.whisper.transcribe(audio_path, language="en")
-            # Calcular durações aproximadas
-            import librosa
-            try:
-                y, sr = librosa.load(audio_path)
-                duration = len(y) / sr
-            except:
-                duration = 0
-            analysis = {
-                "file": os.path.basename(audio_path),
-                "duration_seconds": round(duration, 2),
-                "auto_detect": {
-                    "language": result_auto.get("language", "unknown"),
-                    "text": result_auto["text"].strip(),
-                    "confidence": "auto"
-                },
-                "forced_pt": {
-                    "language": "pt",
-                    "text": result_pt["text"].strip()
-                },
-                "forced_en": {
-                    "language": "en",
-                    "text": result_en["text"].strip()
-                }
-            }
-            # Determinar idioma mais provável
-            auto_lang = result_auto.get("language", "unknown")
-            print(f"   🌍 Idioma detectado: {auto_lang}")
-            print(f"   ⏱️ Duração: {duration:.1f}s")
-            print(f"   📝 Auto: '{result_auto['text'].strip()}'")
-            print(f"   🇧🇷 PT: '{result_pt['text'].strip()}'")
-            print(f"   🇺🇸 EN: '{result_en['text'].strip()}'")
-            return analysis
-        except Exception as e:
-            print(f"   ❌ Erro: {e}")
-            return {"error": str(e)}
-    def analyze_generated_responses(self):
-        """Analisar especificamente os áudios de resposta gerados"""
-        print("🤖 ANÁLISE DOS ÁUDIOS DE RESPOSTA GERADOS")
-        print("=" * 45)
-        # Procurar áudios de resposta
-        response_files = []
-        for file in os.listdir("."):
-            if file.startswith("resposta_") and file.endswith(".wav"):
-                response_files.append(file)
-        if not response_files:
-            print("⚠️ Nenhum áudio de resposta encontrado")
-            return []
-        print(f"📊 Encontrados {len(response_files)} áudios de resposta")
-        results = []
-        for i, audio_file in enumerate(sorted(response_files), 1):
-            print(f"\n[{i}/{len(response_files)}] {'-'*30}")
-            analysis = self.analyze_audio(audio_file)
-            if "error" not in analysis:
-                results.append(analysis)
-        return results
-    def analyze_test_audios(self):
-        """Analisar áudios de teste de entrada"""
-        print("\n🎤 ANÁLISE DOS ÁUDIOS DE ENTRADA (TESTE)")
-        print("=" * 40)
-        test_dir = "test_audios"
-        if not os.path.exists(test_dir):
-            print("⚠️ Diretório test_audios não encontrado")
-            return []
-        test_files = [f for f in os.listdir(test_dir) if f.endswith(".wav")]
-        if not test_files:
-            print("⚠️ Nenhum áudio de teste encontrado")
-            return []
-        print(f"📊 Encontrados {len(test_files)} áudios de teste")
-        results = []
-        for i, audio_file in enumerate(sorted(test_files)[:5], 1):  # Apenas primeiros 5
-            print(f"\n[{i}/5] {'-'*30}")
-            audio_path = os.path.join(test_dir, audio_file)
-            analysis = self.analyze_audio(audio_path)
-            if "error" not in analysis:
-                results.append(analysis)
-        return results
-    def generate_report(self, response_results, test_results):
-        """Gerar relatório da análise"""
-        print(f"\n📋 RELATÓRIO DE ANÁLISE DE ÁUDIOS")
-        print("=" * 35)
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        # Análise dos áudios de resposta
-        if response_results:
-            print(f"\n🤖 ÁUDIOS DE RESPOSTA ({len(response_results)} analisados):")
-            languages = {}
-            total_duration = 0
-            for result in response_results:
-                lang = result["auto_detect"]["language"]
-                languages[lang] = languages.get(lang, 0) + 1
-                total_duration += result.get("duration_seconds", 0)
-                print(f"   📁 {result['file']}")
-                print(f"      🌍 {lang.upper()} | ⏱️ {result['duration_seconds']}s")
-                print(f"      📝 '{result['auto_detect']['text'][:50]}{'...' if len(result['auto_detect']['text']) > 50 else ''}'")
-            print(f"\n   📊 Estatísticas das Respostas:")
-            print(f"      ⏱️ Duração total: {total_duration:.1f}s")
-            print(f"      🌍 Idiomas detectados:")
-            for lang, count in languages.items():
-                percentage = (count / len(response_results)) * 100
-                print(f"         {lang.upper()}: {count}/{len(response_results)} ({percentage:.0f}%)")
-        # Análise dos áudios de teste
-        if test_results:
-            print(f"\n🎤 ÁUDIOS DE TESTE ({len(test_results)} analisados):")
-            for result in test_results:
-                lang = result["auto_detect"]["language"]
-                print(f"   📁 {result['file']}")
-                print(f"      🌍 {lang.upper()} | 📝 '{result['auto_detect']['text']}'")
-        # Diagnóstico
-        print(f"\n🔍 DIAGNÓSTICO:")
-        if response_results:
-            pt_responses = sum(1 for r in response_results if r["auto_detect"]["language"] == "pt")
-            en_responses = sum(1 for r in response_results if r["auto_detect"]["language"] == "en")
-            if pt_responses > en_responses:
-                print(f"   ✅ Maioria das respostas em português ({pt_responses}/{len(response_results)})")
-            elif en_responses > pt_responses:
-                print(f"   ⚠️ Maioria das respostas em inglês ({en_responses}/{len(response_results)})")
-                print(f"      💡 Pode indicar problema no prompt do LLM")
-            else:
-                print(f"   🔄 Mix de idiomas nas respostas")
-        print(f"\n💾 Análise concluída em {timestamp}")
-    def cleanup(self):
-        """Limpar diretório temporário"""
-        import shutil
-        try:
-            shutil.rmtree(self.temp_dir)
-            print(f"🧹 Diretório temporário removido: {self.temp_dir}")
-        except:
-            print(f"⚠️ Não foi possível remover: {self.temp_dir}")
-def main():
-    analyzer = AudioAnalyzer()
-    try:
-        # Analisar áudios de resposta
-        response_results = analyzer.analyze_generated_responses()
-        # Analisar áudios de teste
-        test_results = analyzer.analyze_test_audios()
-        # Gerar relatório
-        analyzer.generate_report(response_results, test_results)
-    finally:
-        # Limpar
-        analyzer.cleanup()
-if __name__ == "__main__":
-    main()

benchmark_20q_gpu_final.py DELETED Viewed

@@ -1,312 +0,0 @@
-#!/usr/bin/env python3
-"""
-🚀 Benchmark Final - 20 Perguntas com GPU
-==========================================
-Comparação completa: CPU vs GPU vs vLLM
-"""
-import os
-os.environ['HF_HOME'] = '/tmp/hf_cache'
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-import torch
-import time
-import statistics
-import numpy as np
-import tempfile
-from dataclasses import dataclass
-from typing import List, Dict
-import soundfile as sf
-from gtts import gTTS
-import whisper
-import librosa
-@dataclass
-class BenchmarkResult:
-    config: str
-    latencies: List[float]
-    whisper_times: List[float]
-    llm_times: List[float]
-    tts_times: List[float]
-    @property
-    def mean_latency(self) -> float:
-        return statistics.mean(self.latencies)
-    @property
-    def median_latency(self) -> float:
-        return statistics.median(self.latencies)
-    @property
-    def stdev_latency(self) -> float:
-        return statistics.stdev(self.latencies) if len(self.latencies) > 1 else 0
-# 20 benchmark questions in Portuguese; each one is synthesized to audio and
-# pushed through the Whisper -> LLM -> TTS pipeline by process_question().
-QUESTIONS = [
-    "Qual é a capital do Brasil?",
-    "Como está o tempo hoje?",
-    "O que é inteligência artificial?",
-    "Quantos anos você tem?",
-    "Qual seu nome?",
-    "O que é Python?",
-    "Como fazer um bolo?",
-    "Onde fica Paris?",
-    "Quem descobriu o Brasil?",
-    "Quanto é dois mais dois?",
-    "Qual a cor do céu?",
-    "O que é amor?",
-    "Como funciona um computador?",
-    "Qual o maior país do mundo?",
-    "O que é democracia?",
-    "Quem foi Einstein?",
-    "Como aprender inglês?",
-    "O que é música?",
-    "Qual o sentido da vida?",
-    "Como ser feliz?"
-]
-class GPUBenchmarkSystem:
-    def __init__(self, config_name: str, use_vllm: bool = False, use_gpu: bool = True):
-        self.config_name = config_name
-        self.use_vllm = use_vllm
-        self.use_gpu = use_gpu and torch.cuda.is_available()
-        print(f"\n🔧 Inicializando: {config_name}")
-        print(f"   • GPU: {'✅ Ativada' if self.use_gpu else '❌ CPU'}")
-        print(f"   • vLLM: {'✅ Ativado' if use_vllm else '❌ PyTorch'}")
-        self.load_models()
-    def load_models(self):
-        # Whisper (sempre igual)
-        print("   • Carregando Whisper...")
-        self.whisper_model = whisper.load_model("base", device="cuda" if self.use_gpu else "cpu")
-        if self.use_vllm:
-            # vLLM com Qwen2-0.5B
-            from vllm import LLM, SamplingParams
-            print("   • Carregando Qwen2-0.5B com vLLM...")
-            self.llm = LLM(
-                model="Qwen/Qwen2-0.5B",
-                trust_remote_code=True,
-                dtype="float16",
-                gpu_memory_utilization=0.80,
-                max_model_len=512,
-                download_dir="/tmp/hf_cache",
-                disable_log_stats=True,
-                enforce_eager=False,
-                max_num_seqs=1
-            )
-            self.sampling_params = SamplingParams(
-                max_tokens=20,
-                temperature=0.1,
-                top_p=0.9
-            )
-            # Warm-up vLLM
-            for _ in range(2):
-                _ = self.llm.generate(["teste"], self.sampling_params)
-        else:
-            # PyTorch com Qwen3-0.6B
-            from transformers import AutoTokenizer, AutoModelForCausalLM
-            print("   • Carregando Qwen3-0.6B com PyTorch...")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                "Qwen/Qwen3-0.6B",
-                trust_remote_code=True,
-                cache_dir="/tmp/hf_cache"
-            )
-            dtype = torch.float16 if self.use_gpu else torch.float32
-            device = "cuda" if self.use_gpu else "cpu"
-            self.model = AutoModelForCausalLM.from_pretrained(
-                "Qwen/Qwen3-0.6B",
-                torch_dtype=dtype,
-                trust_remote_code=True,
-                cache_dir="/tmp/hf_cache",
-                device_map=device if self.use_gpu else None
-            )
-            if self.use_gpu:
-                self.model = self.model.cuda()
-                # torch.compile para GPU
-                print("   • Aplicando torch.compile()...")
-                self.model = torch.compile(self.model, mode="reduce-overhead", backend="inductor")
-                # Warm-up
-                for _ in range(3):
-                    inputs = self.tokenizer("teste", return_tensors="pt").to("cuda")
-                    with torch.no_grad():
-                        _ = self.model.generate(**inputs, max_new_tokens=5)
-    def process_question(self, question: str) -> Dict:
-        # Criar áudio
-        tts = gTTS(text=question, lang='pt', slow=False)
-        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
-            tts.save(f.name)
-            audio_data, sr = sf.read(f.name)
-            if sr != 16000:
-                audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)
-        # Whisper
-        whisper_start = time.perf_counter()
-        audio_30s = whisper.pad_or_trim(audio_data.astype(np.float32))
-        mel = whisper.log_mel_spectrogram(audio_30s).to(self.whisper_model.device)
-        with torch.no_grad():
-            _ = self.whisper_model.encoder(mel.unsqueeze(0))
-        whisper_time = (time.perf_counter() - whisper_start) * 1000
-        # LLM
-        llm_start = time.perf_counter()
-        prompt = f"Pergunta: {question}\nResposta:"
-        if self.use_vllm:
-            outputs = self.llm.generate([prompt], self.sampling_params)
-            response = outputs[0].outputs[0].text.strip()
-        else:
-            inputs = self.tokenizer(prompt, return_tensors="pt")
-            if self.use_gpu:
-                inputs = {k: v.cuda() for k, v in inputs.items()}
-                torch.cuda.synchronize()
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=20,
-                    temperature=0.1,
-                    do_sample=False,
-                    pad_token_id=self.tokenizer.eos_token_id
-                )
-            if self.use_gpu:
-                torch.cuda.synchronize()
-            response = self.tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-        llm_time = (time.perf_counter() - llm_start) * 1000
-        # TTS
-        tts_start = time.perf_counter()
-        tts = gTTS(text=response[:50], lang='pt', slow=False)
-        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
-            tts.save(f.name)
-        tts_time = (time.perf_counter() - tts_start) * 1000
-        total_time = whisper_time + llm_time + tts_time
-        return {
-            'whisper': whisper_time,
-            'llm': llm_time,
-            'tts': tts_time,
-            'total': total_time,
-            'response': response
-        }
-    def run_benchmark(self) -> BenchmarkResult:
-        print(f"\n📊 Executando benchmark: {self.config_name}")
-        print("-" * 60)
-        latencies = []
-        whisper_times = []
-        llm_times = []
-        tts_times = []
-        for i, question in enumerate(QUESTIONS[:10], 1):  # Apenas 10 para ser mais rápido
-            result = self.process_question(question)
-            latencies.append(result['total'])
-            whisper_times.append(result['whisper'])
-            llm_times.append(result['llm'])
-            tts_times.append(result['tts'])
-            print(f"  [{i:2d}/10] {result['total']:.0f}ms | {question[:30]}...")
-        return BenchmarkResult(
-            config=self.config_name,
-            latencies=latencies,
-            whisper_times=whisper_times,
-            llm_times=llm_times,
-            tts_times=tts_times
-        )
-def main():
-    print("="*70)
-    print("🚀 BENCHMARK FINAL - GPU vs CPU vs vLLM")
-    print("="*70)
-    results = []
-    # 1. CPU Baseline (PyTorch)
-    try:
-        system_cpu = GPUBenchmarkSystem("CPU PyTorch (Qwen3-0.6B)", use_vllm=False, use_gpu=False)
-        result_cpu = system_cpu.run_benchmark()
-        results.append(result_cpu)
-    except Exception as e:
-        print(f"❌ Erro CPU: {e}")
-    # 2. GPU PyTorch + torch.compile
-    try:
-        system_gpu = GPUBenchmarkSystem("GPU PyTorch + torch.compile (Qwen3-0.6B)", use_vllm=False, use_gpu=True)
-        result_gpu = system_gpu.run_benchmark()
-        results.append(result_gpu)
-    except Exception as e:
-        print(f"❌ Erro GPU PyTorch: {e}")
-    # 3. GPU vLLM
-    try:
-        system_vllm = GPUBenchmarkSystem("GPU vLLM (Qwen2-0.5B)", use_vllm=True, use_gpu=True)
-        result_vllm = system_vllm.run_benchmark()
-        results.append(result_vllm)
-    except Exception as e:
-        print(f"❌ Erro vLLM: {e}")
-    # Relatório final
-    print("\n" + "="*70)
-    print("📊 RELATÓRIO FINAL")
-    print("="*70)
-    for r in results:
-        print(f"\n📌 {r.config}:")
-        print(f"   • Latência média: {r.mean_latency:.0f}ms")
-        print(f"   • Latência mediana: {r.median_latency:.0f}ms")
-        print(f"   • Desvio padrão: ±{r.stdev_latency:.0f}ms")
-        print(f"   • Breakdown:")
-        print(f"     - Whisper: {statistics.mean(r.whisper_times):.0f}ms")
-        print(f"     - LLM: {statistics.mean(r.llm_times):.0f}ms")
-        print(f"     - TTS: {statistics.mean(r.tts_times):.0f}ms")
-        if r.mean_latency < 500:
-            grade = "🟢 EXCELENTE (<500ms)"
-        elif r.mean_latency < 1000:
-            grade = "🟡 BOM (<1s)"
-        elif r.mean_latency < 2000:
-            grade = "🟠 ACEITÁVEL (<2s)"
-        else:
-            grade = "🔴 LENTO (>2s)"
-        print(f"   • Classificação: {grade}")
-    # Melhor configuração
-    if results:
-        best = min(results, key=lambda x: x.mean_latency)
-        print("\n" + "="*70)
-        print("🏆 VENCEDOR:")
-        print(f"   {best.config}")
-        print(f"   Latência: {best.mean_latency:.0f}ms")
-        # Comparação com target de 1030ms
-        if best.mean_latency < 1030:
-            print(f"   ✅ MELHOR que a versão de referência (1030ms)")
-            print(f"   Melhoria: {((1030 - best.mean_latency) / 1030 * 100):.1f}%")
-        else:
-            print(f"   ⚠️ Mais lento que a versão de referência (1030ms)")
-            print(f"   Diferença: +{best.mean_latency - 1030:.0f}ms")
-    print("\n" + "="*70)
-if __name__ == "__main__":
-    main()

coherence_test_results.json DELETED Viewed

@@ -1,82 +0,0 @@
-[
-  {
-    "num": 1,
-    "question": "Qual é a capital do Brasil?",
-    "transcription": "Qual é a capital do Brasil?",
-    "response": "A capital do Brasil é Brasília.",
-    "is_coherent": true,
-    "latency_ms": 1545.55082321167
-  },
-  {
-    "num": 2,
-    "question": "Quanto é dois mais dois?",
-    "transcription": "Quanto é 2 mais 2?",
-    "response": "É igual a quatro.",
-    "is_coherent": true,
-    "latency_ms": 295.9566116333008
-  },
-  {
-    "num": 3,
-    "question": "Quem descobriu o Brasil?",
-    "transcription": "Quem descobriu o Brasil?",
-    "response": "O Brasil foi descoberto pelos europeus na América do Sul por Pedro Álvares Cabral no dia 22 de abril de 1500.\n\nEsta é a data oficialmente reconhecida para o evento histórico. No entanto, é importante notar que há controvérsias sobre essa data:\n\n1. A verdadeira origem dos europeus em território brasileiro não é completamente certa.\n2. Outros navegadores também haviam explorado parte da costa antes de Cabral (como João de Sá).\n3. O termo \"Brasil\" originalmente era usado para designar Portugal.\n\nPortanto, embora o dia 22 de abril seja geralmente considerado o",
-    "is_coherent": true,
-    "latency_ms": 3808.577299118042
-  },
-  {
-    "num": 4,
-    "question": "O que é inteligência artificial?",
-    "transcription": "O que é Inteligência Artificial?",
-    "response": "Inteligência Artificial (IA) é uma disciplina da informática que visa criar sistemas capazes de realizar tarefas que exigem inteligência humana, como aprendizado, compreensão de linguagem natural e decisão baseada em dados.\n\nEsses sistemas podem ser programados para aprender a se adaptar aos novos dados e situações sem precisar ser reprogramados manualmente, o que torna o processo mais eficiente e menos custoso. Algumas aplicações comuns do IA incluem chatbots, robo-assistentes e reconhecimento de padrões em grandes volumes de dados.",
-    "is_coherent": true,
-    "latency_ms": 3344.630718231201
-  },
-  {
-    "num": 5,
-    "question": "Como funciona a fotossíntese?",
-    "transcription": "Como funciona a foto síntese?",
-    "response": "A fotossíntese ocorre nas plantas, algas e alguns fungos, onde elas convertem o sol (energia) em energia química para usar em seu metabolismo.\n\nBasicamente, ela acontece na folha verde da planta ou no \"pâncreas\" das algas, chamado chloroplasto. Aqui, o glúten se transforma em açúcar com o auxílio do carbono presente na água, o oxigênio sendo liberado como resultado. Isso permite que as plantas e outras organismos utilizem esse açúcar para produzir energia.\n\nÉ um processo fundamental para a vida marinha e terrestre porque não há luz solar",
-    "is_coherent": true,
-    "latency_ms": 4024.2114067077637
-  },
-  {
-    "num": 6,
-    "question": "Qual é o maior país do mundo?",
-    "transcription": "Qual é o maior país do mundo?",
-    "response": "O maior país do mundo em área é a Rússia.",
-    "is_coherent": true,
-    "latency_ms": 537.3945236206055
-  },
-  {
-    "num": 7,
-    "question": "Quem foi Santos Dumont?",
-    "transcription": "Quem foi Santos do Món?",
-    "response": "Santos dos Montes Claros (pronúncia: \"SANTOS\") é uma cidade localizada no estado mineiro de Minas Gerais, Brasil.\n\nAqui estão alguns pontos importantes sobre a história e características da cidade:\n\n1. Localização: Está situada na região Centro-Oeste de Minas Gerais, às margens do Rio Paraíba do Sul.\n\n2. População: Tem aproximadamente 100 mil habitantes, conforme estimativa do IBGE para o ano de 2021.\n\n3. Cultura: Possui uma rica tradição cultural mineira, incluindo festivais populares e eventos tradicionais.\n\n4. Economia: A principal ativ",
-    "is_coherent": false,
-    "latency_ms": 3956.460475921631
-  },
-  {
-    "num": 8,
-    "question": "O que são energias renováveis?",
-    "transcription": "O que são energias renováveis?",
-    "response": "Energias renováveis são fontes de energia que podem ser utilizadas sem se esgotar ao longo do tempo, como o sol, a água, o vento, o gás natural e o biomassa. Elas não dependem das reservas minerais ou fósseis para funcionar, o que as torna mais sustentáveis no longo prazo. Algumas vantagens dessas energias incluem:\n\n1. Não poluem: As emissões de gases de efeito estufa são muito menores.\n2. Menor custo operacional com o tempo.\n3. Mais seguras e menos afetam a população local.\n4. Melhora a qualidade do ar.\n\nAlguns",
-    "is_coherent": true,
-    "latency_ms": 3979.304313659668
-  },
-  {
-    "num": 9,
-    "question": "Qual é a diferença entre vírus e bactéria?",
-    "transcription": "Qual é a diferença entre vírus e bacteria?",
-    "response": "Vírus e bactéria são dois tipos diferentes de organismos microbianos. Vários fatores diferenciam esses microrganismos:\n\n1. Origem: Viruses são geralmente considerados formas virais ou ligeiramente modificadas células bacterianas, enquanto bactérias são autônomas.\n\n2. Propagação:\n   - Viruses precisam de células vivas para reproduzir.\n   - Bacterias podem dividir-se independentemente das células.\n\n3. Complexidade: Viruses têm menos material genético do que bactérias.\n\n4. Duração de vida: Virus pode durar apenas uma vez dentro de uma cél",
-    "is_coherent": true,
-    "latency_ms": 3820.1889991760254
-  },
-  {
-    "num": 10,
-    "question": "Para que serve a vacina?",
-    "transcription": "Para que serve a vacina.",
-    "response": "A vacina serve para prevenir doenças infecciosas transmitidas por vírus ou bactérias. Ela estimula o sistema imunológico da pessoa, tornando-a resistente à doença ao contrário de quem não recebeu a vacina.",
-    "is_coherent": true,
-    "latency_ms": 1460.127830505371
-  }
-]

communication_analysis_report.py DELETED Viewed

@@ -1,388 +0,0 @@
-#!/usr/bin/env python3
-"""
-LLaMA-Omni2 Communication Architecture Analysis
-==============================================
-Detailed analysis of current communication methods and gRPC comparison
-"""
-import json
-import time
-from datetime import datetime
-class CommunicationAnalyzer:
-    def __init__(self):
-        self.analysis_results = {
-            "timestamp": datetime.now().isoformat(),
-            "current_architecture": {},
-            "performance_analysis": {},
-            "grpc_comparison": {},
-            "recommendations": {}
-        }
-    def analyze_current_architecture(self):
-        """Analyze the current communication architecture"""
-        print("🔍 ANALYZING CURRENT COMMUNICATION ARCHITECTURE")
-        print("=" * 60)
-        architecture = {
-            "protocol": "HTTP/1.1 REST",
-            "transport": "TCP over HTTP",
-            "serialization": "JSON",
-            "connection_model": "Request/Response per operation",
-            "streaming": "HTTP chunked transfer encoding",
-            "ports": {
-                "controller": 21001,
-                "worker_default": 21002,
-                "web_server": "Variable (Gradio)"
-            },
-            "endpoints": {
-                "controller": [
-                    "/register_worker",
-                    "/refresh_all_workers",
-                    "/list_models",
-                    "/get_worker_address",
-                    "/receive_heart_beat",
-                    "/worker_generate_stream",
-                    "/worker_get_status"
-                ],
-                "worker": [
-                    "/worker_generate_stream",
-                    "/worker_get_status"
-                ]
-            }
-        }
-        print(f"📡 Current Communication Method:")
-        print(f"   • Protocol: {architecture['protocol']}")
-        print(f"   • Transport: {architecture['transport']}")
-        print(f"   • Serialization: {architecture['serialization']}")
-        print(f"   • Connection Model: {architecture['connection_model']}")
-        print(f"   • Streaming: {architecture['streaming']}")
-        print(f"\n🏗️ Service Architecture:")
-        print(f"   • Controller (Port {architecture['ports']['controller']}): Manages workers, routes requests")
-        print(f"   • Worker (Port {architecture['ports']['worker_default']}+): Runs models, processes requests")
-        print(f"   • Web Server: Gradio UI, handles user interactions")
-        print(f"\n🔗 Communication Flow:")
-        print(f"   1. Web Server → Controller: Get worker address")
-        print(f"   2. Web Server → Worker: Stream generation request")
-        print(f"   3. Worker → Controller: Heartbeat & status updates")
-        print(f"   4. Controller → Worker: Health checks")
-        self.analysis_results["current_architecture"] = architecture
-        return architecture
-    def analyze_latency_sources(self):
-        """Identify sources of latency in current system"""
-        print(f"\n⏱️ LATENCY SOURCES ANALYSIS")
-        print("-" * 60)
-        latency_sources = {
-            "http_overhead": {
-                "description": "HTTP request/response overhead",
-                "typical_impact_ms": "5-15ms per request",
-                "factors": [
-                    "TCP connection establishment",
-                    "HTTP header parsing",
-                    "Request/response serialization",
-                    "Connection teardown"
-                ]
-            },
-            "json_serialization": {
-                "description": "JSON encoding/decoding overhead",
-                "typical_impact_ms": "2-8ms per request",
-                "factors": [
-                    "JSON parsing/stringification",
-                    "Large payload serialization",
-                    "Unicode encoding/decoding"
-                ]
-            },
-            "network_roundtrips": {
-                "description": "Network round-trip times",
-                "typical_impact_ms": "1-50ms per request (local: 1-5ms)",
-                "factors": [
-                    "Physical network latency",
-                    "Network congestion",
-                    "Multiple round-trips for discovery"
-                ]
-            },
-            "service_discovery": {
-                "description": "Worker address resolution",
-                "typical_impact_ms": "10-30ms per request",
-                "factors": [
-                    "Controller query for worker address",
-                    "Load balancing decisions",
-                    "Health check validations"
-                ]
-            },
-            "streaming_setup": {
-                "description": "HTTP streaming connection setup",
-                "typical_impact_ms": "15-40ms per stream",
-                "factors": [
-                    "Chunked transfer encoding setup",
-                    "Stream buffer initialization",
-                    "Connection keep-alive negotiation"
-                ]
-            }
-        }
-        total_estimated_latency = 0
-        for source, details in latency_sources.items():
-            # Extract average from range
-            impact_range = details["typical_impact_ms"]
-            if "-" in impact_range and "ms" in impact_range:
-                range_str = impact_range.replace("ms per request", "").replace("ms per stream", "")
-                if "(" in range_str:
-                    range_str = range_str.split("(")[0]
-                try:
-                    range_parts = range_str.split("-")
-                    avg_impact = (float(range_parts[0]) + float(range_parts[1])) / 2
-                    total_estimated_latency += avg_impact
-                except:
-                    pass
-            print(f"   • {details['description']}")
-            print(f"     Impact: {details['typical_impact_ms']}")
-            for factor in details['factors'][:2]:  # Show first 2 factors
-                print(f"     - {factor}")
-        print(f"\n📊 Estimated Total HTTP/REST Overhead: ~{total_estimated_latency:.0f}ms per request")
-        self.analysis_results["latency_sources"] = latency_sources
-        self.analysis_results["estimated_http_overhead_ms"] = total_estimated_latency
-        return latency_sources, total_estimated_latency
-    def compare_with_grpc(self):
-        """Compare current HTTP/REST with gRPC performance"""
-        print(f"\n⚡ HTTP/REST vs gRPC COMPARISON")
-        print("-" * 60)
-        # Based on typical benchmarks and your mentioned ~50ms gRPC performance
-        comparison = {
-            "http_rest": {
-                "protocol_overhead_ms": 15,
-                "serialization_overhead_ms": 8,
-                "connection_overhead_ms": 12,
-                "service_discovery_ms": 20,
-                "total_per_request_ms": 55,
-                "streaming_setup_ms": 25
-            },
-            "grpc": {
-                "protocol_overhead_ms": 3,   # HTTP/2 binary protocol
-                "serialization_overhead_ms": 2,  # Protocol Buffers
-                "connection_overhead_ms": 5,  # HTTP/2 multiplexing
-                "service_discovery_ms": 15,  # Still needs service discovery
-                "total_per_request_ms": 25,  # Your mentioned ~50ms seems high, typical gRPC is lower
-                "streaming_setup_ms": 8     # HTTP/2 native streaming
-            }
-        }
-        # Your gRPC test mentioned ~50ms, so let's use that as reference
-        grpc_reference_ms = 50
-        http_estimated_ms = comparison["http_rest"]["total_per_request_ms"]
-        improvement_percent = ((http_estimated_ms - grpc_reference_ms) / http_estimated_ms) * 100
-        speed_multiplier = http_estimated_ms / grpc_reference_ms
-        print(f"📈 Performance Metrics Comparison:")
-        print(f"   {'Metric':<25} {'HTTP/REST':<12} {'gRPC':<12} {'Improvement':<12}")
-        print(f"   {'-'*25} {'-'*12} {'-'*12} {'-'*12}")
-        print(f"   {'Protocol Overhead':<25} {comparison['http_rest']['protocol_overhead_ms']:>8}ms {comparison['grpc']['protocol_overhead_ms']:>8}ms {((comparison['http_rest']['protocol_overhead_ms'] - comparison['grpc']['protocol_overhead_ms'])/comparison['http_rest']['protocol_overhead_ms']*100):>8.1f}%")
-        print(f"   {'Serialization':<25} {comparison['http_rest']['serialization_overhead_ms']:>8}ms {comparison['grpc']['serialization_overhead_ms']:>8}ms {((comparison['http_rest']['serialization_overhead_ms'] - comparison['grpc']['serialization_overhead_ms'])/comparison['http_rest']['serialization_overhead_ms']*100):>8.1f}%")
-        print(f"   {'Connection Setup':<25} {comparison['http_rest']['connection_overhead_ms']:>8}ms {comparison['grpc']['connection_overhead_ms']:>8}ms {((comparison['http_rest']['connection_overhead_ms'] - comparison['grpc']['connection_overhead_ms'])/comparison['http_rest']['connection_overhead_ms']*100):>8.1f}%")
-        print(f"   {'Streaming Setup':<25} {comparison['http_rest']['streaming_setup_ms']:>8}ms {comparison['grpc']['streaming_setup_ms']:>8}ms {((comparison['http_rest']['streaming_setup_ms'] - comparison['grpc']['streaming_setup_ms'])/comparison['http_rest']['streaming_setup_ms']*100):>8.1f}%")
-        print(f"   {'-'*25} {'-'*12} {'-'*12} {'-'*12}")
-        print(f"   {'Total per Request':<25} {http_estimated_ms:>8}ms {grpc_reference_ms:>8}ms {improvement_percent:>8.1f}%")
-        print(f"\n🎯 Key Advantages of gRPC:")
-        advantages = [
-            "HTTP/2 binary protocol (vs HTTP/1.1 text)",
-            "Protocol Buffers (vs JSON) - 3-10x faster serialization",
-            "Native bidirectional streaming",
-            "Connection multiplexing (multiple streams per connection)",
-            "Header compression (HPACK)",
-            "Built-in compression (gzip, deflate)",
-            "Automatic connection pooling",
-            "Type-safe service definitions"
-        ]
-        for advantage in advantages:
-            print(f"   • {advantage}")
-        self.analysis_results["grpc_comparison"] = {
-            "http_rest_total_ms": http_estimated_ms,
-            "grpc_reference_ms": grpc_reference_ms,
-            "improvement_percentage": improvement_percent,
-            "speed_multiplier": speed_multiplier,
-            "detailed_comparison": comparison
-        }
-        return comparison, improvement_percent
-    def generate_recommendations(self):
-        """Generate migration recommendations"""
-        print(f"\n💡 MIGRATION RECOMMENDATIONS")
-        print("-" * 60)
-        grpc_improvement = self.analysis_results["grpc_comparison"]["improvement_percentage"]
-        recommendations = {
-            "priority": "HIGH" if grpc_improvement > 30 else "MEDIUM" if grpc_improvement > 15 else "LOW",
-            "expected_improvement": f"{grpc_improvement:.1f}%",
-            "migration_phases": [],
-            "implementation_considerations": [],
-            "estimated_effort": "Medium (2-3 weeks)",
-            "risk_level": "Low-Medium"
-        }
-        # Migration phases
-        if grpc_improvement > 15:
-            recommendations["migration_phases"] = [
-                {
-                    "phase": "Phase 1: Proto Definition",
-                    "description": "Define Protocol Buffer schemas for all API endpoints",
-                    "effort": "3-5 days",
-                    "risk": "Low"
-                },
-                {
-                    "phase": "Phase 2: Controller Migration",
-                    "description": "Migrate controller service to gRPC",
-                    "effort": "5-7 days",
-                    "risk": "Medium"
-                },
-                {
-                    "phase": "Phase 3: Worker Migration",
-                    "description": "Migrate worker services to gRPC",
-                    "effort": "4-6 days",
-                    "risk": "Medium"
-                },
-                {
-                    "phase": "Phase 4: Web Server Integration",
-                    "description": "Update Gradio web server to use gRPC",
-                    "effort": "3-4 days",
-                    "risk": "Low-Medium"
-                }
-            ]
-        recommendations["implementation_considerations"] = [
-            "Maintain backward compatibility during migration",
-            "Implement comprehensive error handling for gRPC status codes",
-            "Add proper logging and monitoring for gRPC services",
-            "Consider connection pooling and load balancing",
-            "Implement graceful degradation to HTTP/REST if needed",
-            "Add comprehensive testing for streaming scenarios",
-            "Monitor memory usage with persistent connections"
-        ]
-        print(f"🎯 Migration Priority: {recommendations['priority']}")
-        print(f"📈 Expected Performance Improvement: {recommendations['expected_improvement']}")
-        print(f"⏱️ Estimated Implementation Time: {recommendations['estimated_effort']}")
-        print(f"⚠️ Risk Level: {recommendations['risk_level']}")
-        if recommendations["migration_phases"]:
-            print(f"\n📋 Recommended Migration Phases:")
-            for i, phase in enumerate(recommendations["migration_phases"], 1):
-                print(f"   {i}. {phase['phase']} ({phase['effort']})")
-                print(f"      {phase['description']}")
-        print(f"\n🔧 Key Implementation Considerations:")
-        for consideration in recommendations["implementation_considerations"][:5]:
-            print(f"   • {consideration}")
-        self.analysis_results["recommendations"] = recommendations
-        return recommendations
-    def calculate_business_impact(self):
-        """Calculate business impact of latency improvements"""
-        print(f"\n💰 BUSINESS IMPACT ANALYSIS")
-        print("-" * 60)
-        grpc_improvement = self.analysis_results["grpc_comparison"]["improvement_percentage"]
-        current_latency = self.analysis_results["grpc_comparison"]["http_rest_total_ms"]
-        improved_latency = self.analysis_results["grpc_comparison"]["grpc_reference_ms"]
-        # Estimate business impacts
-        business_impact = {
-            "user_experience": {
-                "current_perceived_latency": "Noticeable delay",
-                "improved_perceived_latency": "Near real-time",
-                "user_satisfaction_improvement": f"{min(grpc_improvement * 0.8, 40):.0f}%"
-            },
-            "system_capacity": {
-                "concurrent_users_improvement": f"{grpc_improvement * 0.6:.0f}%",
-                "resource_utilization_improvement": f"{grpc_improvement * 0.4:.0f}%",
-                "server_cost_reduction": f"{grpc_improvement * 0.3:.0f}%"
-            },
-            "competitive_advantage": {
-                "time_to_response": f"{current_latency}ms → {improved_latency}ms",
-                "market_positioning": "Industry-leading latency",
-                "user_retention_impact": "Positive"
-            }
-        }
-        print(f"👥 User Experience Impact:")
-        print(f"   • Latency reduction: {current_latency:.0f}ms → {improved_latency:.0f}ms")
-        print(f"   • Perceived performance: {business_impact['user_experience']['current_perceived_latency']} → {business_impact['user_experience']['improved_perceived_latency']}")
-        print(f"   • User satisfaction: +{business_impact['user_experience']['user_satisfaction_improvement']}")
-        print(f"\n🚀 System Capacity Impact:")
-        print(f"   • Concurrent users: +{business_impact['system_capacity']['concurrent_users_improvement']}")
-        print(f"   • Resource efficiency: +{business_impact['system_capacity']['resource_utilization_improvement']}")
-        print(f"   • Infrastructure cost: -{business_impact['system_capacity']['server_cost_reduction']}")
-        print(f"\n🏆 Competitive Advantage:")
-        print(f"   • Response time: {business_impact['competitive_advantage']['time_to_response']}")
-        print(f"   • Market position: {business_impact['competitive_advantage']['market_positioning']}")
-        self.analysis_results["business_impact"] = business_impact
-        return business_impact
-    def save_report(self):
-        """Save comprehensive analysis report"""
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"/workspace/llama_omni2_communication_analysis_{timestamp}.json"
-        with open(filename, 'w') as f:
-            json.dump(self.analysis_results, f, indent=2)
-        print(f"\n💾 Complete analysis saved to: {filename}")
-        return filename
-    def run_complete_analysis(self):
-        """Run comprehensive communication analysis"""
-        print("🚀 LLaMA-Omni2 COMMUNICATION ARCHITECTURE ANALYSIS")
-        print("=" * 80)
-        # Run all analyses
-        self.analyze_current_architecture()
-        latency_sources, total_http_overhead = self.analyze_latency_sources()
-        comparison, improvement = self.compare_with_grpc()
-        recommendations = self.generate_recommendations()
-        business_impact = self.calculate_business_impact()
-        report_file = self.save_report()
-        # Final summary
-        print(f"\n" + "=" * 80)
-        print(f"📊 EXECUTIVE SUMMARY")
-        print(f"=" * 80)
-        print(f"🔍 Current System: HTTP/REST with JSON serialization")
-        print(f"⚡ Proposed System: gRPC with Protocol Buffers")
-        print(f"📈 Performance Improvement: {improvement:.1f}%")
-        print(f"⏱️ Latency Reduction: {total_http_overhead:.0f}ms → ~50ms")
-        print(f"🎯 Migration Priority: {recommendations['priority']}")
-        print(f"💰 Business Impact: Positive across all metrics")
-        print(f"📋 Next Steps: Begin with Protocol Buffer definition")
-        print(f"=" * 80)
-        return self.analysis_results
-def main():
-    analyzer = CommunicationAnalyzer()
-    results = analyzer.run_complete_analysis()
-    return results
-if __name__ == "__main__":
-    main()

communication_latency_test.py DELETED Viewed

@@ -1,370 +0,0 @@
-#!/usr/bin/env python3
-"""
-Communication Latency Test - HTTP/REST vs gRPC
-==============================================
-Measure the actual communication latency between services in LLaMA-Omni2
-"""
-import requests
-import json
-import time
-import statistics
-import os
-from datetime import datetime
-class CommunicationLatencyTester:
-    """Measure HTTP/REST round-trip latency against the controller and a worker.
-    Every ``test_*`` method prints per-request timings and returns the raw
-    samples in milliseconds; summaries are stored under
-    ``self.results["http_rest"]`` and consumed by ``generate_report()``.
-    """
-    def __init__(self):
-        self.controller_url = "http://localhost:21001"
-        # Set by test_worker_discovery(); the worker tests bail out while it is None.
-        self.worker_url = None
-        # Each http_rest entry starts as an empty list and is replaced by a
-        # summary dict once the matching test has collected samples.
-        self.results = {
-            "http_rest": {
-                "controller_requests": [],
-                "worker_status_requests": [],
-                "streaming_requests": []
-            },
-            "timestamps": datetime.now().isoformat()
-        }
-    @staticmethod
-    def _summarize(samples, with_std_dev=True):
-        """Return the summary dict (average/min/max[/std-dev]/samples) for a non-empty sample list."""
-        summary = {
-            "average_ms": statistics.mean(samples),
-            "min_ms": min(samples),
-            "max_ms": max(samples),
-        }
-        if with_std_dev:
-            # statistics.stdev() needs at least two data points.
-            summary["std_dev_ms"] = statistics.stdev(samples) if len(samples) > 1 else 0
-        summary["samples"] = samples
-        return summary
-    @staticmethod
-    def _print_summary(title, summary):
-        """Print the statistics of a summary dict under the given heading."""
-        print(f"\n📊 {title}:")
-        print(f"   • Average: {summary['average_ms']:.1f}ms")
-        print(f"   • Min:     {summary['min_ms']:.1f}ms")
-        print(f"   • Max:     {summary['max_ms']:.1f}ms")
-        if "std_dev_ms" in summary:
-            print(f"   • Std Dev: {summary['std_dev_ms']:.1f}ms")
-    @staticmethod
-    def _average_ms(entry):
-        """Return the stored average, or 0 while the entry is still the empty placeholder list."""
-        return entry.get("average_ms", 0) if isinstance(entry, dict) else 0
-    @staticmethod
-    def _improvement_pct(current_ms, target_ms):
-        """Return the relative reduction from current_ms to target_ms in percent (0 if nothing was measured)."""
-        return (current_ms - target_ms) / current_ms * 100 if current_ms > 0 else 0
-    def _timed_posts(self, label, url, iterations, on_success=None, **post_kwargs):
-        """POST to ``url`` repeatedly and return the latencies (ms) of the HTTP 200 replies.
-        Args:
-            label: Prefix for the per-request progress line.
-            url: Endpoint to POST to.
-            iterations: Number of requests to send.
-            on_success: Optional callable receiving the response; it runs before
-                the clock is stopped, so its cost is part of the sample.
-            **post_kwargs: Extra keyword arguments forwarded to ``requests.post``.
-        """
-        latencies = []
-        for i in range(iterations):
-            # perf_counter() is monotonic; time.time() can jump when the system clock is adjusted.
-            start = time.perf_counter()
-            try:
-                response = requests.post(url, timeout=5, **post_kwargs)
-                if response.status_code == 200:
-                    if on_success is not None:
-                        on_success(response)
-                    latency_ms = (time.perf_counter() - start) * 1000
-                    latencies.append(latency_ms)
-                    print(f"   {label} {i+1:2d}: {latency_ms:.1f}ms")
-                else:
-                    print(f"   {label} {i+1:2d}: HTTP {response.status_code}")
-            except Exception as e:
-                # Best-effort probe: report the failure and move on to the next attempt.
-                print(f"   {label} {i+1:2d}: Error - {e}")
-                continue
-            time.sleep(0.1)  # Small delay between requests
-        return latencies
-    def test_controller_communication(self, iterations=10):
-        """Test communication latency with the controller"""
-        print("🎯 Testing Controller Communication (HTTP/REST)")
-        print("-" * 50)
-        latencies = self._timed_posts("Request", f"{self.controller_url}/list_models", iterations)
-        if latencies:
-            summary = self._summarize(latencies)
-            self._print_summary("Controller Communication Results", summary)
-            self.results["http_rest"]["controller_requests"] = summary
-        return latencies
-    def test_worker_discovery(self, iterations=10):
-        """Test worker discovery latency and remember the worker address the controller returns"""
-        print("\n🔍 Testing Worker Discovery")
-        print("-" * 50)
-        def remember_worker(response):
-            self.worker_url = response.json().get("address")
-        latencies = self._timed_posts(
-            "Discovery",
-            f"{self.controller_url}/get_worker_address",
-            iterations,
-            on_success=remember_worker,
-            json={"model": "LLaMA-Omni2-1.5B"},
-        )
-        if latencies:
-            avg_latency = statistics.mean(latencies)
-            print(f"\n📊 Worker Discovery Results:")
-            print(f"   • Average: {avg_latency:.1f}ms")
-            print(f"   • Worker:  {self.worker_url}")
-        return latencies
-    def test_worker_communication(self, iterations=10):
-        """Test direct worker communication"""
-        if not self.worker_url:
-            print("\n❌ No worker URL available")
-            return []
-        print(f"\n🏭 Testing Worker Communication")
-        print(f"   Worker: {self.worker_url}")
-        print("-" * 50)
-        latencies = self._timed_posts("Status", f"{self.worker_url}/worker_get_status", iterations)
-        if latencies:
-            summary = self._summarize(latencies)
-            self._print_summary("Worker Communication Results", summary)
-            self.results["http_rest"]["worker_status_requests"] = summary
-        return latencies
-    def test_streaming_overhead(self, iterations=5):
-        """Test streaming request setup overhead (time until the first non-empty chunk)"""
-        if not self.worker_url:
-            print("\n❌ No worker URL available for streaming test")
-            return []
-        print(f"\n🌊 Testing Streaming Request Overhead")
-        print("-" * 50)
-        # Use a minimal test audio file
-        test_audio_path = "/workspace/llama-omni2-official-code/test_audios/01_Olá.wav"
-        if not os.path.exists(test_audio_path):
-            print(f"❌ Test audio not found: {test_audio_path}")
-            return []
-        pload = {
-            "model": "LLaMA-Omni2-1.5B",
-            "history": [{"role": "user", "content": {"path": test_audio_path}}],
-            "temperature": 0.7,
-            "top_p": 0.9,
-            "max_new_tokens": 20  # Minimal for testing
-        }
-        latencies = []
-        for i in range(iterations):
-            print(f"   Stream {i+1:2d}: ", end="", flush=True)
-            connection_latency = None
-            request_start = time.perf_counter()
-            try:
-                response = requests.post(
-                    f"{self.worker_url}/worker_generate_stream",
-                    json=pload,
-                    stream=True,
-                    timeout=30
-                )
-                try:
-                    for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
-                        if chunk:
-                            # Only the first non-empty chunk matters for setup overhead.
-                            connection_latency = (time.perf_counter() - request_start) * 1000
-                            break
-                finally:
-                    # Release the streamed connection even when iteration fails.
-                    response.close()
-            except Exception as e:
-                print(f"Error - {e}")
-                continue
-            # Always terminate the progress line started above.
-            if connection_latency is None:
-                print("No data received")
-            else:
-                latencies.append(connection_latency)
-                print(f"Connection: {connection_latency:.1f}ms")
-            time.sleep(1)  # Longer delay for streaming tests
-        if latencies:
-            summary = self._summarize(latencies, with_std_dev=False)
-            self._print_summary("Streaming Setup Overhead", summary)
-            self.results["http_rest"]["streaming_requests"] = summary
-        return latencies
-    def run_comprehensive_test(self):
-        """Run all communication tests and return the report from generate_report()"""
-        print("🚀 LLaMA-Omni2 Communication Latency Analysis")
-        print("=" * 60)
-        # Discovery must run before the worker tests because it sets self.worker_url.
-        self.test_controller_communication()
-        self.test_worker_discovery()
-        self.test_worker_communication()
-        self.test_streaming_overhead()
-        return self.generate_report()
-    def generate_report(self, output_dir="/workspace"):
-        """Print the HTTP/REST vs. gRPC comparison and save the raw results as JSON.
-        Tests that collected no samples count as 0 ms instead of aborting the report.
-        Args:
-            output_dir: Directory that receives
-                ``communication_latency_analysis_<timestamp>.json``.
-        Returns:
-            dict with ``current_http_latency_ms``, ``estimated_grpc_latency_ms``,
-            ``improvement_percentage``, ``speed_multiplier`` and ``recommendations``.
-        """
-        print("\n" + "=" * 60)
-        print("📊 COMPREHENSIVE COMMUNICATION ANALYSIS")
-        print("=" * 60)
-        # Current HTTP/REST metrics
-        http_rest = self.results["http_rest"]
-        controller_avg = self._average_ms(http_rest["controller_requests"])
-        worker_avg = self._average_ms(http_rest["worker_status_requests"])
-        streaming_avg = self._average_ms(http_rest["streaming_requests"])
-        # Typical flow = one controller round trip plus one worker round trip.
-        typical_flow_latency = controller_avg + worker_avg
-        print(f"\n🌐 Current HTTP/REST Communication:")
-        print(f"   • Controller requests:     {controller_avg:.1f}ms")
-        print(f"   • Worker status requests:  {worker_avg:.1f}ms")
-        print(f"   • Streaming setup:         {streaming_avg:.1f}ms")
-        print(f"   • Typical request flow:    {typical_flow_latency:.1f}ms")
-        # Fixed reference figure; this script does not measure gRPC itself.
-        grpc_latency = 50.0  # ms per request
-        print(f"\n⚡ gRPC Communication (Reference):")
-        print(f"   • Per request latency:     ~{grpc_latency:.1f}ms")
-        improvement = self._improvement_pct(typical_flow_latency, grpc_latency)
-        speedup = typical_flow_latency / grpc_latency
-        print(f"\n📈 Performance Comparison:")
-        print(f"   • Current total latency:   {typical_flow_latency:.1f}ms")
-        print(f"   • gRPC total latency:      ~{grpc_latency:.1f}ms")
-        if improvement > 0:
-            print(f"   • Potential improvement:   {improvement:.1f}%")
-            print(f"   • Speed increase:          {speedup:.1f}x faster")
-        else:
-            print(f"   • Current system is already optimized")
-        # Detailed breakdown table; the per-operation gRPC figures are fixed
-        # fractions of the reference latency, not measurements.
-        print(f"\n📋 Detailed Latency Breakdown:")
-        print("-" * 60)
-        print(f"{'Operation':<25} {'HTTP/REST':<12} {'gRPC (est.)':<12} {'Improvement':<11}")
-        print("-" * 60)
-        rows = [
-            ("Controller query", controller_avg, grpc_latency * 0.8),
-            ("Worker status", worker_avg, grpc_latency * 0.6),
-            ("Streaming setup", streaming_avg, grpc_latency * 0.4),
-        ]
-        for operation, current_ms, estimate_ms in rows:
-            print(f"{operation:<25} {current_ms:>8.1f}ms {estimate_ms:>9.1f}ms {self._improvement_pct(current_ms, estimate_ms):>8.1f}%")
-        print(f"{'Total per request':<25} {typical_flow_latency:>8.1f}ms {grpc_latency:>9.1f}ms {improvement:>8.1f}%")
-        # Recommendations
-        if improvement > 30:
-            verdict = "HIGH IMPACT"
-        elif improvement > 10:
-            verdict = "MODERATE IMPACT"
-        else:
-            verdict = "LOW IMPACT"
-        print(f"\n💡 Recommendations:")
-        if verdict == "HIGH IMPACT":
-            print("   🚀 HIGH IMPACT: gRPC migration would provide significant improvement")
-            print(f"   📈 Expected latency reduction: {typical_flow_latency:.1f}ms → {grpc_latency:.1f}ms")
-        elif verdict == "MODERATE IMPACT":
-            print("   ✅ MODERATE IMPACT: gRPC migration recommended")
-        else:
-            print("   📊 LOW IMPACT: Current HTTP/REST performance is acceptable")
-        print(f"\n🔍 Current Communication Method Details:")
-        print(f"   • Protocol: HTTP/1.1 REST")
-        print(f"   • Transport: TCP over HTTP")
-        print(f"   • Serialization: JSON")
-        print(f"   • Connection: Request/Response per operation")
-        print(f"   • Streaming: HTTP chunked transfer encoding")
-        # Save results
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        output_file = os.path.join(output_dir, f"communication_latency_analysis_{timestamp}.json")
-        with open(output_file, 'w') as f:
-            json.dump(self.results, f, indent=2)
-        print(f"\n💾 Results saved to: {output_file}")
-        return {
-            "current_http_latency_ms": typical_flow_latency,
-            "estimated_grpc_latency_ms": grpc_latency,
-            "improvement_percentage": improvement,
-            "speed_multiplier": speedup,
-            "recommendations": verdict
-        }
-def main():
-    """Run the latency suite when the controller answers on localhost:21001.
-    Returns:
-        The summary dict from ``run_comprehensive_test()``, or ``None`` when
-        neither probe request could reach the controller.
-    """
-    controller_url = "http://localhost:21001"
-    def reachable(send, url):
-        # Any HTTP reply, even an error status, proves the service is up; only
-        # transport-level failures count as "not running". Catching
-        # RequestException instead of a bare except lets Ctrl-C propagate.
-        try:
-            send(url, timeout=2)
-        except requests.RequestException:
-            return False
-        return True
-    # Check if services are running: plain GET first, then the list_models endpoint.
-    if not (reachable(requests.get, controller_url)
-            or reachable(requests.post, f"{controller_url}/list_models")):
-        print("❌ Controller service not running on localhost:21001")
-        print("   Please start the controller first with:")
-        print("   python llama_omni2/serve/controller.py --port 21001")
-        return None
-    tester = CommunicationLatencyTester()
-    return tester.run_comprehensive_test()
-if __name__ == "__main__":
-    main()

create_real_speech.py DELETED Viewed

@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-"""
-Criar áudio de fala real em português usando TTS
-===============================================
-"""
-import torch
-import torchaudio
-import numpy as np
-import wave
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-import os
-def _create_speech_like_audio(text, filename, sample_rate=16000):
-    """Write a mono 16-bit WAV whose spectrum loosely imitates speech formants."""
-    duration = len(text) * 0.1 + 1.0  # ~0.1 s per character
-    t = np.arange(int(sample_rate * duration)) / sample_rate
-    def swept_tone(centre_hz, depth_hz, rate_hz):
-        # Integrate the instantaneous frequency to obtain the phase. Multiplying
-        # a time-varying frequency by t makes the pitch drift further from the
-        # intended band the longer the clip runs.
-        freq = centre_hz + depth_hz * np.sin(2 * np.pi * rate_hz * t)
-        return np.sin(2 * np.pi * np.cumsum(freq) / sample_rate)
-    # Basic speech formants (F1, F2, F3)
-    audio = (
-        0.4 * swept_tone(500, 200, 2) +    # F1 sweeps 300-700 Hz
-        0.3 * swept_tone(1500, 300, 3) +   # F2 sweeps 1200-1800 Hz
-        0.2 * swept_tone(2500, 200, 1.5)   # F3 sweeps 2300-2700 Hz
-    )
-    # Slow amplitude modulation (stands in for prosody)
-    envelope = 0.5 * (1 + np.sin(2 * np.pi * 0.5 * t))
-    audio *= envelope
-    # Randomly attenuate ~5% of the samples by 80%
-    word_breaks = np.random.choice([0, 1], size=len(audio), p=[0.95, 0.05])
-    audio *= (1 - word_breaks * 0.8)
-    # Clip to [-1, 1] and convert to 16-bit PCM
-    audio = np.clip(audio, -1, 1)
-    audio_int16 = (audio * 32767).astype(np.int16)
-    with wave.open(filename, 'wb') as wav:
-        wav.setnchannels(1)
-        wav.setsampwidth(2)
-        wav.setframerate(sample_rate)
-        wav.writeframes(audio_int16.tobytes())
-def create_speech_sample(output_dir="/workspace/llama-omni2-troca-llm/test_audios"):
-    """Create Portuguese test utterances: espeak TTS files plus synthetic stand-ins.
-    Args:
-        output_dir: Directory that receives ``pergunta_N.wav`` (espeak output)
-            and ``speech_sim_N.wav`` (synthetic audio); created when missing.
-    """
-    import subprocess
-    # Portuguese test sentences
-    texts = [
-        "Qual é a capital do Brasil?",
-        "Como funciona a inteligência artificial?",
-        "Explique sobre energia renovável",
-        "O que você sabe sobre machine learning?"
-    ]
-    # Neither espeak nor wave.open creates missing parent directories.
-    os.makedirs(output_dir, exist_ok=True)
-    print("🎙️ Criando amostras de fala em português...")
-    # Method 1: basic synthesis with espeak
-    for i, text in enumerate(texts, start=1):
-        output_file = os.path.join(output_dir, f"pergunta_{i}.wav")
-        cmd = [
-            'espeak',
-            '-s', '150',  # speed
-            '-v', 'pt-br',  # Brazilian Portuguese voice
-            '-w', output_file,  # output file
-            text
-        ]
-        try:
-            result = subprocess.run(cmd, capture_output=True, text=True)
-        except OSError as e:
-            # espeak missing or not executable; the synthetic files below are still written.
-            print(f"❌ Erro: {e}")
-            continue
-        if result.returncode == 0:
-            print(f"✅ Criado: {output_file}")
-            print(f"   Texto: '{text}'")
-        else:
-            print(f"❌ Erro ao criar {output_file}: {result.stderr}")
-    # Method 2: synthetic, speech-like audio
-    for i, text in enumerate(texts, start=1):
-        filename = os.path.join(output_dir, f"speech_sim_{i}.wav")
-        _create_speech_like_audio(text, filename)
-        print(f"🔊 Criado áudio simulado: {filename}")
-        print(f"   Baseado em: '{text}'")
-if __name__ == "__main__":
-    create_speech_sample()

create_test_audio.py DELETED Viewed

@@ -1,94 +0,0 @@
-#!/usr/bin/env python3
-"""
-Cria um arquivo WAV de teste simples
-"""
-import numpy as np
-import wave
-import struct
-def create_silence_wav(filename, duration=1.0, sample_rate=16000):
-    """Write a mono 16-bit WAV of near-silence (Gaussian noise with sigma 0.001)."""
-    frame_count = int(duration * sample_rate)
-    # Very low-level noise rather than exact zeros, scaled to the int16 range
-    pcm = (np.random.normal(0, 0.001, frame_count) * 32767).astype(np.int16)
-    with wave.open(filename, 'wb') as out:
-        out.setnchannels(1)  # mono
-        out.setsampwidth(2)  # 16-bit
-        out.setframerate(sample_rate)
-        out.writeframes(pcm.tobytes())
-    print(f"✅ Arquivo criado: {filename}")
-def create_tone_wav(filename, frequency=440, duration=1.0, sample_rate=16000):
-    """Write a mono 16-bit WAV containing a sine tone at half of full scale.
-    A linear fade-in/out of up to 50 ms is applied to avoid clicks; for clips
-    shorter than 100 ms each fade is capped at half the clip length.
-    """
-    num_samples = int(sample_rate * duration)
-    # endpoint=False keeps the sample spacing at exactly 1/sample_rate.
-    t = np.linspace(0, duration, num_samples, endpoint=False)
-    samples = np.sin(2 * np.pi * frequency * t)
-    # Cap the fade so the two ramps never exceed or overlap within the clip.
-    fade_samples = min(int(0.05 * sample_rate), num_samples // 2)
-    if fade_samples > 0:
-        envelope = np.ones_like(samples)
-        envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
-        envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
-        samples *= envelope
-    # Convert to int16 at 50% amplitude
-    samples = (samples * 0.5 * 32767).astype(np.int16)
-    with wave.open(filename, 'wb') as wav:
-        wav.setnchannels(1)  # mono
-        wav.setsampwidth(2)  # 16-bit
-        wav.setframerate(sample_rate)
-        wav.writeframes(samples.tobytes())
-    print(f"✅ Arquivo criado: {filename}")
-if __name__ == "__main__":
-    import os
-    # Make sure the output directory exists
-    os.makedirs("test_audios", exist_ok=True)
-    # Basic fixtures: near-silence and two pure tones
-    create_silence_wav("test_audios/silence.wav", duration=1.0)
-    create_tone_wav("test_audios/tone_440hz.wav", frequency=440, duration=0.5)
-    create_tone_wav("test_audios/tone_880hz.wav", frequency=880, duration=0.5)
-    # Fake "speech": Gaussian noise shaped by a slow amplitude envelope
-    time_axis = np.linspace(0, 2, 32000)
-    signal = np.random.normal(0, 0.3, 32000)
-    slow_envelope = np.sin(2 * np.pi * 2 * time_axis) * 0.5 + 0.5
-    signal *= slow_envelope
-    # Add a few simulated formants on top of the noise
-    for formant_hz in [700, 1220, 2600]:
-        signal += np.sin(2 * np.pi * formant_hz * time_axis) * 0.1 * slow_envelope
-    # Clip, scale to 70% of full scale and convert to 16-bit PCM
-    signal = np.clip(signal, -1, 1)
-    pcm = (signal * 0.7 * 32767).astype(np.int16)
-    with wave.open("test_audios/fake_speech.wav", 'wb') as out:
-        out.setnchannels(1)
-        out.setsampwidth(2)
-        out.setframerate(16000)
-        out.writeframes(pcm.tobytes())
-    print("✅ Arquivo criado: test_audios/fake_speech.wav")
-    print("\n📁 Arquivos de teste criados:")
-    for name in os.listdir("test_audios"):
-        if not name.endswith(".wav"):
-            continue
-        size = os.path.getsize(f"test_audios/{name}")
-        print(f"   • {name} ({size} bytes)")

docs/A1_VOCABULARY_CONTROL_TECHNIQUES.md DELETED Viewed

@@ -1,587 +0,0 @@
-# 📚 Técnicas de Controle de Vocabulário A1 para LLMs
-## Sumário
-- [Introdução](#introdução)
-- [O Desafio do Vocabulário A1](#o-desafio-do-vocabulário-a1)
-- [Técnicas de Controle](#técnicas-de-controle)
-  - [1. Controle por Token IDs](#1-controle-por-token-ids-whitelistblacklist)
-  - [2. Decodificação Estruturada](#2-decodificação-estruturada-fsmtrie)
-  - [3. Pós-processamento e Simplificação](#3-pós-processamento-e-simplificação)
-  - [4. Prompt Engineering Avançado](#4-prompt-engineering-avançado)
-  - [5. Fine-tuning Específico](#5-fine-tuning-específico)
-  - [6. Abordagens Híbridas](#6-abordagens-híbridas)
-- [Comparação Técnica](#comparação-técnica)
-- [Implementação com vLLM](#implementação-com-vllm)
-- [Métricas de Validação](#métricas-de-validação)
-- [Referências Acadêmicas](#referências-acadêmicas)
-## Introdução
-Este documento apresenta técnicas pesquisadas e validadas para controlar o vocabulário de Large Language Models (LLMs) ao nível A1 do CEFR (Common European Framework of Reference for Languages). O objetivo é garantir que as respostas geradas sejam apropriadas para iniciantes no aprendizado de idiomas.
-## O Desafio do Vocabulário A1
-### Definição do Nível A1
-- **Vocabulário**: 400-750 palavras mais comuns
-- **Estruturas**: Presente simples, frases curtas (5-10 palavras)
-- **Complexidade**: Textos muito simples, vocabulário cotidiano básico
-### Problemas Identificados
-1. **Token Misalignment**: Tokens de LLMs não correspondem diretamente a palavras
-2. **Drift Natural**: LLMs tendem a usar vocabulário mais complexo mesmo quando instruídos
-3. **Trade-off Performance**: Restrições muito rígidas podem prejudicar a coerência
-4. **Ambiguidade CEFR**: Difícil distinguir entre A1 e A2 computacionalmente
-## Técnicas de Controle
-### 1. Controle por Token IDs (Whitelist/Blacklist)
-#### Como Funciona
-Restringe diretamente quais tokens o modelo pode gerar através de manipulação de logits.
-```python
-# Exemplo conceitual com vLLM
-from vllm import SamplingParams
-# Lista de token IDs permitidos (vocabulário A1)
-a1_token_ids = [...]  # IDs dos tokens que formam palavras A1
-sampling_params = SamplingParams(
-    allowed_token_ids=a1_token_ids,  # Whitelist
-    max_tokens=20,
-    temperature=0.0
-)
-```
-#### Técnicas Disponíveis
-- **`allowed_token_ids`**: Lista branca de tokens permitidos (vLLM nativo)
-- **`logit_bias`**: Penaliza ou favorece tokens específicos
-- **Logit Masking**: Zera probabilidade de tokens não permitidos
-#### Vantagens
-- ✅ Controle absoluto sobre output
-- ✅ Suporte nativo no vLLM
-- ✅ Latência mínima (+5ms)
-- ✅ Garantia de vocabulário controlado
-#### Desvantagens
-- ❌ Tokens ≠ palavras (problema com subpalavras)
-- ❌ Pode gerar texto truncado ou mal-formado
-- ❌ Complexo mapear vocabulário A1 → token IDs
-- ❌ Risco de "travamento" do modelo
-### 2. Decodificação Estruturada (FSM/Trie)
-#### Como Funciona
-Usa estruturas de dados para guiar a geração palavra por palavra.
-#### FSM (Finite State Machine)
-```python
-# Exemplo conceitual
-class A1_FSM:
-    def __init__(self, vocabulary):
-        self.states = self.build_fsm(vocabulary)
-    def get_valid_tokens(self, current_state):
-        """Retorna tokens válidos para o estado atual"""
-        return self.states[current_state].valid_transitions
-```
-#### Trie (Árvore de Prefixos)
-```python
-# Estrutura de dados para palavras A1
-class A1_Trie:
-    def __init__(self):
-        self.root = TrieNode()
-        self.load_a1_vocabulary()
-    def get_valid_continuations(self, prefix):
-        """Retorna continuações válidas para o prefixo"""
-        # Navega na árvore e retorna palavras possíveis
-```
-#### Constrained Beam Search
-- Mantém múltiplos caminhos de decodificação
-- Valida cada caminho contra vocabulário A1
-- Escolhe melhor caminho válido
-#### Vantagens
-- ✅ Funciona no nível de palavras completas
-- ✅ Garante output gramaticalmente válido
-- ✅ Pode incluir regras sintáticas
-- ✅ 99% de garantia de conformidade A1
-#### Desvantagens
-- ❌ Complexidade alta de implementação
-- ❌ Overhead de 20-50ms na latência
-- ❌ Requer bibliotecas externas (Outlines, Guidance)
-- ❌ Pode reduzir naturalidade
-#### Performance Otimizada (Pesquisas Recentes)
-- **FSM Comprimido**: 2x mais rápido que implementações antigas
-- **Jump-forward Decoding**: Pula sequências determinísticas
-- **Trie com Cache Compartilhado**: 70% economia de memória
-### 3. Pós-processamento e Simplificação
-#### Como Funciona
-Permite geração normal, depois simplifica o texto.
-```python
-def simplify_to_a1(text):
-    """
-    Simplifica texto para nível A1
-    """
-    # 1. Substituição de sinônimos
-    text = replace_complex_words(text)
-    # 2. Quebra de frases longas
-    text = split_long_sentences(text)
-    # 3. Remoção de estruturas complexas
-    text = remove_subordinate_clauses(text)
-    return text
-```
-#### Técnicas de Simplificação
-##### Substituição de Sinônimos
-```python
-a1_synonyms = {
-    "inteligência": "mente",
-    "artificial": "falso",
-    "disponível": "livre",
-    "compreender": "entender",
-    "utilizar": "usar",
-    # ... centenas de mapeamentos
-}
-```
-##### Reestruturação Sintática
-- Converte voz passiva → ativa
-- Remove orações subordinadas
-- Simplifica tempos verbais para presente
-##### Segmentação de Frases
-- Quebra frases com mais de 10 palavras
-- Remove conectivos complexos
-- Mantém apenas sujeito-verbo-objeto
-#### Vantagens
-- ✅ Mantém fluência natural
-- ✅ Implementação simples
-- ✅ Baixo overhead (+10ms)
-- ✅ Não afeta processo de geração
-#### Desvantagens
-- ❌ Não garante 100% vocabulário A1
-- ❌ Pode alterar significado original
-- ❌ Requer dicionário extenso e confiável
-- ❌ Difícil validar correção semântica
-### 4. Prompt Engineering Avançado
-#### Como Funciona
-Usa técnicas sofisticadas de prompt para guiar o modelo.
-#### Plan-Driven Simplification
-```python
-prompt_plan = """
-Primeiro, crie um plano para responder em nível A1:
-1. Identifique conceitos principais
-2. Liste palavras simples para usar
-3. Planeje frases curtas
-Agora execute o plano:
-"""
-```
-#### Summary-Guided Generation
-```python
-prompt_summary = """
-Resumo A1: [gerar resumo simples primeiro]
-Agora expanda o resumo usando apenas:
-- Palavras do resumo
-- Presente simples
-- Frases de 5-10 palavras
-"""
-```
-#### Few-Shot com Exemplos A1
-```python
-prompt_fewshot = """
-Exemplos de respostas A1:
-P: Onde você mora?
-R: Eu moro em São Paulo.
-P: O que você come?
-R: Eu como pão e café.
-Agora responda similarmente:
-P: {pergunta}
-R:
-"""
-```
-#### Chain-of-Thought para Simplicidade
-```python
-prompt_cot = """
-Pense passo a passo para responder simplesmente:
-1. Qual é a resposta básica?
-2. Quais palavras simples usar?
-3. Como fazer uma frase curta?
-Resposta A1:
-"""
-```
-#### Vantagens
-- ✅ Zero mudança no código
-- ✅ Muito flexível e adaptável
-- ✅ Funciona com qualquer modelo
-- ✅ Preserva naturalidade
-#### Desvantagens
-- ❌ Não é 100% confiável
-- ❌ Resultados variam entre execuções
-- ❌ Depende da capacidade do modelo
-- ❌ Difícil garantir consistência
-### 5. Fine-tuning Específico
-#### Como Funciona
-Re-treina o modelo com corpus exclusivamente A1.
-#### Abordagens de Treinamento
-##### Supervised Fine-Tuning (SFT)
-```python
-# Dataset de treinamento A1
-training_data = [
-    {"input": "Como você está?", "output": "Eu estou bem."},
-    {"input": "Onde mora?", "output": "Moro no Brasil."},
-    # ... milhares de exemplos A1
-]
-```
-##### Reinforcement Learning (RL)
-- **Reward**: Pontuação por simplicidade
-- **Penalty**: Punição por vocabulário complexo
-- **Métodos**: PPO, DPO, RLHF
-##### LoRA/QLoRA (Efficient Fine-tuning)
-```python
-# Ajuste eficiente sem re-treinar todo o modelo
-lora_config = {
-    "r": 8,  # rank
-    "lora_alpha": 16,
-    "target_modules": ["q_proj", "v_proj"],
-    "lora_dropout": 0.1
-}
-```
-#### Modelo CaLM (State-of-the-Art)
-Pesquisa recente desenvolveu o modelo CaLM (CEFR-Aligned Language Model) que:
-- Superou GPT-4 em geração CEFR-aligned
-- Usa combinação de SFT + RL
-- Dataset com 10k+ exemplos por nível
-#### Vantagens
-- ✅ Resultado mais natural
-- ✅ Zero overhead na inferência
-- ✅ Modelo "aprende" o nível A1
-- ✅ Consistência alta entre gerações
-#### Desvantagens
-- ❌ Requer dataset grande (10k+ exemplos)
-- ❌ Caro computacionalmente (GPU + tempo)
-- ❌ Pode degradar outras capacidades
-- ❌ Precisa re-treinar para cada idioma
-### 6. Abordagens Híbridas
-#### Como Funciona
-Combina múltiplas técnicas para maximizar benefícios.
-#### Exemplo 1: Soft + Hard Constraints
-```python
-def generate_a1_hybrid(prompt):
-    # Tenta primeiro com constraints soft
-    response = generate_with_soft_constraints(
-        prompt,
-        temperature=0.3,
-        top_p=0.4
-    )
-    # Valida resultado
-    if not is_a1_compliant(response):
-        # Fallback para constraints hard
-        response = generate_with_token_whitelist(
-            prompt,
-            allowed_tokens=a1_tokens
-        )
-    return response
-```
-#### Exemplo 2: Generate-then-Simplify
-```python
-def generate_and_simplify(prompt):
-    # 1. Geração normal para coerência
-    response = generate_normal(prompt)
-    # 2. Simplificação com outro prompt
-    simplified = simplify_with_llm(response, target="A1")
-    # 3. Validação final
-    if validate_a1(simplified):
-        return simplified
-    else:
-        return fallback_template_response()
-```
-#### Exemplo 3: Multi-Stage Pipeline
-```python
-def multi_stage_a1_generation(prompt):
-    # Stage 1: Planning
-    plan = generate_a1_plan(prompt)
-    # Stage 2: Vocabulary Selection
-    vocab = select_a1_vocabulary(plan)
-    # Stage 3: Constrained Generation
-    response = generate_with_vocabulary(plan, vocab)
-    # Stage 4: Post-processing
-    final = post_process_a1(response)
-    return final
-```
-#### Vantagens
-- ✅ Combina benefícios de múltiplas técnicas
-- ✅ Fallbacks para maior confiabilidade
-- ✅ Flexibilidade para ajustes
-- ✅ Melhor trade-off naturalidade/controle
-#### Desvantagens
-- ❌ Complexidade de implementação
-- ❌ Múltiplos pontos de falha
-- ❌ Latência acumulativa
-- ❌ Difícil debugar
-## Comparação Técnica
-### Tabela Comparativa
-| Método | Complexidade | Garantia A1 | Latência Extra | Naturalidade | Memória |
-|--------|-------------|-------------|----------------|--------------|---------|
-| **Token Control** | Média | 95% | +5ms | Baixa | Baixa |
-| **FSM/Trie** | Alta | 99% | +20-50ms | Média | Média |
-| **Pós-processo** | Baixa | 70% | +10ms | Alta | Baixa |
-| **Prompt Eng.** | Baixa | 60% | 0ms | Muito Alta | Baixa |
-| **Fine-tuning** | Muito Alta | 85% | 0ms | Muito Alta | Alta |
-| **Híbrida** | Alta | 90% | +15-30ms | Alta | Média |
-### Critérios de Escolha
-#### Use Token Control quando:
-- Precisa garantia máxima de vocabulário
-- Latência é crítica
-- Tem mapeamento tokens-palavras pronto
-#### Use FSM/Trie quando:
-- Precisa garantia de estrutura gramatical
-- Tem biblioteca de suporte (Outlines, etc.)
-- Aceita overhead de latência
-#### Use Pós-processamento quando:
-- Quer manter naturalidade
-- Tem bom dicionário de sinônimos
-- Aceita não ter 100% garantia
-#### Use Prompt Engineering quando:
-- Não pode modificar código
-- Precisa flexibilidade
-- Modelo é suficientemente capaz
-#### Use Fine-tuning quando:
-- Tem recursos computacionais
-- Precisa solução permanente
-- Tem dataset de qualidade
-## Implementação com vLLM
-### Configuração Básica
-```python
-from vllm import LLM, SamplingParams
-# Model initialization
-model = LLM(
-    model="path/to/model",
-    dtype="float16",
-    gpu_memory_utilization=0.90
-)
-# Sampling parameters for A1-level output
-sampling_params_a1 = SamplingParams(
-    max_tokens=20,           # Short answers
-    temperature=0.3,         # Low creativity
-    top_p=0.4,              # Restricted vocabulary
-    top_k=100,              # Limits the candidate options
-    repetition_penalty=1.1,  # Discourages repetition
-    stop=[".", "!", "?"],   # Stop at sentence-ending punctuation
-    # For a token whitelist:
-    # allowed_token_ids=[...]
-)
-```
-### Implementação de Logit Processor Customizado
-```python
-from typing import List
-import torch
-class A1VocabularyProcessor:
-    def __init__(self, a1_token_ids: List[int]):
-        self.a1_tokens = set(a1_token_ids)
-        self.penalty = -1000.0  # Penalidade forte
-    def __call__(self, input_ids: torch.Tensor,
-                 scores: torch.Tensor) -> torch.Tensor:
-        """
-        Aplica penalidade a tokens não-A1
-        """
-        for token_id in range(scores.shape[-1]):
-            if token_id not in self.a1_tokens:
-                scores[:, token_id] += self.penalty
-        return scores
-# Usage with vLLM
-a1_processor = A1VocabularyProcessor(a1_token_ids)
-sampling_params = SamplingParams(
-    logits_processors=[a1_processor]
-)
-```
-## Métricas de Validação
-### Métricas Implementadas
-#### 1. Flesch Reading Ease (Português)
-```python
-def flesch_portuguese(text):
-    """
-    Score > 80 = A1/A2
-    Score 60-80 = B1/B2
-    Score < 60 = C1/C2
-    """
-    return 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)
-```
-#### 2. Common Words Ratio
-```python
-def common_words_ratio(text):
-    """
-    Percentual de palavras no vocabulário A1
-    Target: >= 70%
-    """
-    words = text.split()
-    common = sum(1 for w in words if w in A1_VOCAB)
-    return (common / len(words)) * 100
-```
-#### 3. Sentence Length Check
-```python
-def check_sentence_length(text):
-    """
-    A1: 5-10 palavras por frase
-    """
-    sentences = text.split('.')
-    lengths = [len(s.split()) for s in sentences]
-    return all(5 <= l <= 10 for l in lengths)
-```
-#### 4. Brunet Index
-```python
-def brunet_index(text):
-    """
-    Complexidade lexical
-    Target A1: < 15
-    """
-    N = total_words
-    V = unique_words
-    return N ** (V ** -0.165)
-```
-## Referências Acadêmicas
-### Papers Principais
-1. **"From Tarzan to Tolkien: Controlling the Language Proficiency Level of LLMs for Content Generation"** (2024)
-   - Introduz modelo CaLM para controle CEFR
-   - Benchmark de técnicas de simplificação
-   - [arXiv:2406.03030](https://arxiv.org/html/2406.03030v1)
-2. **"Alignment Drift in CEFR-prompted LLMs for Interactive Spanish Tutoring"** (2025)
-   - Analisa drift em prompts CEFR
-   - Propõe re-prompting periódico
-   - [arXiv:2505.08351](https://arxiv.org/html/2505.08351v1)
-3. **"Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation"** (2024)
-   - Técnicas de constrained decoding
-   - Comparação FSM vs Trie
-   - [arXiv:2403.06988](https://arxiv.org/html/2403.06988v1)
-4. **"Fast JSON Decoding for Local LLMs with Compressed Finite State Machine"** (2024)
-   - Otimização de FSM para decodificação
-   - Jump-forward decoding
-   - [LMSYS Blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)
-### Bibliotecas e Ferramentas
-1. **Outlines** - Structured generation com FSM
-   - GitHub: `outlines-dev/outlines`
-   - Suporte para regex, JSON schema, CFG
-2. **Guidance** - Constrained generation framework
-   - GitHub: `guidance-ai/guidance`
-   - Template-based generation
-3. **XGrammar** - Optimized grammar-based decoding
-   - Integrado no vLLM v0.5+
-   - Pushdown automaton para performance
-4. **llama.cpp** - Grammar support nativo
-   - GBNF (GGML BNF)
-   - Implementação em C++ eficiente
-## Conclusões e Recomendações
-### Para Produção Imediata
-1. **Comece com Prompt Engineering** - baixo custo, rápido de testar
-2. **Adicione validação com métricas** - garante qualidade mínima
-3. **Implemente fallback híbrido** - regenera se falhar validação
-### Para Melhor Qualidade
-1. **Considere pós-processamento** - mantém naturalidade
-2. **Teste FSM/Trie para casos críticos** - máxima garantia
-3. **Avalie fine-tuning se tiver recursos** - solução definitiva
-### Trade-offs Principais
-- **Controle vs Naturalidade**: Mais controle = menos natural
-- **Garantia vs Latência**: Mais garantia = mais lento
-- **Simplicidade vs Eficácia**: Soluções simples são menos eficazes
-### Próximos Passos
-1. Criar vocabulário A1 português (500-750 palavras)
-2. Mapear vocabulário para token IDs do modelo
-3. Implementar validação com métricas múltiplas
-4. Testar abordagem híbrida progressiva
-5. Coletar feedback de usuários reais
----
-*Documento criado com base em pesquisa acadêmica e testes práticos com modelos LLM para geração de conteúdo em nível A1 CEFR.*

download_llama_omni2.py DELETED Viewed

@@ -1,25 +0,0 @@
-#!/usr/bin/env python3
-"""
-Download do modelo LLaMA-Omni2 1.5B oficial com Speech Projector treinado
-"""
-from huggingface_hub import snapshot_download
-import os
-print("🚀 Baixando LLaMA-Omni2 1.5B oficial (com Speech Projector treinado)...")
-print("   Modelo: ICTNLP/Llama-Omni2-1.5B")
-print("   Tamanho: ~6GB")
-model_dir = snapshot_download(
-    repo_id="ICTNLP/Llama-Omni2-1.5B",
-    local_dir="/workspace/llama-omni2-troca-llm/models/LLaMA-Omni2-1.5B",
-    resume_download=True
-)
-print(f"✅ Download completo: {model_dir}")
-print("\n📁 Arquivos baixados:")
-for root, dirs, files in os.walk(model_dir):
-    for file in files:
-        if file.endswith(('.bin', '.safetensors', '.json')):
-            file_path = os.path.join(root, file)
-            size_mb = os.path.getsize(file_path) / (1024*1024)
-            print(f"   {file}: {size_mb:.1f}MB")

download_official_model.py DELETED Viewed

@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-"""
-Download do modelo LLaMA-Omni2 1.5B oficial
-"""
-from huggingface_hub import snapshot_download
-print("🚀 Baixando modelo LLaMA-Omni2 1.5B oficial...")
-model_dir = snapshot_download(
-    repo_id="ICTNLP/Llama-Omni2-1.5B",
-    local_dir="/workspace/llama-omni2-troca-llm/models/LLaMA-Omni2-1.5B",
-    resume_download=True
-)
-print(f"✅ Modelo baixado em: {model_dir}")

generate_test_audios.py DELETED Viewed

@@ -1,81 +0,0 @@
-#!/usr/bin/env python3
-"""
-Gerador de Áudios de Teste
-=========================
-Criar arquivos de áudio para cada frase testada
-"""
-import os
-from gtts import gTTS
-def create_test_audios():
-    """Criar todos os áudios de teste"""
-    # Frases testadas
-    test_phrases = [
-        "Olá",
-        "Casa",
-        "Gato",
-        "Bom dia",
-        "Como vai",
-        "Obrigado",
-        "Tudo bem hoje",
-        "Que horas são",
-        "Como você está"
-    ]
-    # Criar diretório
-    audio_dir = "test_audios"
-    os.makedirs(audio_dir, exist_ok=True)
-    print("🎤 Gerando áudios de teste...")
-    print("=" * 40)
-    audio_paths = []
-    for i, phrase in enumerate(test_phrases, 1):
-        # Nome do arquivo
-        filename = f"{i:02d}_{phrase.replace(' ', '_').replace('?', '')}.wav"
-        mp3_path = os.path.join(audio_dir, filename.replace('.wav', '.mp3'))
-        wav_path = os.path.join(audio_dir, filename)
-        print(f"[{i}/9] Gerando: {phrase}")
-        # Criar áudio com gTTS
-        tts = gTTS(text=phrase, lang='pt', slow=False)
-        tts.save(mp3_path)
-        # Converter para WAV
-        os.system(f"ffmpeg -i {mp3_path} -ar 16000 -ac 1 {wav_path} -y -loglevel quiet")
-        os.remove(mp3_path)
-        # Adicionar path absoluto
-        abs_path = os.path.abspath(wav_path)
-        audio_paths.append({
-            "phrase": phrase,
-            "filename": filename,
-            "path": abs_path
-        })
-        print(f"   ✅ Salvo: {wav_path}")
-    print(f"\n✅ {len(audio_paths)} áudios gerados!")
-    # Mostrar todos os paths
-    print("\n📁 PATHS DOS ÁUDIOS:")
-    print("=" * 40)
-    for audio in audio_paths:
-        print(f"🎤 '{audio['phrase']}':")
-        print(f"   📄 Arquivo: {audio['filename']}")
-        print(f"   📍 Path: {audio['path']}")
-        print()
-    return audio_paths
-if __name__ == "__main__":
-    audio_paths = create_test_audios()
-    print("🎯 RESUMO DOS PATHS:")
-    print("=" * 20)
-    for i, audio in enumerate(audio_paths, 1):
-        print(f"{i}. {audio['path']}")

gtts_test_results.json DELETED Viewed

@@ -1,82 +0,0 @@
-[
-  {
-    "question_num": 1,
-    "question": "Qual é a capital do Brasil?",
-    "success": true,
-    "latency_ms": 482.533385977149,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 86862
-  },
-  {
-    "question_num": 2,
-    "question": "Como funciona a fotossíntese nas plantas?",
-    "success": true,
-    "latency_ms": 530.3681651130319,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 122958
-  },
-  {
-    "question_num": 3,
-    "question": "Quem foi Dom Pedro Segundo?",
-    "success": true,
-    "latency_ms": 493.9775848761201,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 84558
-  },
-  {
-    "question_num": 4,
-    "question": "O que é inteligência artificial?",
-    "success": true,
-    "latency_ms": 485.67815124988556,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 100686
-  },
-  {
-    "question_num": 5,
-    "question": "Explique o que são energias renováveis",
-    "success": true,
-    "latency_ms": 455.7227217592299,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 119886
-  },
-  {
-    "question_num": 6,
-    "question": "Qual é a diferença entre vírus e bactéria?",
-    "success": true,
-    "latency_ms": 490.5252889730036,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 124494
-  },
-  {
-    "question_num": 7,
-    "question": "Como funciona o sistema solar?",
-    "success": true,
-    "latency_ms": 496.38519808650017,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 96078
-  },
-  {
-    "question_num": 8,
-    "question": "O que é machine learning?",
-    "success": true,
-    "latency_ms": 460.00722935423255,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 71502
-  },
-  {
-    "question_num": 9,
-    "question": "Explique a teoria da relatividade de Einstein",
-    "success": true,
-    "latency_ms": 474.25004141405225,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 130638
-  },
-  {
-    "question_num": 10,
-    "question": "Qual é a importância da Amazônia para o clima?",
-    "success": true,
-    "latency_ms": 458.1514848396182,
-    "response": "Olá! Como posso ajudar você hoje?",
-    "audio_size_bytes": 126030
-  }
-]

installed_packages.txt DELETED Viewed

@@ -1,246 +0,0 @@
-accelerate==0.33.0
-aiofiles==23.2.1
-aiohappyeyeballs @ file:///home/conda/feedstock_root/build_artifacts/aiohappyeyeballs_1741775197943/work
-aiohttp @ file:///home/conda/feedstock_root/build_artifacts/aiohttp_1753804962438/work
-aiohttp-cors @ file:///home/conda/feedstock_root/build_artifacts/aiohttp-cors_1754034655438/work
-aioice==0.10.1
-aiortc==1.13.0
-aiosignal @ file:///home/conda/feedstock_root/build_artifacts/aiosignal_1751626463503/work
-annotated-types @ file:///home/conda/feedstock_root/build_artifacts/annotated-types_1733247046149/work
-antlr4-python3-runtime==4.9.3
-anyio @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_anyio_1754315087/work
-astor==0.8.1
-async-timeout @ file:///home/conda/feedstock_root/build_artifacts/async-timeout_1733235340728/work
-attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1741918516150/work
-audioread @ file:///home/conda/feedstock_root/build_artifacts/audioread_1725357455116/work
-av==14.4.0
-beautifulsoup4==4.13.5
-bitsandbytes==0.45.0
-blake3==1.0.5
-Brotli==1.1.0
-brotlicffi @ file:///croot/brotlicffi_1736182461069/work
-cachetools==6.2.0
-cbor2==5.7.0
-certifi @ file:///croot/certifi_1754570635119/work/certifi
-cffi @ file:///croot/cffi_1736182485317/work
-charset-normalizer @ file:///croot/charset-normalizer_1721748349566/work
-click @ file:///home/conda/feedstock_root/build_artifacts/click_1747811314515/work
-cloudpickle==3.1.1
-colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1733218098505/work
-coloredlogs==15.0.1
-compressed-tensors==0.10.2
-conformer==0.3.2
-contourpy @ file:///home/conda/feedstock_root/build_artifacts/contourpy_1744743067588/work
-cryptography==45.0.6
-cupy-cuda12x==13.6.0
-cycler @ file:///home/conda/feedstock_root/build_artifacts/cycler_1733332471406/work
-Cython==3.1.3
-datasets==2.18.0
-decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
-depyf==0.19.0
-diffusers==0.27.2
-dill==0.3.8
-diskcache==5.6.3
-distro==1.9.0
-dnspython @ file:///home/conda/feedstock_root/build_artifacts/dnspython_1733256735222/work
-einops==0.6.1
-einops-exts==0.0.4
-email_validator @ file:///home/conda/feedstock_root/build_artifacts/email-validator-meta_1733300719943/work
-exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1746947292760/work
-fastapi==0.115.11
-fastapi-cli @ file:///home/conda/feedstock_root/build_artifacts/fastapi-cli_1751972066250/work
-fastapi-cloud-cli==0.1.5
-fastrlock==0.8.3
-ffmpy==0.6.1
-filelock @ file:///croot/filelock_1744281381737/work
-flatbuffers==25.2.10
-fonttools @ file:///home/conda/feedstock_root/build_artifacts/fonttools_1755223865520/work
-frozenlist @ file:///home/conda/feedstock_root/build_artifacts/frozenlist_1752167142843/work
-fsspec==2024.2.0
-gdown==5.1.0
-gguf==0.17.1
-gmpy2 @ file:///croot/gmpy2_1738085463648/work
-google-crc32c==1.7.1
-gradio==5.3.0
-gradio_client==1.4.2
-groovy==0.1.2
-grpcio @ file:///home/conda/feedstock_root/build_artifacts/grpc-split_1754634529307/work
-grpcio-tools @ file:///home/conda/feedstock_root/build_artifacts/grpc-split_1754634529307/work/tools/distrib/python/grpcio_tools
-h11 @ file:///home/conda/feedstock_root/build_artifacts/h11_1745526374115/work
-h2 @ file:///home/conda/feedstock_root/build_artifacts/h2_1738578511449/work
-hf-xet @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_hf-xet_1755587744/work
-hpack @ file:///home/conda/feedstock_root/build_artifacts/hpack_1737618293087/work
-httpcore @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_httpcore_1745602916/work
-httptools @ file:///home/conda/feedstock_root/build_artifacts/httptools_1732707649090/work
-httpx @ file:///home/conda/feedstock_root/build_artifacts/httpx_1733663348460/work
-huggingface-hub==0.34.4
-humanfriendly==10.0
-hydra-core==1.3.2
-hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1737618333194/work
-HyperPyYAML==1.2.2
-idna @ file:///croot/idna_1714398848350/work
-ifaddr==0.2.0
-importlib_metadata @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_importlib-metadata_1747934053/work
-importlib_resources==6.5.2
-inflect==7.3.1
-interegular==0.3.3
-Jinja2 @ file:///croot/jinja2_1741710844255/work
-jiter==0.10.0
-joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1748019130050/work
-jsonschema==4.25.1
-jsonschema-specifications==2025.4.1
-kiwisolver @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_kiwisolver_1754889395/work
-lark==1.2.2
-latex2mathml==3.78.0
-lazy_loader @ file:///home/conda/feedstock_root/build_artifacts/lazy-loader_1733636780672/work
-librosa==0.10.2
-lightning==2.2.4
-lightning-utilities==0.15.2
--e git+https://huggingface.co/marcosremar2/llama-omni2-official-code@d76469fe3fb92c109fb6f3deb08415ee07079042#egg=llama_omni2
-llguidance==0.7.30
-llvmlite==0.44.0
-lm-format-enforcer==0.10.12
-markdown-it-py @ file:///home/conda/feedstock_root/build_artifacts/markdown-it-py_1754951200865/work
-markdown2==2.5.2
-MarkupSafe==2.1.5
-matplotlib==3.10.1
-mdurl @ file:///home/conda/feedstock_root/build_artifacts/mdurl_1733255585584/work
-mistral_common==1.8.4
-mkl-fft==1.3.1
-mkl-random @ file:///home/builder/ci_310/mkl_random_1641843545607/work
-mkl-service==2.4.0
-more-itertools==10.7.0
-mpmath @ file:///croot/mpmath_1690848262763/work
-msgpack @ file:///home/conda/feedstock_root/build_artifacts/msgpack-python_1749813185825/work
-msgspec==0.19.0
-multidict @ file:///home/conda/feedstock_root/build_artifacts/multidict_1751310558090/work
-multiprocess==0.70.16
-munkres==1.1.4
-networkx @ file:///croot/networkx_1737039604450/work
-ninja==1.13.0
-numba @ file:///home/conda/feedstock_root/build_artifacts/numba_1749491273169/work
-numpy==2.2.6
-nvidia-cublas-cu12==12.6.4.1
-nvidia-cuda-cupti-cu12==12.6.80
-nvidia-cuda-nvrtc-cu12==12.6.77
-nvidia-cuda-runtime-cu12==12.6.77
-nvidia-cudnn-cu12==9.5.1.17
-nvidia-cufft-cu12==11.3.0.4
-nvidia-cufile-cu12==1.11.1.6
-nvidia-curand-cu12==10.3.7.77
-nvidia-cusolver-cu12==11.7.1.2
-nvidia-cusparse-cu12==12.5.4.2
-nvidia-cusparselt-cu12==0.6.3
-nvidia-nccl-cu12==2.26.2
-nvidia-nvjitlink-cu12==12.6.85
-nvidia-nvtx-cu12==12.6.77
-omegaconf==2.3.0
-onnx==1.16.0
-onnxruntime-gpu==1.18.0
-openai==1.101.0
-openai-harmony==0.0.4
-openai-whisper==20231117
-opencv-python-headless==4.12.0.88
-orjson==3.11.2
-outlines_core==0.2.10
-packaging==25.0
-pandas==2.3.2
-partial-json-parser==0.2.1.1.post6
-peft==0.14.0
-pillow==10.4.0
-platformdirs @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_platformdirs_1746710438/work
-pooch @ file:///home/conda/feedstock_root/build_artifacts/pooch_1754941678315/work
-prometheus-fastapi-instrumentator==7.1.0
-prometheus_client==0.22.1
-propcache @ file:///home/conda/feedstock_root/build_artifacts/propcache_1744524934684/work
-protobuf @ file:///home/conda/feedstock_root/build_artifacts/protobuf_1751668305329/work/bazel-bin/python/dist/protobuf-6.31.1-cp310-abi3-linux_x86_64.whl#sha256=defdfb61601fe45e5f630c511e9db761b2d454cda73e7e10af9b4112431a5435
-psutil==7.0.0
-py-cpuinfo==9.0.0
-pyarrow==21.0.0
-pyarrow-hotfix==0.7
-pybase64==1.4.2
-pycountry==24.6.1
-pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
-pydantic==2.11.7
-pydantic-extra-types==2.10.5
-pydantic_core==2.33.2
-pydub==0.25.1
-pyee==13.0.0
-Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1750615794071/work
-pylibsrtp==0.12.0
-pynini==2.1.5
-pyOpenSSL==25.1.0
-pyparsing @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pyparsing_1753873557/work
-PySocks @ file:///home/builder/ci_310/pysocks_1640793678128/work
-python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_python-dateutil_1751104122/work
-python-dotenv @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_python-dotenv_1750789290/work
-python-json-logger==3.3.0
-python-multipart @ file:///home/conda/feedstock_root/build_artifacts/python-multipart_1734420773152/work
-pytorch-lightning==2.5.3
-pytz==2025.2
-PyYAML @ file:///croot/pyyaml_1728657952215/work
-pyzmq==27.0.2
-ray==2.48.0
-referencing==0.36.2
-regex==2025.7.34
-requests==2.32.3
-rich @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_rich_1753436991/work/dist
-rich-toolkit @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_rich-toolkit_1755001296/work
-rignore==0.6.4
-rpds-py==0.27.0
-ruamel.yaml==0.18.15
-ruamel.yaml.clib==0.2.12
-ruff==0.12.10
-safehttpx==0.1.6
-safetensors==0.6.2
-scikit-learn==1.2.2
-scipy @ file:///home/conda/feedstock_root/build_artifacts/scipy-split_1739790642651/work/dist/scipy-1.15.2-cp310-cp310-linux_x86_64.whl#sha256=9e52bad6c3294d1a5b04a13632118ca2157130603c6c018c2d710162b223b27e
-semantic-version==2.10.0
-sentencepiece==0.1.99
-sentry-sdk==2.35.1
-setproctitle==1.3.6
-shellingham @ file:///home/conda/feedstock_root/build_artifacts/shellingham_1733300899265/work
-shortuuid==1.0.13
-six @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_six_1753199211/work
-sniffio @ file:///home/conda/feedstock_root/build_artifacts/sniffio_1733244044561/work
-soundfile @ file:///home/conda/feedstock_root/build_artifacts/pysoundfile_1737836266465/work
-soupsieve==2.7
-soxr @ file:///home/conda/feedstock_root/build_artifacts/soxr-python_1725345517888/work
-standard-aifc @ file:///home/conda/feedstock_root/build_artifacts/standard-aifc_1751926219849/work
-standard-sunau @ file:///home/conda/feedstock_root/build_artifacts/standard-sunau_1751926339586/work
-starlette==0.46.2
-svgwrite==1.4.3
-sympy==1.14.0
-threadpoolctl @ file:///home/conda/feedstock_root/build_artifacts/threadpoolctl_1741878222898/work
-tiktoken==0.11.0
-timm==0.6.13
-tokenizers==0.21.4
-tomlkit==0.12.0
-torch==2.7.1
-torchaudio==2.7.1
-torchmetrics==1.8.1
-torchvision==0.22.1
-tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1735661334605/work
-transformers==4.55.4
-triton==3.3.1
-typeguard==4.4.4
-typer==0.16.1
-typer-slim==0.16.1
-typing-inspection @ file:///home/conda/feedstock_root/build_artifacts/typing-inspection_1747870647094/work
-typing_extensions==4.15.0
-tzdata==2025.2
-unicodedata2 @ file:///home/conda/feedstock_root/build_artifacts/unicodedata2_1736692496989/work
-urllib3 @ file:///croot/urllib3_1750775463400/work
-uvicorn==0.30.0
-uvloop @ file:///home/conda/feedstock_root/build_artifacts/uvloop_1730214334932/work
-vllm==0.10.1.1
-watchfiles @ file:///home/conda/feedstock_root/build_artifacts/watchfiles_1750053828522/work
-wavedrom==2.0.3.post3
-websockets==12.0
-WeTextProcessing==1.0.3
-wget==3.2
-xformers==0.0.31
-xgrammar==0.1.21
-xxhash==3.5.0
-yarl @ file:///home/conda/feedstock_root/build_artifacts/yarl_1749554822108/work
-zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1749421620841/work

llama_omni2_integration/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- # LLaMA-Omni2 Integration Package

llama_omni2_integration/constants.py DELETED Viewed

@@ -1,9 +0,0 @@
-# NOTE(review): heartbeat settings; the units are presumably seconds — confirm
-# against the controller/worker code that reads them.
-CONTROLLER_HEART_BEAT_EXPIRATION = 30
-WORKER_HEART_BEAT_INTERVAL = 15
-LOGDIR = "."
-# Model Constants
-# Label value written at positions that carry no training target (speech
-# features and padding); equals the default ignore_index of
-# torch.nn.CrossEntropyLoss.
-IGNORE_INDEX = -100
-# Sentinel id looked up in input_ids to find where speech features are spliced in.
-SPEECH_TOKEN_INDEX = -200
-DEFAULT_SPEECH_TOKEN = "<speech>"

llama_omni2_integration/omni2_speech_arch.py DELETED Viewed

@@ -1,201 +0,0 @@
-#    Copyright 2023 Haotian Liu
-#    Copyright 2024 Qingkai Fang
-#
-#    This project is modified based on LLaVA by Haotian Liu, Qingkai Fang adds further supports for speech-to-text/speech tasks.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License");
-#    you may not use this file except in compliance with the License.
-#    You may obtain a copy of the License at
-#
-#        http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS,
-#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#    See the License for the specific language governing permissions and
-#    limitations under the License.
-from abc import ABC, abstractmethod
-import torch
-from .speech_encoder.builder import build_speech_encoder
-from .speech_projector.builder import build_speech_projector
-from .constants import IGNORE_INDEX, SPEECH_TOKEN_INDEX
-class Omni2SpeechMetaModel:
-    def __init__(self, config):
-        super(Omni2SpeechMetaModel, self).__init__(config)
-        if hasattr(config, "speech_encoder"):
-            self.speech_encoder = build_speech_encoder(config)
-            self.speech_projector = build_speech_projector(config)
-    def get_speech_encoder(self):
-        speech_encoder = getattr(self, "speech_encoder", None)
-        return speech_encoder
-    def get_speech_projector(self):
-        speech_projector = getattr(self, "speech_projector", None)
-        return speech_projector
-    def initialize_speech_modules(self, model_args):
-        self.config.speech_encoder = getattr(model_args, "speech_encoder", None)
-        self.config.speech_encoder_type = getattr(model_args, "speech_encoder_type", None)
-        self.config.speech_projector_type = getattr(model_args, 'speech_projector_type', 'linear')
-        self.config.speech_encoder_ds_rate = getattr(model_args, 'speech_encoder_ds_rate', 5)
-        self.config.speech_encoder_hidden_size = getattr(model_args, 'speech_encoder_hidden_size', 1280)
-        if self.get_speech_encoder() is None:
-            self.speech_encoder = build_speech_encoder(self.config)
-        if self.get_speech_projector() is None:
-            self.speech_projector = build_speech_projector(self.config)
-class Omni2SpeechMetaForCausalLM(ABC):
-    """Mixin for causal-LM classes that splices speech features into the input embeddings.
-    Concrete classes must implement ``get_model``; the speech encoder and
-    projector are obtained from the object it returns.
-    """
-    @abstractmethod
-    def get_model(self):
-        """Return the underlying base model; must be provided by the concrete class."""
-        pass
-    def get_speech_encoder(self):
-        """Return the speech encoder of the base model."""
-        return self.get_model().get_speech_encoder()
-    def get_speech_projector(self):
-        """Return the speech projector of the base model."""
-        return self.get_model().get_speech_projector()
-    def encode_speech(self, speech, speech_lengths):
-        """Encode and project ``speech``; return one feature tensor per batch item.
-        Each returned tensor is cut to that item's adjusted length. Raises
-        ValueError when the configured encoder type does not contain "whisper"
-        or the projector type is not "linear".
-        """
-        speech_encoder_type = self.config.speech_encoder_type
-        speech_encoder = self.get_speech_encoder()
-        if "whisper" in speech_encoder_type.lower():
-            # The last two axes are swapped before encoding.
-            encoder_outs = speech_encoder(speech.permute(0, 2, 1))
-            # Ceil-halve the lengths; presumably mirrors a 2x temporal
-            # downsampling inside the encoder — TODO confirm.
-            speech_lengths = (speech_lengths + 1) // 2
-        else:
-            raise ValueError(f"Unknown speech encoder type: {speech_encoder_type}")
-        speech_projector_type = self.config.speech_projector_type
-        speech_projector = self.get_speech_projector()
-        if speech_projector_type == "linear":
-            encoder_outs = speech_projector(encoder_outs)
-            # Lengths shrink by the projector's ``k`` attribute (presumably its
-            # downsampling factor — verify against the projector).
-            speech_lengths = speech_lengths // speech_projector.k
-        else:
-            raise ValueError(f"Unknown speech projector type: {speech_projector_type}")
-        # Drop the frames beyond each item's valid length.
-        speech_features = [encoder_outs[i, :speech_lengths[i]] for i in range(len(encoder_outs))]
-        return speech_features
-    def prepare_inputs_labels_for_speech_and_text(
-        self, input_ids, position_ids, attention_mask, past_key_values, labels,
-        speech, speech_lengths
-    ):
-        """Replace each SPEECH_TOKEN_INDEX placeholder with its speech features.
-        Returns a 6-tuple ``(input_ids, position_ids, attention_mask,
-        past_key_values, inputs_embeds, labels)``. On the pass-through path the
-        inputs are returned unchanged with ``inputs_embeds`` set to None;
-        otherwise ``input_ids`` is None and ``inputs_embeds`` holds the padded
-        batch of text embeddings interleaved with speech features.
-        """
-        speech_encoder = self.get_speech_encoder()
-        # Pass-through: no encoder, no speech, or a single-token input
-        # (presumably a cached decoding step — TODO confirm).
-        if speech_encoder is None or speech is None or input_ids.shape[1] == 1:
-            return input_ids, position_ids, attention_mask, past_key_values, None, labels
-        speech_features = self.encode_speech(speech, speech_lengths)
-        # Keep the caller's originals so None arguments can be restored at the end.
-        _labels = labels
-        _position_ids = position_ids
-        _attention_mask = attention_mask
-        # Fill in defaults so the code below can treat all three uniformly.
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
-        else:
-            attention_mask = attention_mask.bool()
-        if position_ids is None:
-            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
-        if labels is None:
-            labels = torch.full_like(input_ids, IGNORE_INDEX)
-        # Strip masked-out positions; each batch item becomes its own 1-D tensor.
-        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
-        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
-        new_input_embeds = []
-        new_labels = []
-        # Index of the next unused entry of speech_features, shared across the batch.
-        cur_speech_idx = 0
-        for batch_idx, cur_input_ids in enumerate(input_ids):
-            num_speech = (cur_input_ids == SPEECH_TOKEN_INDEX).sum()
-            # Placeholder positions, bracketed by -1 and the sequence length so
-            # consecutive pairs delimit the text segments between placeholders.
-            speech_token_indices = [-1] + torch.where(cur_input_ids == SPEECH_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
-            cur_input_ids_nospeech = []
-            cur_labels = labels[batch_idx]
-            cur_labels_nospeech = []
-            for i in range(len(speech_token_indices) - 1):
-                cur_input_ids_nospeech.append(cur_input_ids[speech_token_indices[i]+1:speech_token_indices[i+1]])
-                cur_labels_nospeech.append(cur_labels[speech_token_indices[i]+1:speech_token_indices[i+1]])
-            split_sizes = [x.shape[0] for x in cur_labels_nospeech]
-            # Embed all text segments in one call, then split back per segment.
-            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_nospeech))
-            cur_input_embeds_no_speech = torch.split(cur_input_embeds, split_sizes, dim=0)
-            cur_new_input_embeds = []
-            cur_new_labels = []
-            # Interleave: text segment, speech features, text segment, ...
-            for i in range(num_speech + 1):
-                cur_new_input_embeds.append(cur_input_embeds_no_speech[i])
-                cur_new_labels.append(cur_labels_nospeech[i])
-                if i < num_speech:
-                    cur_speech_features = speech_features[cur_speech_idx]
-                    cur_speech_idx += 1
-                    cur_new_input_embeds.append(cur_speech_features)
-                    # Speech positions get IGNORE_INDEX labels.
-                    cur_new_labels.append(torch.full((cur_speech_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
-            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
-            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
-            cur_new_labels = torch.cat(cur_new_labels)
-            new_input_embeds.append(cur_new_input_embeds)
-            new_labels.append(cur_new_labels)
-        # Every encoded speech segment must have been consumed by a placeholder.
-        assert cur_speech_idx == len(speech_features)
-        # Truncate sequences to max length as speech features can make the sequence longer
-        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
-        if tokenizer_model_max_length is not None:
-            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
-            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
-        # Combine them
-        max_len = max(x.shape[0] for x in new_input_embeds)
-        batch_size = len(new_input_embeds)
-        new_input_embeds_padded = []
-        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
-        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
-        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
-        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
-            cur_len = cur_new_embed.shape[0]
-            # Pad with zero embeddings on the side given by config.tokenizer_padding_side
-            # (right when the attribute is absent).
-            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
-                new_input_embeds_padded.append(torch.cat((
-                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
-                    cur_new_embed
-                ), dim=0))
-                if cur_len > 0:
-                    new_labels_padded[i, -cur_len:] = cur_new_labels
-                    attention_mask[i, -cur_len:] = True
-                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
-            else:
-                new_input_embeds_padded.append(torch.cat((
-                    cur_new_embed,
-                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
-                ), dim=0))
-                if cur_len > 0:
-                    new_labels_padded[i, :cur_len] = cur_new_labels
-                    attention_mask[i, :cur_len] = True
-                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
-        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
-        # Arguments that were None on entry are returned as None again.
-        if _labels is None:
-            new_labels = None
-        else:
-            new_labels = new_labels_padded
-        if _attention_mask is None:
-            attention_mask = None
-        else:
-            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
-        if _position_ids is None:
-            position_ids = None
-        # input_ids is None because the embeddings replace it.
-        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

llama_omni2_integration/qwen2_speech_model.py DELETED Viewed

@@ -1,155 +0,0 @@
-"""
-Qwen2.5 Speech Model - Adaptação do LLaMA-Omni2 oficial
-Integra Speech Projector + Whisper Encoder com Qwen2.5 em português
-"""
-from typing import List, Optional, Tuple, Union
-import torch
-import torch.nn as nn
-from transformers import AutoConfig, AutoModelForCausalLM, \
-                         Qwen2Config, Qwen2Model, Qwen2ForCausalLM
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.generation.utils import GenerateOutput
-from .omni2_speech_arch import Omni2SpeechMetaModel, Omni2SpeechMetaForCausalLM
-class Qwen2SpeechConfig(Qwen2Config):
-    """Configuração para Qwen2.5 com Speech"""
-    model_type = "qwen2_speech"
-class Qwen2SpeechModel(Omni2SpeechMetaModel, Qwen2Model):
-    """Modelo base Qwen2.5 com capacidades de Speech"""
-    config_class = Qwen2SpeechConfig
-    def __init__(self, config: Qwen2Config):
-        super(Qwen2SpeechModel, self).__init__(config)
-class Qwen2SpeechForCausalLM(Qwen2ForCausalLM, Omni2SpeechMetaForCausalLM):
-    """
-    Qwen2.5 ForCausalLM com Speech Integration
-    Baseado no LLaMA-Omni2 oficial mas usando Qwen2.5 em português
-    """
-    config_class = Qwen2SpeechConfig
-    def __init__(self, config):
-        super(Qwen2ForCausalLM, self).__init__(config)
-        self.model = Qwen2SpeechModel(config)
-        self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        # Initialize weights and apply final processing
-        self.post_init()
-    def get_model(self):
-        return self.model
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        speech: Optional[torch.FloatTensor] = None,
-        speech_lengths: Optional[torch.LongTensor] = None,
-        return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        if inputs_embeds is None:
-            (
-                input_ids,
-                position_ids,
-                attention_mask,
-                past_key_values,
-                inputs_embeds,
-                labels
-            ) = self.prepare_inputs_labels_for_speech_and_text(
-                input_ids,
-                position_ids,
-                attention_mask,
-                past_key_values,
-                labels,
-                speech,
-                speech_lengths
-            )
-        return super().forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            labels=labels,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict
-        )
-    @torch.no_grad()
-    def generate(
-        self,
-        inputs: Optional[torch.Tensor] = None,
-        speech: Optional[torch.Tensor] = None,
-        speech_lengths: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[GenerateOutput, torch.LongTensor]:
-        position_ids = kwargs.pop("position_ids", None)
-        attention_mask = kwargs.pop("attention_mask", None)
-        if "inputs_embeds" in kwargs:
-            raise NotImplementedError("`inputs_embeds` is not supported")
-        if speech is not None:
-            (
-                inputs,
-                position_ids,
-                attention_mask,
-                _,
-                inputs_embeds,
-                _
-            ) = self.prepare_inputs_labels_for_speech_and_text(
-                inputs,
-                position_ids,
-                attention_mask,
-                None,
-                None,
-                speech,
-                speech_lengths
-            )
-        else:
-            inputs_embeds = self.get_model().embed_tokens(inputs)
-        return super().generate(
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            inputs_embeds=inputs_embeds,
-            **kwargs
-        )
-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
-                                      inputs_embeds=None, **kwargs):
-        speech = kwargs.pop("speech", None)
-        speech_lengths = kwargs.pop("speech_lengths", None)
-        inputs = super().prepare_inputs_for_generation(
-            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
-        )
-        if speech is not None:
-            inputs['speech'] = speech
-            inputs['speech_lengths'] = speech_lengths
-        return inputs
-# Registrar o modelo
-AutoConfig.register("qwen2_speech", Qwen2SpeechConfig)
-AutoModelForCausalLM.register(Qwen2SpeechConfig, Qwen2SpeechForCausalLM)

llama_omni2_integration/qwen2_speech_model_fixed.py DELETED Viewed

@@ -1,294 +0,0 @@
-"""
-Qwen2.5 Speech Model - IMPLEMENTAÇÃO IDÊNTICA AO LLAMA-OMNI2 OFICIAL
-====================================================================
-Corrigido para seguir exatamente a arquitetura oficial
-"""
-from typing import List, Optional, Tuple, Union
-import torch
-import torch.nn as nn
-from transformers import AutoConfig, AutoModelForCausalLM, \
-                         Qwen2Config, Qwen2Model, Qwen2ForCausalLM
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.generation.utils import GenerateOutput
-from .omni2_speech_arch import Omni2SpeechMetaModel, Omni2SpeechMetaForCausalLM
-from .constants import IGNORE_INDEX, SPEECH_TOKEN_INDEX
-class Qwen2SpeechConfig(Qwen2Config):
-    """Configuração para Qwen2.5 com Speech"""
-    model_type = "qwen2_speech_fixed"
-class Qwen2SpeechModel(Omni2SpeechMetaModel, Qwen2Model):
-    """Modelo base Qwen2.5 com capacidades de Speech"""
-    config_class = Qwen2SpeechConfig
-    def __init__(self, config: Qwen2Config):
-        super(Qwen2SpeechModel, self).__init__(config)
-class Qwen2SpeechForCausalLM(Qwen2ForCausalLM, Omni2SpeechMetaForCausalLM):
-    """
-    Qwen2.5 ForCausalLM com Speech Integration
-    IMPLEMENTAÇÃO IDÊNTICA AO LLAMA-OMNI2 OFICIAL
-    """
-    config_class = Qwen2SpeechConfig
-    def __init__(self, config):
-        super(Qwen2ForCausalLM, self).__init__(config)
-        self.model = Qwen2SpeechModel(config)
-        self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        # Initialize weights and apply final processing
-        self.post_init()
-    def get_model(self):
-        return self.model
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        speech: Optional[torch.FloatTensor] = None,
-        speech_lengths: Optional[torch.LongTensor] = None,
-        return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        if inputs_embeds is None:
-            (
-                input_ids,
-                position_ids,
-                attention_mask,
-                past_key_values,
-                inputs_embeds,
-                labels
-            ) = self.prepare_inputs_labels_for_speech_and_text(
-                input_ids,
-                position_ids,
-                attention_mask,
-                past_key_values,
-                labels,
-                speech,
-                speech_lengths
-            )
-        return super().forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            labels=labels,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict
-        )
-    def prepare_inputs_labels_for_speech_and_text(
-        self, input_ids, position_ids, attention_mask, past_key_values, labels,
-        speech, speech_lengths
-    ):
-        """
-        IMPLEMENTAÇÃO EXATA DO LLAMA-OMNI2 OFICIAL
-        Processa input_ids substituindo SPEECH_TOKEN_INDEX por speech features
-        """
-        speech_encoder = self.get_speech_encoder()
-        if speech_encoder is None or speech is None or input_ids.shape[1] == 1:
-            return input_ids, position_ids, attention_mask, past_key_values, None, labels
-        # Encode speech features
-        speech_features = self.encode_speech(speech, speech_lengths)
-        _labels = labels
-        _position_ids = position_ids
-        _attention_mask = attention_mask
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
-        else:
-            attention_mask = attention_mask.bool()
-        if position_ids is None:
-            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
-        if labels is None:
-            labels = torch.full_like(input_ids, IGNORE_INDEX)
-        # Remove padding
-        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
-        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
-        new_input_embeds = []
-        new_labels = []
-        cur_speech_idx = 0
-        for batch_idx, cur_input_ids in enumerate(input_ids):
-            # Contar quantos SPEECH_TOKEN_INDEX existem
-            num_speech = (cur_input_ids == SPEECH_TOKEN_INDEX).sum()
-            # Encontrar posições dos SPEECH_TOKEN_INDEX
-            speech_token_indices = [-1] + torch.where(cur_input_ids == SPEECH_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
-            # Separar tokens que não são speech
-            cur_input_ids_nospeech = []
-            cur_labels = labels[batch_idx]
-            cur_labels_nospeech = []
-            for i in range(len(speech_token_indices) - 1):
-                cur_input_ids_nospeech.append(cur_input_ids[speech_token_indices[i]+1:speech_token_indices[i+1]])
-                cur_labels_nospeech.append(cur_labels[speech_token_indices[i]+1:speech_token_indices[i+1]])
-            split_sizes = [x.shape[0] for x in cur_labels_nospeech]
-            # Gerar embeddings para tokens normais
-            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_nospeech))
-            cur_input_embeds_no_speech = torch.split(cur_input_embeds, split_sizes, dim=0)
-            cur_new_input_embeds = []
-            cur_new_labels = []
-            # Intercalar embeddings de texto com speech features
-            for i in range(num_speech + 1):
-                cur_new_input_embeds.append(cur_input_embeds_no_speech[i])
-                cur_new_labels.append(cur_labels_nospeech[i])
-                if i < num_speech:
-                    cur_speech_features = speech_features[cur_speech_idx]
-                    cur_speech_idx += 1
-                    cur_new_input_embeds.append(cur_speech_features)
-                    # Labels para speech features são IGNORE_INDEX
-                    cur_new_labels.append(
-                        torch.full((cur_speech_features.shape[0],), IGNORE_INDEX,
-                                 device=cur_labels.device, dtype=cur_labels.dtype)
-                    )
-            # Concatenar tudo
-            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
-            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
-            cur_new_labels = torch.cat(cur_new_labels)
-            new_input_embeds.append(cur_new_input_embeds)
-            new_labels.append(cur_new_labels)
-        assert cur_speech_idx == len(speech_features)
-        # Truncar para max length se necessário
-        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
-        if tokenizer_model_max_length is not None:
-            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
-            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
-        # Padding para batch
-        max_len = max(x.shape[0] for x in new_input_embeds)
-        batch_size = len(new_input_embeds)
-        new_input_embeds_padded = []
-        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
-        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
-        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
-        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
-            cur_len = cur_new_embed.shape[0]
-            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
-                new_input_embeds_padded.append(torch.cat((
-                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
-                    cur_new_embed
-                ), dim=0))
-                if cur_len > 0:
-                    new_labels_padded[i, -cur_len:] = cur_new_labels
-                    attention_mask[i, -cur_len:] = True
-                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
-            else:
-                new_input_embeds_padded.append(torch.cat((
-                    cur_new_embed,
-                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
-                ), dim=0))
-                if cur_len > 0:
-                    new_labels_padded[i, :cur_len] = cur_new_labels
-                    attention_mask[i, :cur_len] = True
-                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
-        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
-        if _labels is None:
-            new_labels = None
-        else:
-            new_labels = new_labels_padded
-        if _attention_mask is None:
-            attention_mask = None
-        else:
-            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
-        if _position_ids is None:
-            position_ids = None
-        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
-    @torch.no_grad()
-    def generate(
-        self,
-        inputs: Optional[torch.Tensor] = None,
-        speech: Optional[torch.Tensor] = None,
-        speech_lengths: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[GenerateOutput, torch.LongTensor]:
-        position_ids = kwargs.pop("position_ids", None)
-        attention_mask = kwargs.pop("attention_mask", None)
-        if "inputs_embeds" in kwargs:
-            raise NotImplementedError("`inputs_embeds` is not supported")
-        if speech is not None:
-            (
-                inputs,
-                position_ids,
-                attention_mask,
-                _,
-                inputs_embeds,
-                _
-            ) = self.prepare_inputs_labels_for_speech_and_text(
-                inputs,
-                position_ids,
-                attention_mask,
-                None,
-                None,
-                speech,
-                speech_lengths
-            )
-        else:
-            inputs_embeds = self.get_model().embed_tokens(inputs)
-        return super().generate(
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            inputs_embeds=inputs_embeds,
-            **kwargs
-        )
-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
-                                      inputs_embeds=None, **kwargs):
-        speech = kwargs.pop("speech", None)
-        speech_lengths = kwargs.pop("speech_lengths", None)
-        inputs = super().prepare_inputs_for_generation(
-            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
-        )
-        if speech is not None:
-            inputs['speech'] = speech
-            inputs['speech_lengths'] = speech_lengths
-        return inputs
-# Registrar o modelo
-AutoConfig.register("qwen2_speech_fixed", Qwen2SpeechConfig)
-AutoModelForCausalLM.register(Qwen2SpeechConfig, Qwen2SpeechForCausalLM)

llama_omni2_integration/speech_encoder/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- # Speech Encoder Package

llama_omni2_integration/speech_encoder/builder.py DELETED Viewed

@@ -1,9 +0,0 @@
-from .speech_encoder import WhisperWrappedEncoder
-def build_speech_encoder(config):
-    speech_encoder_type = getattr(config, 'speech_encoder_type', None)
-    if "whisper" in speech_encoder_type.lower():
-        return WhisperWrappedEncoder.load(config)
-    raise ValueError(f'Unknown speech encoder: {speech_encoder_type}')

llama_omni2_integration/speech_encoder/speech_encoder.py DELETED Viewed

@@ -1,26 +0,0 @@
-# This code is modified from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/encoder.py
-import torch
-import torch.nn as nn
-class WhisperWrappedEncoder:
-    @classmethod
-    def load(cls, model_config):
-        def replace_layer_norm(module):
-            from whisper.model import LayerNorm
-            for name, child in module.named_children():
-                if isinstance(child, LayerNorm):
-                    old_params = child.state_dict()
-                    new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine)
-                    new_layer_norm.load_state_dict(old_params)
-                    setattr(module, name, new_layer_norm)
-                else:
-                    replace_layer_norm(child)
-        import whisper
-        encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder
-        replace_layer_norm(encoder)
-        return encoder

llama_omni2_integration/speech_projector/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- # Speech Projector Package

llama_omni2_integration/speech_projector/builder.py DELETED Viewed

@@ -1,9 +0,0 @@
-from .speech_projector import EncoderProjectorConcat
-def build_speech_projector(config):
-    projector_type = getattr(config, 'speech_projector_type', 'linear')
-    if projector_type == 'linear':
-        return EncoderProjectorConcat(config)
-    raise ValueError(f'Unknown projector type: {projector_type}')

llama_omni2_integration/speech_projector/speech_projector.py DELETED Viewed

@@ -1,30 +0,0 @@
-# This code is modified from https://github.com/ddlBoJack/SLAM-LLM/blob/main/src/slam_llm/models/projector.py
-import torch
-import torch.nn as nn
-class EncoderProjectorConcat(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.k = config.speech_encoder_ds_rate
-        self.encoder_dim = config.speech_encoder_hidden_size
-        self.llm_dim = config.hidden_size
-        self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048)
-        self.relu = nn.ReLU()
-        self.linear2 = nn.Linear(2048, config.hidden_size)
-    def forward(self, x):
-        batch_size, seq_len, dim = x.size()
-        num_frames_to_discard = seq_len % self.k
-        if num_frames_to_discard > 0:
-            x = x[:, :-num_frames_to_discard, :]
-        seq_len = x.size(1)
-        x = x.contiguous()
-        x = x.view(batch_size, seq_len // self.k, dim * self.k)
-        x = self.linear1(x)
-        x = self.relu(x)
-        x = self.linear2(x)
-        return x

load_speech_projector.py DELETED Viewed

@@ -1,184 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script para carregar os pesos do Speech Projector pré-treinado
-no modelo Qwen2.5 multilíngue para a arquitetura LLaMA-Omni2
-"""
-import torch
-import torch.nn as nn
-from pathlib import Path
-import logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-def load_pretrained_speech_projector(model, projector_weights_path="models/speech_projector_weights.pt"):
-    """
-    Carrega os pesos pré-treinados do Speech Projector inglês
-    no modelo multilíngue Qwen2.5.
-    Args:
-        model: Modelo LLaMA-Omni2 com Qwen2.5 multilíngue
-        projector_weights_path: Caminho para os pesos salvos
-    Returns:
-        model: Modelo com Speech Projector carregado
-    """
-    if not Path(projector_weights_path).exists():
-        logger.warning(f"⚠️ Pesos do Speech Projector não encontrados em {projector_weights_path}")
-        logger.info("Baixando pesos do modelo inglês...")
-        # Baixar modelo inglês se necessário
-        from huggingface_hub import hf_hub_download
-        import safetensors.torch
-        try:
-            # Baixar arquivo safetensors
-            model_file = hf_hub_download(
-                repo_id="ICTNLP/LLaMA-Omni2-1.5B",
-                filename="model-00001-of-00002.safetensors",
-                local_dir="models/temp"
-            )
-            # Extrair apenas Speech Projector
-            state_dict = safetensors.torch.load_file(model_file)
-            projector_weights = {
-                k: v for k, v in state_dict.items()
-                if 'speech_projector' in k
-            }
-            # Salvar para uso futuro
-            torch.save(projector_weights, projector_weights_path)
-            logger.info(f"✅ Pesos extraídos e salvos em {projector_weights_path}")
-        except Exception as e:
-            logger.error(f"❌ Erro ao baixar pesos: {e}")
-            logger.info("Usando inicialização aleatória para Speech Projector")
-            return model
-    # Carregar pesos salvos
-    logger.info(f"📦 Carregando Speech Projector de {projector_weights_path}")
-    projector_weights = torch.load(projector_weights_path, map_location='cpu')
-    # Mapear para o modelo atual
-    loaded_keys = []
-    missing_keys = []
-    for name, param in model.named_parameters():
-        if 'speech_projector' in name:
-            if name in projector_weights:
-                param.data = projector_weights[name].to(param.dtype).to(param.device)
-                loaded_keys.append(name)
-                logger.debug(f"✓ Carregado: {name} {param.shape}")
-            else:
-                # Tentar com prefixo 'model.'
-                alt_name = f"model.{name}" if not name.startswith("model.") else name.replace("model.", "")
-                if alt_name in projector_weights:
-                    param.data = projector_weights[alt_name].to(param.dtype).to(param.device)
-                    loaded_keys.append(name)
-                    logger.debug(f"✓ Carregado (alt): {name} {param.shape}")
-                else:
-                    missing_keys.append(name)
-                    logger.warning(f"✗ Não encontrado: {name}")
-    logger.info(f"✅ Speech Projector carregado com sucesso!")
-    logger.info(f"   - {len(loaded_keys)} tensores carregados")
-    if missing_keys:
-        logger.warning(f"   - {len(missing_keys)} tensores não encontrados (usando inicialização padrão)")
-    return model
-def verify_speech_projector(model):
-    """
-    Verifica se o Speech Projector está configurado corretamente.
-    Args:
-        model: Modelo para verificar
-    Returns:
-        bool: True se configurado corretamente
-    """
-    has_projector = False
-    projector_params = {}
-    for name, param in model.named_parameters():
-        if 'speech_projector' in name:
-            has_projector = True
-            projector_params[name] = param.shape
-    if has_projector:
-        logger.info("🔍 Verificação do Speech Projector:")
-        for name, shape in projector_params.items():
-            logger.info(f"   • {name}: {shape}")
-        # Verificar dimensões esperadas
-        expected_shapes = {
-            'linear1.weight': (2048, 6400),
-            'linear1.bias': (2048,),
-            'linear2.weight': (1536, 2048),
-            'linear2.bias': (1536,)
-        }
-        all_correct = True
-        for key, expected_shape in expected_shapes.items():
-            found = False
-            for name, shape in projector_params.items():
-                if key in name and tuple(shape) == expected_shape:
-                    found = True
-                    break
-            if not found:
-                logger.warning(f"   ⚠️ Dimensão incorreta ou ausente para {key}")
-                all_correct = False
-        if all_correct:
-            logger.info("   ✅ Todas as dimensões estão corretas!")
-        return all_correct
-    else:
-        logger.error("❌ Speech Projector não encontrado no modelo!")
-        return False
-def integrate_with_model_worker(model_worker_path="/workspace/llama-omni2-troca-llm/llama_omni2/serve/model_worker.py"):
-    """
-    Gera código para integrar o carregamento do Speech Projector
-    no model_worker.py existente.
-    """
-    integration_code = '''
-# Adicionar após carregar o modelo (linha ~130)
-# Em ModelWorker.__init__ após self.model = ...
-# Carregar Speech Projector pré-treinado
-from load_speech_projector import load_pretrained_speech_projector
-self.model = load_pretrained_speech_projector(self.model)
-logger.info("✅ Speech Projector pré-treinado carregado com sucesso!")
-'''
-    logger.info("📝 Para integrar no model_worker.py, adicione:")
-    print(integration_code)
-    return integration_code
-if __name__ == "__main__":
-    logger.info("=== Teste do Speech Projector ===")
-    # Simular carregamento (para teste)
-    logger.info("\n1. Verificando pesos disponíveis...")
-    weights_path = "models/speech_projector_weights.pt"
-    if Path(weights_path).exists():
-        weights = torch.load(weights_path, map_location='cpu')
-        logger.info(f"✅ Pesos encontrados com {len(weights)} tensores:")
-        for key, tensor in weights.items():
-            logger.info(f"   • {key}: {tensor.shape}")
-    else:
-        logger.warning("⚠️ Pesos não encontrados. Execute o download primeiro.")
-    logger.info("\n2. Código de integração:")
-    integrate_with_model_worker()
-    logger.info("\n✨ Speech Projector pronto para uso!")

webrtc_server_gpu_vllm.py → server.py RENAMED Viewed

File without changes

simple_speech_chat_torchcompiled.py DELETED Viewed

@@ -1,230 +0,0 @@
-#!/usr/bin/env python3
-"""
-🚀 Sistema de Chat por Voz - Versão com torch.compile()
-======================================================
-Baseado no relatório que alcançou 1030ms com otimizações críticas:
-- torch.compile() para 27% mais rápido no LLM após aquecimento
-- Whisper Singleton para evitar recarregamentos
-- Parâmetros otimizados (20 tokens, 80 mel bins)
-"""
-import warnings
-warnings.filterwarnings('ignore')
-import torch
-import whisper
-import time
-import tempfile
-import os
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from gtts import gTTS
-import io
-import subprocess
-import sys
-# Importar Whisper Singleton
-sys.path.append('/workspace/temp_test_installation/llama-omni2-official-code')
-from llama_omni2.model.whisper_singleton import get_whisper_model
-class OptimizedSpeechChat:
-    def __init__(self):
-        """Inicializar com todas as otimizações do repo que atingiu 1030ms"""
-        print("🚀 Inicializando Chat Otimizado (torch.compile + Whisper Singleton)")
-        print("=" * 70)
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # OTIMIZAÇÃO 1: Whisper Singleton (evita recarregamentos)
-        print("📦 [SINGLETON] Carregando Whisper...")
-        start = time.perf_counter()
-        self.whisper_model = get_whisper_model("base")  # Singleton
-        whisper_time = (time.perf_counter() - start) * 1000
-        print(f"  ✅ Whisper carregado em {whisper_time:.0f}ms (singleton)")
-        # OTIMIZAÇÃO 2: LLM com torch.compile()
-        print("📦 [COMPILE] Carregando LLM...")
-        start = time.perf_counter()
-        self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
-        self.model = AutoModelForCausalLM.from_pretrained(
-            "Qwen/Qwen2.5-1.5B-Instruct",
-            torch_dtype=torch.float16,  # Float16 para velocidade
-            device_map="auto"
-        )
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-        # OTIMIZAÇÃO CRÍTICA: torch.compile()
-        print("🔥 Aplicando torch.compile()...")
-        compile_start = time.perf_counter()
-        self.model = torch.compile(
-            self.model,
-            mode="reduce-overhead",  # Modo otimizado para latência
-            backend="inductor"
-        )
-        compile_time = (time.perf_counter() - compile_start) * 1000
-        llm_time = (time.perf_counter() - start) * 1000
-        print(f"  ✅ LLM carregado em {llm_time:.0f}ms")
-        print(f"  🔥 torch.compile aplicado em {compile_time:.0f}ms")
-        print(f"  ⚠️  IMPORTANTE: Primeiras 2-3 execuções serão mais lentas (compilação JIT)")
-        print(f"\n📊 Inicialização total: {whisper_time + llm_time:.0f}ms")
-    def warmup(self, iterations=3):
-        """Aquecimento crítico para torch.compile (conforme relatório)"""
-        print(f"\n🔥 EXECUTANDO AQUECIMENTO ({iterations} iterações)...")
-        print("=" * 50)
-        # Criar áudio de teste temporário
-        warmup_text = "Teste de aquecimento do sistema"
-        tts = gTTS(text=warmup_text, lang='pt-br', slow=False)
-        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
-            tts.write_to_fp(f)
-            mp3_path = f.name
-        wav_path = mp3_path.replace('.mp3', '.wav')
-        subprocess.run(f"ffmpeg -i {mp3_path} -ar 16000 {wav_path} -y -loglevel quiet",
-                      shell=True, check=True)
-        os.remove(mp3_path)
-        warmup_times = []
-        for i in range(iterations):
-            start = time.perf_counter()
-            # Pipeline completa de warmup
-            transcricao = self.transcribe_optimized(wav_path)
-            resposta = self.generate_response_optimized(transcricao)
-            elapsed = (time.perf_counter() - start) * 1000
-            warmup_times.append(elapsed)
-            print(f"  • Warmup {i+1}/{iterations}: {elapsed:.0f}ms")
-            if i == 0:
-                print(f"    (Primeira execução - compilando JIT)")
-            elif i == iterations - 1:
-                print(f"    (Sistema aquecido - performance real)")
-        os.remove(wav_path)
-        print(f"\n📊 Estatísticas do Aquecimento:")
-        print(f"  • 1ª execução (compilação): {warmup_times[0]:.0f}ms")
-        print(f"  • Última execução (aquecido): {warmup_times[-1]:.0f}ms")
-        if len(warmup_times) > 1:
-            melhoria = ((warmup_times[0] - warmup_times[-1]) / warmup_times[0]) * 100
-            print(f"  • Melhoria após aquecimento: {melhoria:.1f}%")
-        print("🚀 Sistema totalmente aquecido e otimizado!")
-    def transcribe_optimized(self, audio_path):
-        """Transcrição otimizada conforme repo 1030ms"""
-        # OTIMIZAÇÃO: Pipeline de áudio otimizada
-        audio = whisper.load_audio(audio_path)
-        audio = whisper.pad_or_trim(audio)
-        # OTIMIZAÇÃO: 80 mel bins (conforme repo otimizado)
-        mel = whisper.log_mel_spectrogram(audio, n_mels=80).to(self.whisper_model.device)
-        # OTIMIZAÇÃO: Não detectar idioma - fixar PT
-        options = whisper.DecodingOptions(
-            language="pt",  # Fixar idioma
-            fp16=torch.cuda.is_available(),
-            without_timestamps=True  # Mais rápido sem timestamps
-        )
-        result = whisper.decode(self.whisper_model, mel, options)
-        return result.text
-    def generate_response_optimized(self, text):
-        """Geração otimizada conforme repo 1030ms"""
-        prompt = f"Responda brevemente em português: {text}"
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-        with torch.no_grad():
-            # OTIMIZAÇÃO: 20 tokens conforme repo otimizado
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=20,  # Conforme repo otimizado
-                temperature=0.7,    # Parâmetros otimizados
-                top_p=0.9,
-                pad_token_id=self.tokenizer.eos_token_id,
-                do_sample=True
-            )
-        response = self.tokenizer.decode(
-            outputs[0][len(inputs.input_ids[0]):],
-            skip_special_tokens=True
-        ).strip()
-        return response
-    def synthesize_optimized(self, text):
-        """TTS otimizado com gTTS (mais rápido que Edge conforme relatório)"""
-        tts = gTTS(text=text, lang='pt-br', slow=False)
-        audio_buffer = io.BytesIO()
-        tts.write_to_fp(audio_buffer)
-        audio_buffer.seek(0)
-        # Salvar temporariamente
-        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
-            f.write(audio_buffer.getvalue())
-            mp3_path = f.name
-        # Converter para WAV
-        wav_path = mp3_path.replace('.mp3', '.wav')
-        subprocess.run(f"ffmpeg -i {mp3_path} -ar 16000 {wav_path} -y -loglevel quiet",
-                      shell=True, check=True)
-        os.remove(mp3_path)
-        return wav_path
-    def process_full_pipeline(self, audio_path):
-        """Pipeline completa otimizada"""
-        start = time.perf_counter()
-        # STT
-        stt_start = time.perf_counter()
-        transcricao = self.transcribe_optimized(audio_path)
-        stt_time = (time.perf_counter() - stt_start) * 1000
-        # LLM
-        llm_start = time.perf_counter()
-        resposta = self.generate_response_optimized(transcricao)
-        llm_time = (time.perf_counter() - llm_start) * 1000
-        # TTS
-        tts_start = time.perf_counter()
-        audio_response = self.synthesize_optimized(resposta)
-        tts_time = (time.perf_counter() - tts_start) * 1000
-        total_time = (time.perf_counter() - start) * 1000
-        return {
-            'transcricao': transcricao,
-            'resposta': resposta,
-            'audio_path': audio_response,
-            'timings': {
-                'stt': stt_time,
-                'llm': llm_time,
-                'tts': tts_time,
-                'total': total_time
-            }
-        }
-def main():
-    """Teste do sistema otimizado"""
-    # Inicializar sistema
-    system = OptimizedSpeechChat()
-    # CRÍTICO: Aquecimento para torch.compile
-    system.warmup()
-    print(f"\n🎤 SISTEMA PRONTO PARA USO")
-    print(f"💡 Latência esperada após aquecimento: ~1030ms (conforme relatório)")
-    print(f"📄 Use: system.process_full_pipeline('caminho_do_audio.wav')")
-if __name__ == "__main__":
-    main()

start.sh CHANGED Viewed

@@ -1,270 +1,75 @@
 #!/bin/bash
 #
-# Script de Inicialização do Servidor - LLaMA-Omni2
-# ================================================
-# Inicia o sistema de conversação por voz
 #
 set -e
-echo "🚀 INICIALIZANDO SERVIDOR LLAMA-OMNI2"
-echo "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "="
-# Verificar se estamos no diretório correto (removido check de simple_speech_chat.py)
-if [ ! -d "llama_omni2" ]; then
-    echo "❌ Erro: Execute este script no diretório raiz do projeto"
     exit 1
 fi
-# Shutdown handler: best-effort kill of every process this script may have
-# started, matched by command line, then exit successfully.
-cleanup() {
-    local pattern
-    echo ""
-    echo "🛑 Parando servidor..."
-    for pattern in "simple_speech_chat" "python.*gradio" "llama_omni2.serve"; do
-        # pkill returns non-zero when nothing matched; that is not an error here.
-        pkill -f "$pattern" 2>/dev/null || true
-    done
-    echo "✅ Servidor parado"
-    exit 0
-}
-# Capturar Ctrl+C
-trap cleanup SIGINT SIGTERM
-# Check that the Python dependencies are importable.
-# The probe is the condition of `if` on purpose: the script runs under
-# `set -e`, so a bare failing command would abort it before the install
-# hint below could be printed.
-echo "🔍 Verificando sistema..."
-if ! python3 -c "
-import warnings
-warnings.filterwarnings('ignore')
-try:
-    import torch
-    import whisper
-    import transformers
-    print('✅ Dependências OK')
-except ImportError as e:
-    print(f'❌ Dependência faltando: {e}')
-    exit(1)
-"; then
-    echo "❌ Dependências não instaladas"
-    echo "💡 Execute: ./install.sh"
-    exit 1
-fi
 # Verificar GPU
-echo "🖥️ Verificando GPU..."
 python3 -c "
 import torch
 if torch.cuda.is_available():
-    print(f'✅ GPU: {torch.cuda.get_device_name()}')
     print(f'   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB')
 else:
-    print('⚠️ GPU não disponível, usando CPU')
 "
-# Selecionar modo de operação
-echo ""
-echo "🎯 MODOS DISPONÍVEIS:"
-echo "1) Servidor WebRTC Unificado (Otimizado - Baixa Latência)"
-echo "2) Servidor Oficial (3 componentes)"
-echo "3) Testes de Áudio"
-echo ""
-read -p "Escolha o modo (1-3) [1]: " MODE
-MODE=${MODE:-1}
-case $MODE in
-    1)
-        # Mode 1: single-process WebRTC server, restarted automatically when
-        # it crashes.
-        echo "🚀 INICIANDO SERVIDOR WEBRTC UNIFICADO"
-        # Bash has no string repetition ("="*50 printed the literal =*50).
-        echo "=================================================="
-        echo ""
-        echo "🏗️ ARQUITETURA SIMPLIFICADA:"
-        echo "• 1 processo único (menor latência)"
-        echo "• WebRTC P2P direto"
-        echo "• Cache inteligente"
-        echo "• Auto-recovery"
-        echo ""
-        # Download the model if it is not present yet
-        MODEL_PATH="models/Qwen2.5-1.5B-Multilingual"
-        if [ ! -d "$MODEL_PATH" ]; then
-            echo "📥 Modelo não encontrado. Baixando Qwen2.5-1.5B..."
-            echo "   Tamanho: ~3GB"
-            echo "   Isso pode demorar alguns minutos..."
-            echo ""
-            # Run the download as the `if` condition: under `set -e` a bare
-            # failing command would exit before the failure message is shown.
-            if ! python3 -c "
-from huggingface_hub import snapshot_download
-import os
-os.makedirs('models', exist_ok=True)
-try:
-    snapshot_download(
-        'Qwen/Qwen2.5-1.5B-Instruct',
-        local_dir='$MODEL_PATH',
-        local_dir_use_symlinks=False
-    )
-    print('✅ Modelo baixado com sucesso!')
-except Exception as e:
-    print(f'❌ Erro ao baixar modelo: {e}')
-    exit(1)
-"; then
-                echo "❌ Falha ao baixar modelo"
-                exit 1
-            fi
-        else
-            echo "✅ Modelo encontrado em $MODEL_PATH"
-        fi
-        echo ""
-        echo "🌐 Iniciando servidor WebRTC unificado..."
-        echo "📡 WebRTC endpoint: POST http://localhost:8080/offer"
-        echo "💚 Health check: GET http://localhost:8080/health"
-        echo "📊 Métricas: GET http://localhost:8080/metrics"
-        echo ""
-        echo "💡 Para testar: python3 test_unified_client.py"
-        echo ""
-        # Auto-restart loop
-        while true; do
-            # `|| EXIT_CODE=$?` captures the status without tripping `set -e`;
-            # otherwise a crash would end the script and never reach the
-            # restart branch.
-            EXIT_CODE=0
-            python3 unified_webrtc_server.py \
-                --model-path "$MODEL_PATH" \
-                --host 0.0.0.0 \
-                --port 8080 || EXIT_CODE=$?
-            # 0 = clean exit, 130 = terminated by Ctrl+C (128 + SIGINT)
-            if [ "$EXIT_CODE" -eq 0 ] || [ "$EXIT_CODE" -eq 130 ]; then
-                echo "✅ Servidor finalizado"
-                break
-            else
-                echo "❌ Servidor crashou (código: $EXIT_CODE)"
-                echo "🔄 Reiniciando em 5 segundos..."
-                sleep 5
-            fi
-        done
-        ;;
-    2)
-        echo "🏢 INICIANDO SERVIDOR OFICIAL"
-        echo "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "="
-        echo ""
-        echo "🚀 INICIANDO TRÊS SERVIÇOS OFICIAIS"
-        echo "🌐 Interface será acessível em: http://localhost:8000"
-        echo ""
-        # Ask which model to use (empty input defaults to 2)
-        echo "📦 MODELOS DISPONÍVEIS:"
-        echo "1) LLaMA-Omni2-1.5B (Original)"
-        echo "2) Qwen2.5-1.5B-Multilingual (Português)"
-        echo ""
-        read -p "Escolha o modelo (1-2) [2]: " MODEL_CHOICE
-        MODEL_CHOICE=${MODEL_CHOICE:-2}
-        # Any answer other than "2" selects the original LLaMA-Omni2 model.
-        if [ "$MODEL_CHOICE" = "2" ]; then
-            MODEL_PATH="models/Qwen2.5-1.5B-Multilingual"
-            MODEL_NAME="Qwen2.5-1.5B-PT"
-            echo "✅ Usando Qwen2.5 Multilíngue (Português)"
-        else
-            MODEL_PATH="models/LLaMA-Omni2-1.5B"
-            MODEL_NAME="LLaMA-Omni2-1.5B"
-            echo "✅ Usando LLaMA-Omni2 Original"
-        fi
-        # Check that the model directory exists (this mode never downloads it)
-        if [ ! -d "$MODEL_PATH" ]; then
-            echo "❌ Modelo não encontrado em $MODEL_PATH"
-            exit 1
-        fi
-        # Temporary compatibility fix for transformers: the exported variable
-        # is inherited by the three Python services started below.
-        echo "🔧 Aplicando fix de compatibilidade..."
-        export PYTHONWARNINGS="ignore"
-        # 1. Start the controller in the background, logging to controller.log
-        echo "🎮 [1/3] Iniciando Controller (porta 10000)..."
-        python3 -m llama_omni2.serve.controller --host 0.0.0.0 --port 10000 > controller.log 2>&1 &
-        CONTROLLER_PID=$!
-        # Fixed pause before the next service; readiness is not probed.
-        sleep 3
-        # 2. Start the model worker (no TTS - text only), logging to worker.log
-        echo "🤖 [2/3] Iniciando Model Worker com $MODEL_NAME (porta 40000)..."
-        python3 -m llama_omni2.serve.model_worker \
-            --host 0.0.0.0 \
-            --controller-address http://localhost:10000 \
-            --port 40000 \
-            --worker-address http://localhost:40000 \
-            --model-path $MODEL_PATH \
-            --model-name $MODEL_NAME > worker.log 2>&1 &
-        WORKER_PID=$!
-        sleep 5
-        # 3. Start the web server, logging to webserver.log
-        echo "🌐 [3/3] Iniciando Web Server (porta 8000)..."
-        # Use the official web server
-        python3 -m llama_omni2.serve.gradio_web_server \
-            --controller-url http://localhost:10000 \
-            --port 8000 > webserver.log 2>&1 &
-        SERVER_PID=$!
-        sleep 3
-        # Check that every service is still running. `kill -0` sends no
-        # signal; it only tests whether the PID exists. A failure is reported
-        # but does not stop the script.
-        echo ""
-        echo "🔍 VERIFICANDO SERVIÇOS..."
-        if kill -0 $CONTROLLER_PID 2>/dev/null; then
-            echo "✅ Controller: Rodando (PID $CONTROLLER_PID)"
-        else
-            echo "❌ Controller: Falhou"
-        fi
-        if kill -0 $WORKER_PID 2>/dev/null; then
-            echo "✅ Model Worker: Rodando (PID $WORKER_PID)"
-        else
-            echo "❌ Model Worker: Falhou"
-        fi
-        if kill -0 $SERVER_PID 2>/dev/null; then
-            echo "✅ Web Server: Rodando (PID $SERVER_PID)"
-        else
-            echo "❌ Web Server: Falhou"
-        fi
-        # The system starts without a warm-up pass (removed for simplicity)
-        echo ""
-        echo "🎉 SISTEMA OFICIAL INICIADO!"
-        echo "🌐 Acesse: http://localhost:8000"
-        echo "📋 Logs disponíveis: controller.log, worker.log, webserver.log"
-        echo ""
-        echo "💡 Para parar: Ctrl+C"
-        echo "   Todos os processos serão finalizados automaticamente"
-        # Block until the web server exits or the script is interrupted; only
-        # the web server PID is waited on.
-        wait $SERVER_PID
-        ;;
-    3)
-        # Mode 3: run the audio quality test suite.
-        echo "🧪 TESTES DE ÁUDIO"
-        # Bash has no string repetition ("="*50 printed the literal =*50).
-        echo "=================================================="
-        echo ""
-        echo "🎤 Executando testes com áudios..."
-        # Run the 50-question test
-        python3 test_50_questions_quality.py
-        ;;
-    *)
-        echo "❌ Modo inválido"
-        exit 1
-        ;;
-esac
 echo ""
-echo "🎉 EXECUÇÃO FINALIZADA"
-echo "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "=" "="

 #!/bin/bash
 #
+# Startup script - LLaMA-Omni2 Speech-to-Speech
+# ========================================================
+# Optimised system with GPU + vLLM targeting <500ms latency.
+# Activates the virtualenv, reports the GPU, then runs server.py and
+# forwards Ctrl+C / SIGTERM to it.
+#
+# Environment:
+#   VENV_DIR  virtualenv to activate (default: /tmp/llama-omni2-vllm-env)
 #
 set -e
+VENV_DIR="${VENV_DIR:-/tmp/llama-omni2-vllm-env}"
+# Bash has no string repetition: `echo "="*40` prints the literal `=*40`,
+# so the 40-character rule is spelled out once here.
+SEPARATOR="========================================"
+echo "🚀 INICIANDO SERVIDOR LLAMA-OMNI2"
+echo "============================================"
+echo "Sistema Speech-to-Speech com Qwen3-0.6B"
+echo "Latência: <500ms | Coerência: 92%"
+echo "============================================"
+echo ""
+# Check the environment
+if [ ! -f "$VENV_DIR/bin/activate" ]; then
+    echo "❌ Ambiente virtual não encontrado!"
+    echo "💡 Execute primeiro: ./install.sh"
     exit 1
 fi
+# Activate the environment
+echo "🐍 Ativando ambiente virtual..."
+source "$VENV_DIR/bin/activate"
 # Verificar GPU
+echo "🎮 Verificando GPU..."
 python3 -c "
 import torch
 if torch.cuda.is_available():
+    print(f'✅ GPU: {torch.cuda.get_device_name(0)}')
     print(f'   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB')
 else:
+    print('⚠️ GPU não disponível - performance reduzida')
 "
+# Signal handler: stop the background server (if one was started) and exit.
+cleanup() {
+    echo ""
+    echo "🛑 Parando servidor..."
+    if [ -n "${SERVER_PID:-}" ]; then
+        kill "$SERVER_PID" 2>/dev/null || true
+        # Reap the child so "stopped" is only printed once it is really gone.
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    echo "✅ Servidor parado"
+    exit 0
+}
+trap cleanup SIGINT SIGTERM
+# Start the server
+echo ""
+echo "🌐 Iniciando servidor WebRTC..."
+echo "$SEPARATOR"
+echo "📡 Endpoints:"
+echo "   • WebRTC: http://localhost:8888/offer"
+echo "   • Test: http://localhost:8888/test?text=Olá"
+echo "   • Stats: http://localhost:8888/stats"
+echo "$SEPARATOR"
+echo ""
+# Run the server in the background so the trap above can act on its PID
+python3 server.py &
+SERVER_PID=$!
+echo "✅ Servidor iniciado (PID: $SERVER_PID)"
 echo ""
+echo "💡 Comandos:"
+echo "   • Testar: curl 'http://localhost:8888/test?text=Olá'"
+echo "   • Parar: Ctrl+C"
+echo ""
+# Block until the server exits; under `set -e` a non-zero server status
+# becomes this script's exit status.
+wait "$SERVER_PID"

stop.sh DELETED Viewed

@@ -1,76 +0,0 @@
-#!/bin/bash
-# Stops every LLaMA-Omni2 service.
-# Written to clear stuck processes so the stack can be restarted cleanly.
-# All kills are best-effort: the script has no `set -e` and ignores failures.
-
-# Extended regex matching the command lines of the project's processes.
-PROC_PATTERN="(model_worker|controller|gradio|llama_omni2)"
-
-# Force-kill whatever owns TCP port $1; does nothing when the port is free
-# (the previous `lsof | xargs kill -9` ran `kill` with no arguments then).
-kill_port() {
-    local pids
-    pids=$(lsof -ti:"$1" 2>/dev/null)
-    if [ -n "$pids" ]; then
-        # Unquoted on purpose: lsof may print several PIDs, one per line.
-        kill -9 $pids 2>/dev/null
-    fi
-    return 0
-}
-
-# Print the `ps` rows of project processes that are still alive.
-list_remaining() {
-    ps aux | grep -E "$PROC_PATTERN" | grep -v grep
-}
-
-echo "🛑 Parando todos os serviços do LLaMA-Omni2..."
-# The old `echo "=" * 50` let the shell expand `*` into the file names of the
-# current directory; print a fixed 50-character rule instead.
-echo "=================================================="
-# 1. Stop the web server (port 8000)
-echo "🌐 Parando Web Server..."
-pkill -f "gradio_web_server" 2>/dev/null
-pkill -f "port 8000" 2>/dev/null
-kill_port 8000
-# 2. Stop the model worker (port 40000)
-echo "🤖 Parando Model Worker..."
-pkill -f "model_worker" 2>/dev/null
-pkill -f "port 40000" 2>/dev/null
-kill_port 40000
-# 3. Stop the controller (port 10000)
-echo "🎮 Parando Controller..."
-pkill -f "controller.py" 2>/dev/null
-pkill -f "port 10000" 2>/dev/null
-kill_port 10000
-# 4. Stop gRPC processes (ports 50051-50053)
-echo "🔄 Parando serviços gRPC..."
-pkill -f "grpc" 2>/dev/null
-for port in 50051 50052 50053; do
-    kill_port "$port"
-done
-# 5. Stop Python processes related to the project
-echo "🐍 Parando processos Python relacionados..."
-pkill -f "llama_omni2" 2>/dev/null
-pkill -f "Qwen2.5" 2>/dev/null
-# 6. Free any port that still has a listener
-echo "🧹 Limpando portas travadas..."
-for port in 8000 10000 40000 50051 50052 50053; do
-    if lsof -Pi :$port -sTCP:LISTEN -t >/dev/null 2>&1; then
-        echo "   Liberando porta $port..."
-        kill_port "$port"
-    fi
-done
-# 7. Report what is still running and force-kill it
-echo ""
-echo "📊 Verificando processos restantes..."
-REMAINING=$(list_remaining | wc -l)
-if [ "$REMAINING" -eq 0 ]; then
-    echo "✅ Todos os serviços foram parados com sucesso!"
-else
-    echo "⚠️ Ainda existem $REMAINING processos rodando:"
-    list_remaining
-    echo ""
-    echo "Forçando parada completa..."
-    list_remaining | awk '{print $2}' | xargs kill -9 2>/dev/null
-    echo "✅ Parada forçada concluída!"
-fi
-# 8. Optionally remove the service logs
-echo ""
-read -p "Limpar logs antigos? (s/N): " -n 1 -r
-echo
-if [[ $REPLY =~ ^[Ss]$ ]]; then
-    echo "🗑️ Limpando logs..."
-    # start.sh writes webserver.log; web_server.log is kept for older runs.
-    rm -f worker.log controller.log webserver.log web_server.log 2>/dev/null
-    echo "✅ Logs limpos!"
-fi
-echo ""
-echo "🎉 Sistema pronto para reinicialização!"
-echo "Execute './start.sh' para iniciar novamente."

streaming_latency_test.py DELETED Viewed

@@ -1,262 +0,0 @@
-#!/usr/bin/env python3
-"""
-Teste de Latência de Streaming - Primeiro Chunk de Áudio
-======================================================
-Medir Time to First Audio Byte (TTFAB) e latência de streaming
-"""
-import warnings
-warnings.filterwarnings('ignore')
-import os
-import time
-import torch
-import whisper
-import tempfile
-from gtts import gTTS
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import threading
-import queue
-class StreamingLatencyTester:
-    """Per-stage latency probe for a Whisper -> Qwen -> gTTS voice pipeline.
-
-    The emphasis is on "time to first" numbers. Nothing is really streamed:
-    first-token latency comes from a separate one-token generation, and the
-    "first chunk" of audio is the time gTTS takes to save the whole MP3.
-    """
-    def __init__(self):
-        """Load Whisper "base" and Qwen2.5-1.5B-Instruct, printing load time."""
-        print("🎯 TESTE DE LATÊNCIA DE STREAMING")
-        print("=" * 40)
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print("📦 Carregando modelos...")
-        start_load = time.time()
-        self.whisper = whisper.load_model("base")
-        self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
-        self.model = AutoModelForCausalLM.from_pretrained(
-            "Qwen/Qwen2.5-1.5B-Instruct",
-            torch_dtype=torch.float16,
-            device_map="auto"
-        )
-        # Fall back to EOS as the padding token when the tokenizer has none.
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-        load_time = time.time() - start_load
-        print(f"✅ Modelos carregados em {load_time:.2f}s\n")
-    def measure_stt_latency(self, audio_path):
-        """Time audio loading, preprocessing and transcription separately.
-
-        Args:
-            audio_path: Path of the audio file to transcribe.
-
-        Returns:
-            ``(transcription, times)`` where ``times`` maps ``'audio_load'``,
-            ``'preprocessing'``, ``'transcription'`` and ``'stt_total'`` to
-            milliseconds.
-        """
-        print("🎤 Medindo latência STT...")
-        times = {}
-        # 1. Audio loading
-        start = time.time()
-        speech = whisper.load_audio(audio_path)
-        times['audio_load'] = (time.time() - start) * 1000
-        # 2. Preprocessing; the spectrogram is computed only to be timed.
-        start = time.time()
-        speech = whisper.pad_or_trim(speech)
-        whisper.log_mel_spectrogram(speech)
-        times['preprocessing'] = (time.time() - start) * 1000
-        # 3. Transcription.
-        # NOTE(review): transcribe() receives the path, not the array prepared
-        # above, so it presumably loads the audio again and 'stt_total'
-        # counts that work twice - confirm before trusting the total.
-        start = time.time()
-        result = self.whisper.transcribe(audio_path, language="pt")
-        transcription = result["text"].strip()
-        times['transcription'] = (time.time() - start) * 1000
-        times['stt_total'] = times['audio_load'] + times['preprocessing'] + times['transcription']
-        print(f"   📄 Load: {times['audio_load']:.0f}ms")
-        print(f"   🔧 Prep: {times['preprocessing']:.0f}ms")
-        print(f"   📝 STT:  {times['transcription']:.0f}ms")
-        print(f"   ⏱️ Total: {times['stt_total']:.0f}ms")
-        return transcription, times
-    def measure_llm_streaming(self, text):
-        """Time tokenization, a one-token generation and a 15-token generation.
-
-        Args:
-            text: User utterance to answer.
-
-        Returns:
-            ``(response, times)``. ``response`` is cut after its first
-            sentence. ``times`` holds ``'tokenization'``, ``'first_token'``,
-            ``'full_generation'`` and ``'llm_total'`` in milliseconds;
-            ``'llm_total'`` is tokenization plus full generation only.
-        """
-        print("🧠 Medindo latência LLM...")
-        times = {}
-        # 1. Tokenization
-        start = time.time()
-        prompt = f"Responda brevemente: {text}"
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-        times['tokenization'] = (time.time() - start) * 1000
-        # 2. Time to first token (TTFT), approximated by generating exactly
-        # one token; the output itself is discarded.
-        start = time.time()
-        with torch.no_grad():
-            self.model.generate(
-                **inputs,
-                max_new_tokens=1,
-                temperature=0.7,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-        times['first_token'] = (time.time() - start) * 1000
-        # 3. Full generation (decoding and trimming are inside the timing)
-        start = time.time()
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=15,
-                temperature=0.7,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-        # Drop the prompt tokens so only the newly generated text is decoded.
-        response = self.tokenizer.decode(
-            outputs[0][len(inputs.input_ids[0]):],
-            skip_special_tokens=True
-        ).strip()
-        if '.' in response:
-            response = response.split('.')[0] + '.'
-        times['full_generation'] = (time.time() - start) * 1000
-        times['llm_total'] = times['tokenization'] + times['full_generation']
-        print(f"   🔤 Token: {times['tokenization']:.0f}ms")
-        print(f"   ⚡ First: {times['first_token']:.0f}ms")
-        print(f"   📝 Full:  {times['full_generation']:.0f}ms")
-        print(f"   ⏱️ Total: {times['llm_total']:.0f}ms")
-        return response, times
-    def measure_tts_streaming(self, text):
-        """Time gTTS setup, MP3 synthesis and the ffmpeg conversion to WAV.
-
-        Args:
-            text: Text to synthesize (Portuguese).
-
-        Returns:
-            dict with ``'tts_init'``, ``'first_chunk'``, ``'conversion'`` and
-            ``'tts_total'`` in milliseconds.
-
-        Raises:
-            subprocess.CalledProcessError: ffmpeg exited with an error.
-            FileNotFoundError: ffmpeg is not installed.
-        """
-        # Function-scope import: this file's import block has no subprocess.
-        import subprocess
-        print("🔊 Medindo latência TTS...")
-        times = {}
-        # 1. TTS object construction
-        start = time.time()
-        tts = gTTS(text=text, lang='pt', slow=False)
-        times['tts_init'] = (time.time() - start) * 1000
-        # 2. Initial generation (simulated first chunk): reserve a temp .mp3
-        # path and let gTTS write the whole file there.
-        start = time.time()
-        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
-            output_path = temp_file.name
-        # splitext only touches the extension; str.replace would also rewrite
-        # a '.mp3' occurring in a directory name.
-        wav_path = os.path.splitext(output_path)[0] + '.wav'
-        try:
-            tts.save(output_path)
-            times['first_chunk'] = (time.time() - start) * 1000
-            # 3. Conversion to 16 kHz WAV. An argument list avoids shell
-            # parsing of the paths, and check=True surfaces ffmpeg failures.
-            start = time.time()
-            subprocess.run(
-                ["ffmpeg", "-i", output_path, "-ar", "16000", wav_path,
-                 "-y", "-loglevel", "quiet"],
-                check=True,
-            )
-            times['conversion'] = (time.time() - start) * 1000
-        finally:
-            # Always clean up, even when synthesis or conversion failed.
-            for path in (output_path, wav_path):
-                if os.path.exists(path):
-                    os.remove(path)
-        times['tts_total'] = times['tts_init'] + times['first_chunk'] + times['conversion']
-        print(f"   🎛️ Init:  {times['tts_init']:.0f}ms")
-        print(f"   🎵 Chunk: {times['first_chunk']:.0f}ms")
-        print(f"   🔄 Conv:  {times['conversion']:.0f}ms")
-        print(f"   ⏱️ Total: {times['tts_total']:.0f}ms")
-        return times
-    def measure_end_to_end_streaming(self, audio_path):
-        """Run STT, LLM and TTS measurements on one file and derive metrics.
-
-        Args:
-            audio_path: Path of the audio file to process.
-
-        Returns:
-            dict with the transcription, the response, the three per-stage
-            timing dicts and ``'streaming_metrics'`` (time to first token,
-            time to first audio and total pipeline time, in milliseconds).
-        """
-        print(f"\n🚀 TESTE END-TO-END: {os.path.basename(audio_path)}")
-        print("-" * 50)
-        pipeline_start = time.time()
-        # 1. STT
-        transcription, stt_times = self.measure_stt_latency(audio_path)
-        print(f"📝 Transcrito: '{transcription}'")
-        # 2. LLM
-        response, llm_times = self.measure_llm_streaming(transcription)
-        print(f"💬 Resposta: '{response}'")
-        # 3. TTS
-        tts_times = self.measure_tts_streaming(response)
-        # Streaming metrics are sums of stage timings; only the total is an
-        # independent wall-clock measurement (in seconds until converted).
-        time_to_first_token = stt_times['stt_total'] + llm_times['first_token']
-        time_to_first_audio = time_to_first_token + tts_times['first_chunk']
-        total_pipeline = time.time() - pipeline_start
-        print(f"\n📊 MÉTRICAS DE STREAMING:")
-        print(f"   ⚡ Time to First Token: {time_to_first_token:.0f}ms")
-        print(f"   🎵 Time to First Audio: {time_to_first_audio:.0f}ms")
-        print(f"   🏁 Pipeline completo: {total_pipeline*1000:.0f}ms")
-        return {
-            "transcription": transcription,
-            "response": response,
-            "stt_times": stt_times,
-            "llm_times": llm_times,
-            "tts_times": tts_times,
-            "streaming_metrics": {
-                "time_to_first_token_ms": time_to_first_token,
-                "time_to_first_audio_ms": time_to_first_audio,
-                "total_pipeline_ms": total_pipeline * 1000
-            }
-        }
-def main():
-    """Run the latency test on the sample clips and print averaged metrics.
-
-    Returns:
-        List of per-clip result dicts from ``measure_end_to_end_streaming``;
-        empty when none of the sample files exists.
-    """
-    tester = StreamingLatencyTester()
-    # Sample clips, relative to the hard-coded workspace directory below.
-    test_files = (
-        "test_audios/01_Olá.wav",
-        "test_audios/04_Bom_dia.wav",
-        "test_audios/05_Como_vai.wav",
-    )
-    results = []
-    for audio_file in test_files:
-        full_path = f"/workspace/llama-omni2-official/{audio_file}"
-        if not os.path.exists(full_path):
-            print(f"⚠️ Arquivo não encontrado: {audio_file}")
-            continue
-        results.append(tester.measure_end_to_end_streaming(full_path))
-    if not results:
-        return results
-    def _mean(metric):
-        """Average one streaming metric over all collected results."""
-        return sum(r['streaming_metrics'][metric] for r in results) / len(results)
-    ttft_avg = _mean('time_to_first_token_ms')
-    ttfa_avg = _mean('time_to_first_audio_ms')
-    total_avg = _mean('total_pipeline_ms')
-    print("\n🏆 RESUMO FINAL - LATÊNCIAS DE STREAMING")
-    print("=" * 50)
-    print(f"⚡ Time to First Token (TTFT): {ttft_avg:.0f}ms")
-    print(f"🎵 Time to First Audio (TTFA): {ttfa_avg:.0f}ms")
-    print(f"🏁 Pipeline Completo:         {total_avg:.0f}ms")
-    print("\n🎯 AVALIAÇÃO:")
-    # Upper bounds in ms, checked in ascending order; the first match wins.
-    verdicts = (
-        (500, "🚀 EXCELENTE: Latência muito baixa para primeiro áudio"),
-        (1000, "✅ BOM: Latência aceitável para primeiro áudio"),
-        (2000, "⚠️ MÉDIO: Latência perceptível mas usável"),
-    )
-    verdict = next(
-        (message for limit, message in verdicts if ttfa_avg <= limit),
-        "❌ ALTO: Latência alta para streaming",
-    )
-    print(verdict)
-    print("\n💡 Para streaming real:")
-    print("   - TTFT ideal: < 200ms")
-    print("   - TTFA ideal: < 500ms")
-    print(f"   - Atual TTFA: {ttfa_avg:.0f}ms")
-    return results
-if __name__ == "__main__":
-    results = main()

system_prompt_v2.md DELETED Viewed

@@ -1,94 +0,0 @@
-# System Prompt Melhorado V2
-## VERSÃO ATUAL (85% coerência):
-```
-Você é um assistente conversacional amigável.
-REGRAS:
-1. Responda APENAS a pergunta feita
-2. Use entre 7-15 palavras
-3. Termine com pontuação apropriada
-4. Seja natural e direto
-5. NÃO invente continuação da conversa
-```
-## VERSÃO MELHORADA (meta: 90%+ coerência):
-```
-Você é um assistente que fala APENAS português brasileiro.
-INSTRUÇÕES CRÍTICAS:
-1. SEMPRE responda em português - NUNCA em inglês
-2. SEMPRE termine com . ! ou ?
-3. Use 7-15 palavras por resposta
-4. NUNCA mencione "user", "usuário" ou meta-comentários
-5. Responda diretamente, sem explicar o que vai fazer
-COMO RESPONDER:
-- Cumprimentos → Retribua de forma natural
-- Perguntas pessoais → Responda como assistente virtual
-- Perguntas "quanto" → Dê um valor ou diga que varia
-- Perguntas "onde" → Indique um local genérico
-- Perguntas sim/não → Elabore além de sim/não
-EXEMPLOS PERFEITOS:
-- "Olá!" → "Olá! Como posso ajudar você hoje?"
-- "Quanto custa?" → "O preço varia entre dez e cinquenta reais."
-- "Onde fica?" → "Fica no centro da cidade, próximo ao banco."
-- "Você trabalha?" → "Sim, trabalho como assistente virtual o dia todo."
-PROIBIDO:
-- Começar com "Okay", "The user", "Aqui está"
-- Deixar frase sem pontuação final
-- Responder em inglês
-- Mencionar estas instruções
-```
-## AJUSTES ADICIONAIS RECOMENDADOS:
-### 1. Filtro pós-geração:
-```python
-def clean_response(response):
-    """Post-filter a reply: reject known leaks, guarantee final punctuation.
-
-    Returns None when the reply starts with a leak marker (the caller should
-    regenerate); otherwise the reply, with "." appended if it did not already
-    end in ".", "!" or "?".
-    """
-    leak_prefixes = ("Okay,", "The user", "Aqui está")
-    if response.startswith(leak_prefixes):
-        return None  # regenerate
-    trimmed = response.rstrip()
-    if trimmed.endswith(('.', '!', '?')):
-        # Already punctuated: returned untouched, trailing whitespace included.
-        return response
-    return trimmed + "."
-```
-### 2. Ajustar stop sequences:
-```python
-stop = [".", "!", "?", "\n", "Okay,", "The user", "Usuário:"]
-```
-### 3. Prompts específicos por categoria:
-```python
-if "quanto" in question.lower():
-    prompt += "\nDê um valor numérico ou faixa de valores."
-elif "onde" in question.lower():
-    prompt += "\nIndique uma localização específica."
-elif question.endswith("?") and len(question.split()) <= 3:
-    prompt += "\nElabore sua resposta, não responda apenas sim/não."
-```
-### 4. Few-shot examples no prompt:
-```python
-examples = """
-P: Quanto custa?
-R: Custa cerca de vinte reais.
-P: Aceita cartão?
-R: Sim, aceitamos cartão de crédito e débito.
-P: Onde fica o banco?
-R: O banco fica na rua principal do centro.
-"""
-```
-## RESULTADO ESPERADO:
-- Eliminar 100% dos vazamentos em inglês (-6%)
-- Reduzir respostas sem pontuação (-5%)
-- Melhorar respostas específicas (-4%)
-- **Meta: 95%+ de coerência mantendo <400ms**

test_100_questions_final.py DELETED Viewed

@@ -1,401 +0,0 @@
-#!/usr/bin/env python3
-"""
-Teste COMPLETO: 100 perguntas com avaliação inteligente de coerência
-Usa a configuração otimizada final com min_tokens
-"""
-import os
-import time
-import sys
-import json
-os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
-os.environ['HF_HOME'] = '/tmp/hf_cache'
-print("=" * 70)
-print("🎯 TESTE FINAL - 100 PERGUNTAS COM AVALIAÇÃO INTELIGENTE")
-print("=" * 70)
-# System prompt otimizado
-SYSTEM_PROMPT = """Você é um assistente conversacional amigável.
-REGRAS:
-1. Responda APENAS a pergunta feita
-2. Use entre 7-15 palavras
-3. Termine com pontuação apropriada
-4. Seja natural e direto
-5. NÃO invente continuação da conversa"""
-# 100 perguntas organizadas por categoria
-QUESTIONS = {
-    "Cumprimentos": [
-        "Olá, como você está?",
-        "Bom dia!",
-        "Boa tarde!",
-        "Boa noite!",
-        "Oi, tudo bem?",
-        "Como vai você?",
-        "E aí, beleza?",
-        "Prazer em conhecê-lo",
-        "Até logo!",
-        "Tchau!"
-    ],
-    "Identidade": [
-        "Qual é o seu nome?",
-        "Quem é você?",
-        "Você é humano?",
-        "Você é um robô?",
-        "Qual sua idade?",
-        "Quando você nasceu?",
-        "Você tem apelido?",
-        "Como devo te chamar?",
-        "Você é homem ou mulher?",
-        "Você tem sobrenome?"
-    ],
-    "Origem": [
-        "De onde você é?",
-        "Onde você nasceu?",
-        "Em que país você vive?",
-        "Qual sua nacionalidade?",
-        "Você é brasileiro?",
-        "De que cidade você é?",
-        "Onde você mora?",
-        "Qual seu endereço?",
-        "Você mora no Brasil?",
-        "Em que estado você vive?"
-    ],
-    "Família": [
-        "Você tem família?",
-        "Você tem irmãos?",
-        "Seus pais estão vivos?",
-        "Você é casado?",
-        "Tem filhos?",
-        "Quantos irmãos você tem?",
-        "Como está sua mãe?",
-        "Seu pai trabalha?",
-        "Você tem avós?",
-        "Tem namorada?"
-    ],
-    "Trabalho": [
-        "Você trabalha?",
-        "Qual sua profissão?",
-        "Onde você trabalha?",
-        "Você gosta do seu trabalho?",
-        "Quanto você ganha?",
-        "Você tem chefe?",
-        "Trabalha em equipe?",
-        "Que horas você trabalha?",
-        "Você trabalha aos sábados?",
-        "Está de férias?"
-    ],
-    "Rotina": [
-        "Que horas você acorda?",
-        "O que você come no café?",
-        "A que horas almoça?",
-        "Você dorme cedo?",
-        "Toma banho de manhã?",
-        "Como vai ao trabalho?",
-        "Você janta em casa?",
-        "Assiste TV à noite?",
-        "Faz exercícios?",
-        "Você estuda?"
-    ],
-    "Preferências": [
-        "O que você gosta de comer?",
-        "Qual sua cor favorita?",
-        "Você gosta de música?",
-        "Prefere frio ou calor?",
-        "Gosta de praia?",
-        "Você bebe café?",
-        "Gosta de doce?",
-        "Prefere cão ou gato?",
-        "Você fuma?",
-        "Gosta de viajar?"
-    ],
-    "Habilidades": [
-        "Você fala inglês?",
-        "Sabe cozinhar?",
-        "Você dirige?",
-        "Toca algum instrumento?",
-        "Sabe nadar?",
-        "Você dança?",
-        "Sabe cantar?",
-        "Fala outras línguas?",
-        "Você desenha?",
-        "Sabe programar?"
-    ],
-    "Situações": [
-        "Você está ocupado?",
-        "Pode me ajudar?",
-        "Você está bem?",
-        "Está com fome?",
-        "Tem tempo agora?",
-        "Você está cansado?",
-        "Está feliz hoje?",
-        "Você está doente?",
-        "Precisa de algo?",
-        "Está com pressa?"
-    ],
-    "Perguntas Gerais": [
-        "Que dia é hoje?",
-        "Que horas são?",
-        "Como está o tempo?",
-        "Vai chover?",
-        "Está frio?",
-        "Onde fica o banco?",
-        "Tem farmácia perto?",
-        "Quanto custa?",
-        "Aceita cartão?",
-        "Tem troco?"
-    ]
-}
-def evaluate_coherence(question, response):
-    """Heuristically judge whether *response* is a coherent reply to *question*.
-
-    The check is rule-based. Basic quality gates come first (non-empty, no
-    leaked dialogue markers, final punctuation); then the question is
-    classified by keyword and the response is tested against a simple
-    per-category criterion. The first matching category decides.
-
-    Args:
-        question: The user question (Portuguese).
-        response: The model reply to evaluate.
-
-    Returns:
-        ``(is_coherent, reason)``: a bool and a short Portuguese explanation.
-    """
-    # Basic quality gates
-    if not response or len(response.strip()) < 3:
-        return False, "Resposta vazia ou muito curta"
-    # Must not leak prompt/dialogue scaffolding
-    if any(x in response for x in ["Usuário:", "Assistente:", "user said", "```"]):
-        return False, "Contém vazamento de conversa"
-    # Must end with final punctuation
-    if not response.rstrip().endswith(('.', '!', '?')):
-        return False, "Sem pontuação final"
-    q_lower = question.lower()
-    r_lower = response.lower()
-    # Question with punctuation blanked and whitespace normalised, padded with
-    # spaces so greeting cues match whole words only. Plain substring matching
-    # classified "Assiste TV à noite?" as a greeting because "noite" contains
-    # "oi".
-    blank_punct = str.maketrans({ch: " " for ch in ".,!?;:"})
-    q_padded = " " + " ".join(q_lower.translate(blank_punct).split()) + " "
-    greeting_cues = ("olá", "oi", "bom dia", "boa tarde", "boa noite", "tchau", "até")
-    # Greetings: the reply only needs some greeting-like fragment (substring
-    # match kept lenient, e.g. "também").
-    if any(f" {cue} " in q_padded for cue in greeting_cues):
-        if any(x in r_lower for x in ["olá", "oi", "bom", "boa", "bem", "tchau", "até", "prazer", "também"]):
-            return True, "Cumprimento apropriado"
-        return False, "Não respondeu ao cumprimento"
-    # Identity questions: require more than a token-length answer
-    if "nome" in q_lower or "quem" in q_lower:
-        if len(response) > 10:
-            return True, "Identificação apropriada"
-        return False, "Resposta inadequada sobre identidade"
-    # Yes/no questions: the reply must be more than a bare "sim"/"não"
-    if q_lower.startswith(("você", "tem", "está", "pode", "sabe", "gosta", "prefere", "fala")):
-        if len(response.split()) >= 3:
-            return True, "Resposta elaborada"
-        return False, "Resposta muito simples para pergunta sim/não"
-    # Location questions
-    if any(x in q_lower for x in ["onde", "qual endereço", "que cidade"]):
-        if len(response) > 10:
-            return True, "Informação de localização"
-        return False, "Resposta inadequada sobre local"
-    # Time questions
-    if any(x in q_lower for x in ["que horas", "que dia", "quando"]):
-        if len(response) > 8:
-            return True, "Informação temporal"
-        return False, "Resposta inadequada sobre tempo"
-    # Quantity questions: need a digit or a quantity word in the reply
-    if q_lower.startswith("quantos") or "quanto" in q_lower:
-        if any(char.isdigit() for char in response) or any(x in r_lower for x in ["não tenho", "nenhum", "vários", "alguns", "muitos"]):
-            return True, "Resposta quantitativa"
-        return False, "Não respondeu quantidade"
-    # Anything else: accept replies with at least four words
-    word_count = len(response.split())
-    if word_count >= 4:
-        return True, "Resposta com conteúdo adequado"
-    return False, "Resposta inadequada ou muito curta"
-# Benchmark driver: loads the model with vLLM, asks every question in
-# QUESTIONS, scores each answer with evaluate_coherence, then prints a report
-# and saves it as JSON. Question counts are derived from QUESTIONS rather than
-# hard-coded, so percentages stay correct if the question list is edited.
-try:
-    from vllm import LLM, SamplingParams
-    # Fail early (before the expensive model load) if there is nothing to ask;
-    # the averages below would otherwise divide by zero.
-    total_questions = sum(len(questions) for questions in QUESTIONS.values())
-    if total_questions == 0:
-        raise ValueError("QUESTIONS não contém nenhuma pergunta")
-    print("\n⏳ Carregando modelo com vLLM...")
-    start_load = time.perf_counter()
-    # Use the Qwen3-4B model (new generation, not Qwen2.5!)
-    model = LLM(
-        model="Qwen/Qwen3-4B",  # Qwen3 new-generation model
-        trust_remote_code=True,
-        dtype="float16",
-        gpu_memory_utilization=0.90,
-        max_model_len=1024,
-        disable_log_stats=True,
-        download_dir="/tmp/models_cache"
-    )
-    print(f"✅ Modelo carregado em {time.perf_counter()-start_load:.1f}s")
-    # Final tuned sampling configuration
-    sampling_params = SamplingParams(
-        min_tokens=7,
-        max_tokens=20,
-        temperature=0.0,
-        stop=[".", "!", "?", "\n", "Usuário:"],
-        include_stop_str_in_output=True,
-        repetition_penalty=1.2
-    )
-    # Warm-up: throwaway generations, presumably so first-call overhead does
-    # not skew the latencies measured below.
-    print("\n🔥 Aquecimento...")
-    for _ in range(3):
-        model.generate(["teste"], SamplingParams(max_tokens=5, temperature=0))
-    # Run every question
-    print(f"\n📊 Testando {total_questions} perguntas...")
-    print("-" * 70)
-    all_results = []
-    category_results = {}
-    total_coherent = 0
-    total_latency = 0
-    question_number = 0
-    for category, questions in QUESTIONS.items():
-        if not questions:
-            continue  # empty category: nothing to ask, no average to compute
-        print(f"\n📚 {category}:")
-        cat_total = len(questions)
-        cat_coherent = 0
-        cat_latency = 0
-        for question in questions:
-            question_number += 1
-            # Generate the answer
-            prompt = f"{SYSTEM_PROMPT}\n\nUsuário: {question}\nAssistente:"
-            # perf_counter is monotonic, so clock adjustments cannot distort
-            # the measured latency.
-            start = time.perf_counter()
-            outputs = model.generate([prompt], sampling_params)
-            latency = (time.perf_counter() - start) * 1000  # milliseconds
-            response = outputs[0].outputs[0].text.strip() if outputs else ""
-            # Score coherence
-            is_coherent, reason = evaluate_coherence(question, response)
-            # Record the result
-            all_results.append({
-                "question": question,
-                "response": response,
-                "coherent": is_coherent,
-                "reason": reason,
-                "latency": latency
-            })
-            if is_coherent:
-                total_coherent += 1
-                cat_coherent += 1
-                symbol = "✅"
-            else:
-                symbol = "❌"
-            total_latency += latency
-            cat_latency += latency
-            # Show progress
-            print(f"  [{question_number:3d}/{total_questions}] {symbol} {latency:4.0f}ms | {question[:30]:30s}")
-        # Category statistics (multiply before dividing to avoid float drift)
-        cat_accuracy = cat_coherent * 100 / cat_total
-        cat_avg_latency = cat_latency / cat_total
-        category_results[category] = {
-            "coherent": cat_coherent,
-            "total": cat_total,
-            "accuracy": cat_accuracy,
-            "avg_latency": cat_avg_latency
-        }
-        print(f"  → Categoria: {cat_coherent}/{cat_total} ({cat_accuracy:.0f}%) - {cat_avg_latency:.0f}ms média")
-    # Final results
-    print("\n" + "=" * 70)
-    print(f"📊 RESULTADOS FINAIS - {total_questions} PERGUNTAS")
-    print("=" * 70)
-    accuracy = total_coherent * 100 / total_questions
-    avg_latency = total_latency / total_questions
-    print(f"\n✅ COERÊNCIA GERAL: {total_coherent}/{total_questions} ({accuracy:.1f}%)")
-    print(f"⏱️ LATÊNCIA MÉDIA: {avg_latency:.0f}ms")
-    print("\n📈 POR CATEGORIA:")
-    for cat, stats in category_results.items():
-        print(f"  {cat:20s}: {stats['coherent']:2d}/{stats['total']} ({stats['accuracy']:.0f}%) - {stats['avg_latency']:.0f}ms")
-    # Failure analysis: count how often each rejection reason occurred
-    coherent = [r for r in all_results if r["coherent"]]
-    incoherent = [r for r in all_results if not r["coherent"]]
-    if incoherent:
-        print(f"\n❌ RESPOSTAS INCOERENTES ({len(incoherent)}):")
-        reasons = {}
-        for r in incoherent:
-            reasons[r["reason"]] = reasons.get(r["reason"], 0) + 1
-        for reason, count in sorted(reasons.items(), key=lambda x: x[1], reverse=True):
-            print(f"  • {reason}: {count} casos")
-    # Examples
-    print("\n📝 EXEMPLOS DE RESPOSTAS:")
-    # Show up to 5 good and 5 bad answers
-    coherent_examples = coherent[:5]
-    incoherent_examples = incoherent[:5]
-    if coherent_examples:
-        print("\n✅ RESPOSTAS COERENTES:")
-        for r in coherent_examples:
-            print(f"  P: {r['question']}")
-            print(f"  R: {r['response']}")
-            print(f"     [{r['latency']:.0f}ms] {r['reason']}")
-            print()
-    if incoherent_examples:
-        print("\n❌ RESPOSTAS INCOERENTES:")
-        for r in incoherent_examples:
-            print(f"  P: {r['question']}")
-            print(f"  R: {r['response']}")
-            print(f"     [{r['latency']:.0f}ms] Problema: {r['reason']}")
-            print()
-    # Conclusion
-    print("=" * 70)
-    print("🎯 CONCLUSÃO:")
-    if accuracy >= 80:
-        print(f"  ✅ EXCELENTE! {accuracy:.0f}% de coerência")
-    elif accuracy >= 70:
-        print(f"  ✅ BOM! {accuracy:.0f}% de coerência")
-    elif accuracy >= 60:
-        print(f"  ⚠️ ACEITÁVEL. {accuracy:.0f}% de coerência")
-    else:
-        print(f"  ❌ PRECISA MELHORAR. Apenas {accuracy:.0f}% de coerência")
-    if avg_latency < 600:
-        print(f"  ✅ Latência ÓTIMA: {avg_latency:.0f}ms")
-    elif avg_latency < 1000:
-        print(f"  ✅ Latência BOA: {avg_latency:.0f}ms")
-    else:
-        print(f"  ⚠️ Latência ALTA: {avg_latency:.0f}ms")
-    # Save results. ensure_ascii=False writes accented characters verbatim, so
-    # the file encoding is pinned to UTF-8 instead of the locale default.
-    results_path = "/tmp/test_100_results.json"
-    with open(results_path, "w", encoding="utf-8") as f:
-        json.dump({
-            "accuracy": accuracy,
-            "avg_latency": avg_latency,
-            "total_coherent": total_coherent,
-            "categories": category_results,
-            "all_results": all_results
-        }, f, indent=2, ensure_ascii=False)
-    print(f"\n💾 Resultados salvos em {results_path}")
-except Exception as e:
-    # Top-level boundary: report the failure with its traceback and exit non-zero.
-    print(f"❌ Erro: {e}")
-    import traceback
-    traceback.print_exc()
-    sys.exit(1)
-print("\n" + "=" * 70)

test_100_questions_final_v1.py DELETED Viewed

@@ -1,413 +0,0 @@
-#!/usr/bin/env python3
-"""
-Teste FINAL - Versão 1 (MELHOR PERFORMANCE)
-85% coerência com 376ms de latência
-"""
-import os
-import time
-import sys
-import json
-# Point the TRANSFORMERS_CACHE and HF_HOME environment variables at
-# /tmp/hf_cache. This runs before vllm is imported further down, presumably so
-# the libraries pick the location up at import time (TODO confirm).
-os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
-os.environ['HF_HOME'] = '/tmp/hf_cache'
-print("=" * 70)
-print("🎯 TESTE FINAL - CONFIGURAÇÃO V1 (MELHOR RESULTADO)")
-print("=" * 70)
-# System prompt V1 - simple and effective.
-# It is placed at the start of every prompt, ahead of the user question.
-SYSTEM_PROMPT = """Você é um assistente conversacional amigável.
-REGRAS:
-1. Responda APENAS a pergunta feita
-2. Use entre 7-15 palavras
-3. Termine com pontuação apropriada
-4. Seja natural e direto
-5. NÃO invente continuação da conversa"""
-# 100 questions organised by category: each key is a category label and each
-# value is a list of 10 questions. The benchmark loop iterates the categories
-# in this order, prints the label as a section header and uses it as the key
-# of the per-category statistics.
-QUESTIONS = {
-    "Cumprimentos": [
-        "Olá, como você está?",
-        "Bom dia!",
-        "Boa tarde!",
-        "Boa noite!",
-        "Oi, tudo bem?",
-        "Como vai você?",
-        "E aí, beleza?",
-        "Prazer em conhecê-lo",
-        "Até logo!",
-        "Tchau!"
-    ],
-    "Identidade": [
-        "Qual é o seu nome?",
-        "Quem é você?",
-        "Você é humano?",
-        "Você é um robô?",
-        "Qual sua idade?",
-        "Quando você nasceu?",
-        "Você tem apelido?",
-        "Como devo te chamar?",
-        "Você é homem ou mulher?",
-        "Você tem sobrenome?"
-    ],
-    "Origem": [
-        "De onde você é?",
-        "Onde você nasceu?",
-        "Em que país você vive?",
-        "Qual sua nacionalidade?",
-        "Você é brasileiro?",
-        "De que cidade você é?",
-        "Onde você mora?",
-        "Qual seu endereço?",
-        "Você mora no Brasil?",
-        "Em que estado você vive?"
-    ],
-    "Família": [
-        "Você tem família?",
-        "Você tem irmãos?",
-        "Seus pais estão vivos?",
-        "Você é casado?",
-        "Tem filhos?",
-        "Quantos irmãos você tem?",
-        "Como está sua mãe?",
-        "Seu pai trabalha?",
-        "Você tem avós?",
-        "Tem namorada?"
-    ],
-    "Trabalho": [
-        "Você trabalha?",
-        "Qual sua profissão?",
-        "Onde você trabalha?",
-        "Você gosta do seu trabalho?",
-        "Quanto você ganha?",
-        "Você tem chefe?",
-        "Trabalha em equipe?",
-        "Que horas você trabalha?",
-        "Você trabalha aos sábados?",
-        "Está de férias?"
-    ],
-    "Rotina": [
-        "Que horas você acorda?",
-        "O que você come no café?",
-        "A que horas almoça?",
-        "Você dorme cedo?",
-        "Toma banho de manhã?",
-        "Como vai ao trabalho?",
-        "Você janta em casa?",
-        "Assiste TV à noite?",
-        "Faz exercícios?",
-        "Você estuda?"
-    ],
-    "Preferências": [
-        "O que você gosta de comer?",
-        "Qual sua cor favorita?",
-        "Você gosta de música?",
-        "Prefere frio ou calor?",
-        "Gosta de praia?",
-        "Você bebe café?",
-        "Gosta de doce?",
-        "Prefere cão ou gato?",
-        "Você fuma?",
-        "Gosta de viajar?"
-    ],
-    "Habilidades": [
-        "Você fala inglês?",
-        "Sabe cozinhar?",
-        "Você dirige?",
-        "Toca algum instrumento?",
-        "Sabe nadar?",
-        "Você dança?",
-        "Sabe cantar?",
-        "Fala outras línguas?",
-        "Você desenha?",
-        "Sabe programar?"
-    ],
-    "Situações": [
-        "Você está ocupado?",
-        "Pode me ajudar?",
-        "Você está bem?",
-        "Está com fome?",
-        "Tem tempo agora?",
-        "Você está cansado?",
-        "Está feliz hoje?",
-        "Você está doente?",
-        "Precisa de algo?",
-        "Está com pressa?"
-    ],
-    "Perguntas Gerais": [
-        "Que dia é hoje?",
-        "Que horas são?",
-        "Como está o tempo?",
-        "Vai chover?",
-        "Está frio?",
-        "Onde fica o banco?",
-        "Tem farmácia perto?",
-        "Quanto custa?",
-        "Aceita cartão?",
-        "Tem troco?"
-    ]
-}
-def evaluate_coherence(question, response):
-    """
-    Judge whether ``response`` is a coherent answer to ``question``.
-
-    Rule-based heuristic. The response must first pass three format checks
-    (non-empty, no leaked dialogue markers, terminal punctuation). The question
-    is then classified by keyword and the first matching rule decides the
-    verdict; unclassified questions only need a response of four or more words.
-
-    Args:
-        question: Question text; it is lower-cased before keyword matching.
-        response: Generated answer; ``None`` or an empty string is rejected.
-
-    Returns:
-        Tuple ``(is_coherent, reason)``: the boolean verdict and a short
-        Portuguese label naming the rule that produced it.
-    """
-    import re  # local import: not among the module-level imports
-
-    def has_term(terms, text):
-        # Whole-word/phrase match. Plain substring tests made "oi" fire inside
-        # "noite" and "depois", misclassifying questions and answers.
-        alternatives = "|".join(re.escape(term) for term in terms)
-        return re.search(rf"\b(?:{alternatives})\b", text) is not None
-
-    # Basic quality checks
-    if not response or len(response.strip()) < 3:
-        return False, "Resposta vazia ou muito curta"
-    # Must not leak dialogue markers or code fences
-    if any(x in response for x in ["Usuário:", "Assistente:", "user said", "```"]):
-        return False, "Contém vazamento de conversa"
-    # Must end with sentence punctuation
-    if not response.rstrip().endswith(('.', '!', '?')):
-        return False, "Sem pontuação final"
-    # Contextual analysis by question type; the first matching rule wins
-    q_lower = question.lower()
-    r_lower = response.lower()
-    # Greetings
-    if has_term(["olá", "oi", "bom dia", "boa tarde", "boa noite", "tchau", "até"], q_lower):
-        if has_term(["olá", "oi", "bom", "boa", "bem", "tchau", "até", "prazer", "também"], r_lower):
-            return True, "Cumprimento apropriado"
-        return False, "Não respondeu ao cumprimento"
-    # Identity questions
-    if "nome" in q_lower or "quem" in q_lower:
-        if len(response) > 10:  # elaborated answer
-            return True, "Identificação apropriada"
-        return False, "Resposta inadequada sobre identidade"
-    # Yes/no questions
-    if q_lower.startswith(("você", "tem", "está", "pode", "sabe", "gosta", "prefere", "fala")):
-        if len(response.split()) >= 3:  # more than a bare "sim" or "não"
-            return True, "Resposta elaborada"
-        return False, "Resposta muito simples para pergunta sim/não"
-    # Location questions
-    if any(x in q_lower for x in ["onde", "qual endereço", "que cidade"]):
-        if len(response) > 10:
-            return True, "Informação de localização"
-        return False, "Resposta inadequada sobre local"
-    # Time questions
-    if any(x in q_lower for x in ["que horas", "que dia", "quando"]):
-        if len(response) > 8:
-            return True, "Informação temporal"
-        return False, "Resposta inadequada sobre tempo"
-    # Quantity questions ("quanto" also covers "quantos"/"quanta")
-    if "quanto" in q_lower:
-        if any(char.isdigit() for char in response) or any(x in r_lower for x in ["não tenho", "nenhum", "vários", "alguns", "muitos"]):
-            return True, "Resposta quantitativa"
-        return False, "Não respondeu quantidade"
-    # Any other question: require substantial content
-    word_count = len(response.split())
-    if word_count >= 4:
-        return True, "Resposta com conteúdo adequado"
-    return False, "Resposta inadequada ou muito curta"
-# Benchmark driver (V1 configuration): loads the model with vLLM, asks every
-# question in QUESTIONS, scores each answer with evaluate_coherence, then
-# prints a report and saves it as JSON. Question counts are derived from
-# QUESTIONS rather than hard-coded, so percentages stay correct if the
-# question list is edited.
-try:
-    from vllm import LLM, SamplingParams
-    # Fail early (before the expensive model load) if there is nothing to ask;
-    # the averages below would otherwise divide by zero.
-    total_questions = sum(len(questions) for questions in QUESTIONS.values())
-    if total_questions == 0:
-        raise ValueError("QUESTIONS não contém nenhuma pergunta")
-    print("\n⏳ Carregando modelo com vLLM...")
-    start_load = time.perf_counter()
-    model = LLM(
-        model="/tmp/Qwen3-4B",
-        trust_remote_code=True,
-        dtype="float16",
-        gpu_memory_utilization=0.90,
-        max_model_len=1024,
-        disable_log_stats=True
-    )
-    print(f"✅ Modelo carregado em {time.perf_counter()-start_load:.1f}s")
-    # V1 configuration - the one that produced the best results.
-    # NOTE: the printed summary is hand-written; keep it in sync with the
-    # SamplingParams call below.
-    print("\n🔧 Configuração V1 (Melhor Performance):")
-    print("  • min_tokens: 7")
-    print("  • max_tokens: 20")
-    print("  • temperature: 0.0")
-    print("  • repetition_penalty: 1.2")
-    print("  • stop sequences: ['.', '!', '?', '\\n', 'Usuário:']")
-    sampling_params = SamplingParams(
-        min_tokens=7,
-        max_tokens=20,
-        temperature=0.0,
-        stop=[".", "!", "?", "\n", "Usuário:"],
-        # evaluate_coherence rejects answers without final punctuation, so the
-        # stop string has to stay in the output.
-        include_stop_str_in_output=True,
-        repetition_penalty=1.2
-    )
-    # Warm-up: throwaway generations, presumably so first-call overhead does
-    # not skew the latencies measured below.
-    print("\n🔥 Aquecimento...")
-    for _ in range(3):
-        model.generate(["teste"], SamplingParams(max_tokens=5, temperature=0))
-    # Run every question
-    print(f"\n📊 Testando {total_questions} perguntas...")
-    print("-" * 70)
-    all_results = []
-    category_results = {}
-    total_coherent = 0
-    total_latency = 0
-    question_number = 0
-    for category, questions in QUESTIONS.items():
-        if not questions:
-            continue  # empty category: nothing to ask, no average to compute
-        print(f"\n📚 {category}:")
-        cat_total = len(questions)
-        cat_coherent = 0
-        cat_latency = 0
-        for question in questions:
-            question_number += 1
-            # Generate the answer
-            prompt = f"{SYSTEM_PROMPT}\n\nUsuário: {question}\nAssistente:"
-            # perf_counter is monotonic, so clock adjustments cannot distort
-            # the measured latency.
-            start = time.perf_counter()
-            outputs = model.generate([prompt], sampling_params)
-            latency = (time.perf_counter() - start) * 1000  # milliseconds
-            response = outputs[0].outputs[0].text.strip() if outputs else ""
-            # Score coherence
-            is_coherent, reason = evaluate_coherence(question, response)
-            # Record the result
-            all_results.append({
-                "question": question,
-                "response": response,
-                "coherent": is_coherent,
-                "reason": reason,
-                "latency": latency
-            })
-            if is_coherent:
-                total_coherent += 1
-                cat_coherent += 1
-                symbol = "✅"
-            else:
-                symbol = "❌"
-            total_latency += latency
-            cat_latency += latency
-            # Show progress
-            print(f"  [{question_number:3d}/{total_questions}] {symbol} {latency:4.0f}ms | {question[:30]:30s}")
-        # Category statistics (multiply before dividing to avoid float drift)
-        cat_accuracy = cat_coherent * 100 / cat_total
-        cat_avg_latency = cat_latency / cat_total
-        category_results[category] = {
-            "coherent": cat_coherent,
-            "total": cat_total,
-            "accuracy": cat_accuracy,
-            "avg_latency": cat_avg_latency
-        }
-        print(f"  → Categoria: {cat_coherent}/{cat_total} ({cat_accuracy:.0f}%) - {cat_avg_latency:.0f}ms média")
-    # Final results
-    print("\n" + "=" * 70)
-    print("📊 RESULTADOS FINAIS - CONFIGURAÇÃO V1")
-    print("=" * 70)
-    accuracy = total_coherent * 100 / total_questions
-    avg_latency = total_latency / total_questions
-    print(f"\n✅ COERÊNCIA GERAL: {total_coherent}/{total_questions} ({accuracy:.1f}%)")
-    print(f"⏱️ LATÊNCIA MÉDIA: {avg_latency:.0f}ms")
-    print("\n📈 POR CATEGORIA:")
-    for cat, stats in category_results.items():
-        # One bar segment per full 10 percentage points of accuracy
-        bar = "█" * int(stats['accuracy'] / 10)
-        print(f"  {cat:20s}: {bar:10s} {stats['coherent']:2d}/{stats['total']} ({stats['accuracy']:.0f}%)")
-    # Failure analysis: count how often each rejection reason occurred
-    coherent = [r for r in all_results if r["coherent"]]
-    incoherent = [r for r in all_results if not r["coherent"]]
-    if incoherent:
-        print(f"\n❌ RESPOSTAS INCOERENTES ({len(incoherent)}):")
-        reasons = {}
-        for r in incoherent:
-            reasons[r["reason"]] = reasons.get(r["reason"], 0) + 1
-        for reason, count in sorted(reasons.items(), key=lambda x: x[1], reverse=True):
-            print(f"  • {reason}: {count} casos")
-    # Examples
-    print("\n📝 EXEMPLOS DE RESPOSTAS:")
-    # Show up to 5 good and 5 bad answers
-    coherent_examples = coherent[:5]
-    incoherent_examples = incoherent[:5]
-    if coherent_examples:
-        print("\n✅ RESPOSTAS COERENTES:")
-        for r in coherent_examples:
-            print(f"  P: {r['question']}")
-            print(f"  R: {r['response']}")
-            print()
-    if incoherent_examples:
-        print("\n❌ RESPOSTAS INCOERENTES:")
-        for r in incoherent_examples:
-            print(f"  P: {r['question']}")
-            print(f"  R: {r['response']}")
-            print(f"  Problema: {r['reason']}")
-            print()
-    # Conclusion
-    print("=" * 70)
-    print("🎯 CONCLUSÃO:")
-    if accuracy >= 85:
-        print(f"  ✅ EXCELENTE! {accuracy:.0f}% de coerência")
-    elif accuracy >= 80:
-        print(f"  ✅ BOM! {accuracy:.0f}% de coerência")
-    elif accuracy >= 70:
-        print(f"  ⚠️ ACEITÁVEL. {accuracy:.0f}% de coerência")
-    else:
-        print(f"  ❌ PRECISA MELHORAR. Apenas {accuracy:.0f}% de coerência")
-    if avg_latency < 400:
-        print(f"  ✅ Latência ÓTIMA: {avg_latency:.0f}ms")
-    elif avg_latency < 600:
-        print(f"  ✅ Latência BOA: {avg_latency:.0f}ms")
-    else:
-        print(f"  ⚠️ Latência ALTA: {avg_latency:.0f}ms")
-    # Previous runs are fixed reference figures; only the last line is live.
-    print("\n📊 HISTÓRICO DE TESTES:")
-    print("  V1 (original): 85% coerência, 376ms latência")
-    print("  V2 (restritivo): 84% coerência, 378ms latência")
-    print("  V3 (flexível): 72% coerência, 427ms latência")
-    print(f"  V1 (atual): {accuracy:.0f}% coerência, {avg_latency:.0f}ms latência")
-    # Save results. ensure_ascii=False writes accented characters verbatim, so
-    # the file encoding is pinned to UTF-8 instead of the locale default.
-    results_path = "/tmp/test_100_final_results.json"
-    with open(results_path, "w", encoding="utf-8") as f:
-        json.dump({
-            "version": "V1_FINAL",
-            "accuracy": accuracy,
-            "avg_latency": avg_latency,
-            "total_coherent": total_coherent,
-            "categories": category_results,
-            "all_results": all_results
-        }, f, indent=2, ensure_ascii=False)
-    print(f"\n💾 Resultados salvos em {results_path}")
-except Exception as e:
-    # Top-level boundary: report the failure with its traceback and exit non-zero.
-    print(f"❌ Erro: {e}")
-    import traceback
-    traceback.print_exc()
-    sys.exit(1)
-print("\n" + "=" * 70)