SamiKoen committed on
Commit
152b885
·
1 Parent(s): bc81313

Sadelestirme: OpenAI native interrupt davranisi (anlik kesilir), client-side barge-in mantigi tamamen kaldirildi. Sistem promptu: kisa cevap kurali

Browse files
Files changed (2) hide show
  1. app.py +7 -4
  2. static/index.html +7 -68
app.py CHANGED
@@ -59,12 +59,15 @@ def build_session_instructions() -> str:
59
  base = "Trek Bisiklet uzmani bir satis temsilcisisin."
60
 
61
  voice_addon = (
62
- "\n\nSESLI SOHBET KURALLARI:\n"
63
- "- Cevaplarin SES ile okunacagi icin KISA, DOGAL ve KONUSUR DILDE ol.\n"
 
 
64
  "- Markdown, * veya emoji KULLANMA.\n"
65
  "- HER ZAMAN 'siz' ile hitap et, soru ile bitirme.\n"
66
  "- Stok/fiyat sorulari geldiginde get_warehouse_stock fonksiyonunu cagir.\n"
67
- "- Fonksiyondan gelen sonucu DOGAL bir cumleyle ozetle, ham veriyi okuma.\n"
 
68
  )
69
  return apply_pronunciation_fixes(base + voice_addon)
70
 
@@ -145,7 +148,7 @@ async def realtime_relay(client_ws: WebSocket):
145
  "threshold": 0.5,
146
  "prefix_padding_ms": 300,
147
  "silence_duration_ms": 700,
148
- "interrupt_response": False, # Otomatik kesinti KAPALI bizim 2sn timer'imiz sorumlu
149
  "create_response": True,
150
  },
151
  "tools": TOOLS,
 
59
  base = "Trek Bisiklet uzmani bir satis temsilcisisin."
60
 
61
  voice_addon = (
62
+ "\n\nSESLI SOHBET KURALLARI (cok onemli):\n"
63
+ "- Cevaplarin TELEFON GORUSMESI gibi olsun: KISA, NET, ozet.\n"
64
+ "- En fazla 1-2 cumle. Liste yapma, detayli aciklama yapma.\n"
65
+ "- Musteri detay isterse o zaman uzat.\n"
66
  "- Markdown, * veya emoji KULLANMA.\n"
67
  "- HER ZAMAN 'siz' ile hitap et, soru ile bitirme.\n"
68
  "- Stok/fiyat sorulari geldiginde get_warehouse_stock fonksiyonunu cagir.\n"
69
+ "- Fonksiyon sonucunu DOGAL ve KISA bir cumleyle ozetle.\n"
70
+ "- Ornek: 'Caddebostan magazasinda mevcut, 250 bin lira.' (uzun aciklama yok).\n"
71
  )
72
  return apply_pronunciation_fixes(base + voice_addon)
73
 
 
148
  "threshold": 0.5,
149
  "prefix_padding_ms": 300,
150
  "silence_duration_ms": 700,
151
+ "interrupt_response": True, # Kullanici konusunca asistan ANINDA kesilir
152
  "create_response": True,
153
  },
154
  "tools": TOOLS,
static/index.html CHANGED
@@ -167,21 +167,7 @@ let analyserData = null;
167
  let freqData = null;
168
  let assistantSpeaking = false; // Asistan ses ciktiyor mu?
169
 
170
- // ----- Barge-in detection (cumulative) -----
171
- // "Son SPEAKING_WINDOW icinde toplam SPEAKING_REQUIRED konustuysan tetikle"
172
- // Boylece dogal duraklamalar timer'i sifirlamaz — toplam aktif konusma sayilir.
173
- const BARGE_IN_THRESHOLD = 0.012;
174
- const SPEAKING_WINDOW_MS = 4000; // 4sn pencere
175
- const SPEAKING_REQUIRED_MS = 2000; // toplam 2sn konusma
176
- const FRAME_DURATION_MS = 100; // worklet frame ~100ms
177
- const LEVEL_SMOOTH_WINDOW = 5;
178
- let bargeInTriggered = false;
179
- let openAiVadActive = false;
180
- let levelHistory = [];
181
- let speakingFrames = []; // konusma frame'lerinin timestamp listesi
182
- let lastBargeDebugLog = 0;
183
-
184
- // Aktif audio playback source'lari (cancel icin)
185
  let activeAudioSources = [];
186
 
187
  const $ = (id) => document.getElementById(id);
@@ -628,35 +614,6 @@ async function connect() {
628
  if (ws?.readyState !== WebSocket.OPEN) return;
629
  const b64 = arrayBufferToBase64(e.data.pcm);
630
  ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: b64 }));
631
-
632
- // Barge-in: cumulative — son 4sn icinde toplam 2sn konusma
633
- if (assistantSpeaking && !bargeInTriggered) {
634
- const now = Date.now();
635
- levelHistory.push(e.data.level);
636
- if (levelHistory.length > LEVEL_SMOOTH_WINDOW) levelHistory.shift();
637
- const avgLevel = levelHistory.reduce((a, b) => a + b, 0) / levelHistory.length;
638
-
639
- const speaking = avgLevel > BARGE_IN_THRESHOLD || openAiVadActive;
640
- if (speaking) speakingFrames.push(now);
641
- // Penceren disindaki frame'leri sil
642
- speakingFrames = speakingFrames.filter(t => now - t < SPEAKING_WINDOW_MS);
643
- const totalSpeakingMs = speakingFrames.length * FRAME_DURATION_MS;
644
-
645
- // Saniyelik debug
646
- if (now - lastBargeDebugLog > 1000) {
647
- lastBargeDebugLog = now;
648
- if (totalSpeakingMs > 100) {
649
- console.log(`[barge-in] toplam=${totalSpeakingMs}ms avgLevel=${avgLevel.toFixed(3)} vad=${openAiVadActive}`);
650
- }
651
- }
652
-
653
- if (totalSpeakingMs >= SPEAKING_REQUIRED_MS) {
654
- triggerBargeIn();
655
- }
656
- } else if (!assistantSpeaking) {
657
- levelHistory = [];
658
- speakingFrames = [];
659
- }
660
  };
661
  src.connect(workletNode);
662
 
@@ -682,26 +639,22 @@ function handleEvent(evt) {
682
  if (evt.delta) playPCM16(base64ToInt16(evt.delta));
683
  break;
684
  case 'input_audio_buffer.speech_started':
 
685
  if (assistantSpeaking) {
686
- openAiVadActive = true;
687
- console.log('[VAD] OpenAI server kullaniciyi duydu (asistan konusurken)');
688
- } else {
689
- setStatus('Sizi dinliyorum...', 'connected');
690
  }
 
691
  break;
692
  case 'input_audio_buffer.speech_stopped':
693
- openAiVadActive = false;
694
  if (!assistantSpeaking) setStatus('Dusunuyor...', 'connecting');
695
  break;
696
  case 'response.created':
697
  setStatus('Yanitliyor', 'connected');
698
  assistantSpeaking = true;
699
- bargeInTriggered = false;
700
- speakingFrames = [];
701
  break;
702
  case 'response.done':
703
  assistantSpeaking = false;
704
- speakingFrames = [];
705
  setStatus('Bagli — konusabilirsiniz', 'connected');
706
  if (evt.response?.status === 'failed')
707
  console.error('[error]', evt.response?.status_details);
@@ -715,8 +668,8 @@ function handleEvent(evt) {
715
 
716
  function playPCM16(i16) {
717
  if (!playbackCtx || !analyser) return;
718
- // Barge-in olduysa yeni audio chunk gelirse ignore et
719
- if (bargeInTriggered) return;
720
 
721
  const f32 = new Float32Array(i16.length);
722
  for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 0x8000;
@@ -745,26 +698,12 @@ function stopAllAudio() {
745
  if (playbackCtx) playbackTime = playbackCtx.currentTime;
746
  }
747
 
748
- function triggerBargeIn() {
749
- console.log('[barge-in] toplam 2sn+ konusuldu — asistan susturuluyor');
750
- if (ws?.readyState === WebSocket.OPEN) {
751
- ws.send(JSON.stringify({ type: 'response.cancel' }));
752
- }
753
- stopAllAudio();
754
- bargeInTriggered = true;
755
- speakingFrames = [];
756
- assistantSpeaking = false;
757
- openAiVadActive = false;
758
- setStatus('Sizi dinliyorum...', 'connected');
759
- }
760
 
761
  function disconnect() {
762
  setStatus('Baglanti kesildi', 'disconnected');
763
  $('btnConnect').disabled = false;
764
  $('btnDisconnect').disabled = true;
765
  assistantSpeaking = false;
766
- bargeInTriggered = false;
767
- speakingFrames = [];
768
  if (workletNode) { try { workletNode.disconnect(); } catch {} }
769
  if (audioCtx) { try { audioCtx.close(); } catch {} }
770
  if (playbackCtx) { try { playbackCtx.close(); } catch {} }
 
167
  let freqData = null;
168
  let assistantSpeaking = false; // Asistan ses ciktiyor mu?
169
 
170
+ // Aktif audio playback source'lari (kullanici konusmaya baslayinca temizlenir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  let activeAudioSources = [];
172
 
173
  const $ = (id) => document.getElementById(id);
 
614
  if (ws?.readyState !== WebSocket.OPEN) return;
615
  const b64 = arrayBufferToBase64(e.data.pcm);
616
  ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: b64 }));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617
  };
618
  src.connect(workletNode);
619
 
 
639
  if (evt.delta) playPCM16(base64ToInt16(evt.delta));
640
  break;
641
  case 'input_audio_buffer.speech_started':
642
+ // Kullanici konusmaya basladi — eger asistan konusuyorsa pending PCM'leri at
643
  if (assistantSpeaking) {
644
+ stopAllAudio();
645
+ assistantSpeaking = false;
 
 
646
  }
647
+ setStatus('Sizi dinliyorum...', 'connected');
648
  break;
649
  case 'input_audio_buffer.speech_stopped':
 
650
  if (!assistantSpeaking) setStatus('Dusunuyor...', 'connecting');
651
  break;
652
  case 'response.created':
653
  setStatus('Yanitliyor', 'connected');
654
  assistantSpeaking = true;
 
 
655
  break;
656
  case 'response.done':
657
  assistantSpeaking = false;
 
658
  setStatus('Bagli — konusabilirsiniz', 'connected');
659
  if (evt.response?.status === 'failed')
660
  console.error('[error]', evt.response?.status_details);
 
668
 
669
  function playPCM16(i16) {
670
  if (!playbackCtx || !analyser) return;
671
+ // Asistan susturulduysa yeni chunk'lari oynatma
672
+ if (!assistantSpeaking) return;
673
 
674
  const f32 = new Float32Array(i16.length);
675
  for (let i = 0; i < i16.length; i++) f32[i] = i16[i] / 0x8000;
 
698
  if (playbackCtx) playbackTime = playbackCtx.currentTime;
699
  }
700
 
 
 
 
 
 
 
 
 
 
 
 
 
701
 
702
  function disconnect() {
703
  setStatus('Baglanti kesildi', 'disconnected');
704
  $('btnConnect').disabled = false;
705
  $('btnDisconnect').disabled = true;
706
  assistantSpeaking = false;
 
 
707
  if (workletNode) { try { workletNode.disconnect(); } catch {} }
708
  if (audioCtx) { try { audioCtx.close(); } catch {} }
709
  if (playbackCtx) { try { playbackCtx.close(); } catch {} }