psychxD committed on
Commit
db4323e
·
verified ·
1 Parent(s): 3ca9f65

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +298 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Analysis API for Salesforce
3
+ ==================================
4
+ Endpoints:
5
+ /analyze - Full analysis (diarization + overlap + voice metrics)
6
+
7
+ Returns JSON that Salesforce can parse.
8
+
9
+ Models used:
10
+ - pyannote/speaker-diarization-3.1 (who spoke when)
11
+ - pyannote/overlapped-speech-detection (coaching detection)
12
+ """
13
+
14
+ import gradio as gr
15
+ import os
16
+ import json
17
+ import torch
18
+ from pyannote.audio import Pipeline
19
+ from pyannote.audio.pipelines import OverlappedSpeechDetection
20
+ import scipy.io.wavfile as wavfile
21
+ import numpy as np
22
+
23
+ # ============================================================
24
+ # CONFIGURATION
25
+ # ============================================================
26
+
27
# Hugging Face access token, read from the environment. It is passed to
# both pipeline downloads below.
HF_TOKEN = os.environ.get("HF_TOKEN")

if not HF_TOKEN:
    print("WARNING: HF_TOKEN not set. Gated models will fail.")

# ============================================================
# LOAD MODELS (runs once at startup)
# ============================================================

# Each pipeline is loaded independently. On failure the corresponding global
# is set to None so the analysis functions can report "model not loaded"
# instead of crashing at import time.

print("Loading diarization model...")
try:
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HF_TOKEN,
    )
    # NOTE(review): pyannote.audio 3.x reports a failed download of a gated
    # pipeline by printing a hint and returning None rather than raising.
    # Treat that as a load failure so the success message is never printed
    # for a missing model.
    if diarization_pipeline is None:
        raise RuntimeError(
            "Pipeline.from_pretrained returned None "
            "(check HF_TOKEN and that the model's user conditions are accepted)"
        )
    print("✅ Diarization model loaded")
except Exception as e:
    print(f"❌ Diarization model failed: {e}")
    diarization_pipeline = None

print("Loading overlap detection model...")
try:
    overlap_pipeline = Pipeline.from_pretrained(
        "pyannote/overlapped-speech-detection",
        use_auth_token=HF_TOKEN,
    )
    # Same None-on-failure handling as for the diarization pipeline above.
    if overlap_pipeline is None:
        raise RuntimeError(
            "Pipeline.from_pretrained returned None "
            "(check HF_TOKEN and that the model's user conditions are accepted)"
        )
    print("✅ Overlap detection model loaded")
except Exception as e:
    print(f"❌ Overlap detection failed: {e}")
    overlap_pipeline = None
57
+
58
+
59
+ # ============================================================
60
+ # ANALYSIS FUNCTIONS
61
+ # ============================================================
62
+
63
def analyze_diarization(audio_path):
    """
    Run speaker diarization and label the agent and the borrower.

    Args:
        audio_path: Path of the audio file passed to ``diarization_pipeline``.

    Returns:
        dict: On success, a dict with
            - ``segments``: one dict per speaker turn with ``speaker``,
              ``start``, ``end`` and ``duration`` (rounded to 2 decimals),
            - ``speaker_count``: number of distinct speaker labels,
            - ``agent_speaker``: label of the first segment's speaker
              (``None`` when there are no segments),
            - ``borrower_speaker``: the first label, in order of appearance,
              that differs from the agent (``None`` if there is none),
            - ``total_segments``: number of segments.
        If the pipeline is not loaded, or anything raises while running it,
        ``{"error": <message>}`` is returned instead.
    """
    if diarization_pipeline is None:
        return {"error": "Diarization model not loaded"}

    try:
        diarization = diarization_pipeline(audio_path)

        segments = [
            {
                "speaker": speaker,
                "start": round(turn.start, 2),
                "end": round(turn.end, 2),
                "duration": round(turn.end - turn.start, 2),
            }
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]

        # Distinct labels in order of first appearance. A plain set() has no
        # defined order for strings, which made the borrower choice arbitrary
        # whenever more than two speakers were detected.
        speakers = list(dict.fromkeys(s["speaker"] for s in segments))

        # Heuristic: the agent is assumed to speak first; the next new voice
        # is taken to be the borrower.
        agent_speaker = speakers[0] if speakers else None
        borrower_speaker = speakers[1] if len(speakers) > 1 else None

        return {
            "segments": segments,
            "speaker_count": len(speakers),
            "agent_speaker": agent_speaker,
            "borrower_speaker": borrower_speaker,
            "total_segments": len(segments),
        }

    except Exception as e:
        return {"error": str(e)}
102
+
103
+
104
def analyze_overlap(audio_path):
    """
    Find the regions of the recording where speech overlaps.

    Args:
        audio_path: Path of the audio file passed to ``overlap_pipeline``.

    Returns:
        dict: ``overlap_segments`` (one dict per region with ``start``,
        ``end`` and ``duration``, rounded to 2 decimals), ``overlap_count``
        and ``total_overlap_duration`` (sum of the rounded durations,
        rounded to 2 decimals). If the pipeline is not loaded, or anything
        raises while running it, ``{"error": <message>}`` is returned.
    """
    if overlap_pipeline is None:
        return {"error": "Overlap detection model not loaded"}

    try:
        detected = overlap_pipeline(audio_path)

        regions = [
            {
                "start": round(region.start, 2),
                "end": round(region.end, 2),
                "duration": round(region.end - region.start, 2),
            }
            for region, _track, _label in detected.itertracks(yield_label=True)
        ]

        # Total is built from the already-rounded per-region durations.
        overlap_total = sum(item["duration"] for item in regions)

        return {
            "overlap_segments": regions,
            "overlap_count": len(regions),
            "total_overlap_duration": round(overlap_total, 2),
        }

    except Exception as err:
        return {"error": str(err)}
133
+
134
+
135
def detect_coaching(diarization_result, overlap_result):
    """
    Cross-reference overlapped speech with the borrower's speaking turns.

    Any overlap region that intersects a borrower segment is reported as a
    potential coaching instance.

    Args:
        diarization_result: Dict as returned by ``analyze_diarization``
            (reads ``borrower_speaker`` and ``segments``; may instead carry
            an ``error`` key).
        overlap_result: Dict as returned by ``analyze_overlap`` (reads
            ``overlap_segments``; may instead carry an ``error`` key).

    Returns:
        dict: Always contains ``coaching_detected``. When either input has
        an ``error`` key, also ``error``; when no borrower was identified,
        also ``reason``. Otherwise ``coaching_instances``,
        ``coaching_flags`` (one entry per overlap/borrower-segment pair that
        intersects) and ``borrower_segments_analyzed``.
    """
    if "error" in diarization_result or "error" in overlap_result:
        return {
            "coaching_detected": False,
            "error": "Could not analyze - model error",
        }

    borrower_speaker = diarization_result.get("borrower_speaker")
    if not borrower_speaker:
        return {
            "coaching_detected": False,
            "reason": "Could not identify borrower",
        }

    borrower_segments = [
        s for s in diarization_result.get("segments", [])
        if s["speaker"] == borrower_speaker
    ]
    overlap_segments = overlap_result.get("overlap_segments", [])

    coaching_flags = []
    for overlap in overlap_segments:
        for borrower_seg in borrower_segments:
            # Inclusive interval intersection. Testing only whether the
            # overlap *starts* inside the borrower segment misses an overlap
            # that begins just before the borrower's turn and runs into it.
            intersects = (
                overlap["start"] <= borrower_seg["end"]
                and overlap["end"] >= borrower_seg["start"]
            )
            if intersects:
                coaching_flags.append({
                    "overlap_time": f"{overlap['start']}-{overlap['end']}",
                    "during_borrower_segment": f"{borrower_seg['start']}-{borrower_seg['end']}",
                    "duration": overlap["duration"],
                })

    return {
        "coaching_detected": len(coaching_flags) > 0,
        "coaching_instances": len(coaching_flags),
        "coaching_flags": coaching_flags,
        "borrower_segments_analyzed": len(borrower_segments),
    }
183
+
184
+
185
def analyze_voice_metrics(audio_path):
    """
    Compute basic duration and silence metrics for a WAV file.

    The file is read with ``scipy.io.wavfile.read``; multi-channel audio is
    averaged down to one channel. A sample counts as silent when its
    absolute amplitude is below 10% of the mean absolute amplitude.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        dict: ``duration_seconds`` (float, 2 decimals), ``silence_ratio``
        (float, 3 decimals) and ``has_long_pauses`` (bool, true when the
        silence ratio exceeds 0.3). All values are built-in Python types so
        the result can be passed to ``json.dumps``. If the file has no
        samples or reading/processing raises, ``{"error": <message>}`` is
        returned instead.
    """
    try:
        sample_rate, audio_data = wavfile.read(audio_path)

        # Convert to mono if the file has more than one channel.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        sample_count = len(audio_data)
        if sample_count == 0:
            # Avoids a 0/0 silence ratio (NaN) for an empty recording.
            return {"error": "Audio file contains no samples"}

        duration = sample_count / sample_rate

        # Convert to float *before* taking the magnitude: abs() on int16
        # data leaves -32768 negative, which would count as silence.
        energy = np.abs(audio_data.astype(float))
        threshold = np.mean(energy) * 0.1
        silence_samples = int(np.sum(energy < threshold))
        silence_ratio = silence_samples / sample_count

        # float()/bool() keep NumPy scalar types (notably numpy.bool_, which
        # json.dumps rejects) out of the returned dict.
        return {
            "duration_seconds": round(float(duration), 2),
            "silence_ratio": round(float(silence_ratio), 3),
            "has_long_pauses": bool(silence_ratio > 0.3),
        }

    except Exception as e:
        return {"error": str(e)}
215
+
216
+
217
+ # ============================================================
218
+ # MAIN ANALYSIS FUNCTION
219
+ # ============================================================
220
+
221
def _json_default(value):
    """Convert NumPy scalars to built-in Python values for ``json.dumps``."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def full_analysis(audio_file):
    """
    Run every analysis on one audio file and return the results as JSON.

    Called by the Gradio interface. Runs, in order: diarization, overlap
    detection, coaching detection (from the first two results) and voice
    metrics, then adds a ``summary`` section.

    Args:
        audio_file: Path of the uploaded audio file, or ``None`` when
            nothing was uploaded.

    Returns:
        str: An indented JSON document. For ``None`` input it is
        ``{"error": "No audio file provided"}``. Otherwise it has
        ``status`` ("success" or "error"), ``analysis`` (per-step results),
        ``summary`` when all steps ran, and ``error`` when an exception was
        raised. If the results cannot be serialized, a JSON document with
        ``status`` "error" and an ``error`` message is returned instead of
        raising.
    """
    if audio_file is None:
        return json.dumps({"error": "No audio file provided"}, indent=2)

    results = {
        "status": "success",
        "analysis": {},
    }

    try:
        print(f"Analyzing: {audio_file}")

        # 1. Diarization
        print("Running diarization...")
        diarization_result = analyze_diarization(audio_file)
        results["analysis"]["diarization"] = diarization_result

        # 2. Overlap detection
        print("Running overlap detection...")
        overlap_result = analyze_overlap(audio_file)
        results["analysis"]["overlap"] = overlap_result

        # 3. Coaching detection (cross-reference of steps 1 and 2)
        print("Analyzing coaching...")
        coaching_result = detect_coaching(diarization_result, overlap_result)
        results["analysis"]["coaching"] = coaching_result

        # 4. Voice metrics
        print("Analyzing voice metrics...")
        voice_result = analyze_voice_metrics(audio_file)
        results["analysis"]["voice_metrics"] = voice_result

        # 5. Summary; the defaults cover steps that returned an error dict.
        results["summary"] = {
            "speaker_count": diarization_result.get("speaker_count", 0),
            "coaching_detected": coaching_result.get("coaching_detected", False),
            "coaching_instances": coaching_result.get("coaching_instances", 0),
            "has_long_pauses": voice_result.get("has_long_pauses", False),
            "total_overlap_duration": overlap_result.get("total_overlap_duration", 0),
        }

        print("Analysis complete!")

    except Exception as e:
        results["status"] = "error"
        results["error"] = str(e)

    # Serialization used to sit outside any error handling, so a single
    # non-JSON value (e.g. a NumPy scalar in a metrics dict) escaped as an
    # unhandled exception. NumPy scalars are now converted, and any other
    # failure is reported in the same JSON error shape.
    try:
        return json.dumps(results, indent=2, default=_json_default)
    except (TypeError, ValueError) as e:
        return json.dumps(
            {"status": "error", "error": f"Could not serialize results: {e}"},
            indent=2,
        )
274
+
275
+
276
+ # ============================================================
277
+ # GRADIO INTERFACE
278
+ # ============================================================
279
+
280
# Single-function Gradio app: one audio upload in, the JSON produced by
# full_analysis out.
# NOTE(review): the module docstring advertises an "/analyze" endpoint, but
# no api_name is passed here — confirm which endpoint name clients should call.
# NOTE(review): leading whitespace inside the description string could not be
# recovered from the diff view this file was taken from — confirm against the
# original.
demo = gr.Interface(
    fn=full_analysis,
    inputs=gr.Audio(
        type="filepath",
        label="Upload Audio (MP3, WAV, M4A)",
    ),
    outputs=gr.JSON(label="Analysis Results"),
    title="🎙️ Voice Analysis API for Salesforce",
    description="""
Upload a call recording to analyze:
- **Speaker Diarization**: Who spoke when
- **Coaching Detection**: Overlapping speech during borrower's responses
- **Voice Metrics**: Pause detection, silence ratio

Returns JSON that Salesforce can parse via Apex callout.
""",
    examples=[],
    allow_flagging="never",
)

# Launch with API enabled
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ pyannote.audio
4
+ gradio>=4.0.0
5
+ pydub
6
+ scipy