Ratnesh-dev committed on
Commit
035bf47
·
1 Parent(s): 4ea7fc4

Remove Unused Code From OpenAI Pipeline Stage

Browse files
Files changed (3) hide show
  1. README.md +0 -4
  2. app.py +1 -9
  3. src/openai_cleanup_service.py +0 -396
README.md CHANGED
@@ -29,8 +29,6 @@ Model setup is global/outside `@spaces.GPU` so setup time is not billed to ZeroG
29
  ## `/run_complete_pipeline` inputs
30
  - `audio_file` (file path from Gradio client upload)
31
  - `huggingface_token`
32
- - `openai_api_key` (accepted for compatibility, unused in Space)
33
- - `executive_names_csv` (accepted for compatibility, unused in Space)
34
 
35
  Returns: merged transcript JSON only.
36
 
@@ -56,8 +54,6 @@ client = Client(SPACE)
56
  merged_transcript = client.predict(
57
  audio_file=handle_file(AUDIO_FILE),
58
  huggingface_token="hf_xxx",
59
- openai_api_key="", # unused
60
- executive_names_csv="", # unused
61
  api_name="/run_complete_pipeline",
62
  )
63
 
 
29
  ## `/run_complete_pipeline` inputs
30
  - `audio_file` (file path from Gradio client upload)
31
  - `huggingface_token`
 
 
32
 
33
  Returns: merged transcript JSON only.
34
 
 
54
  merged_transcript = client.predict(
55
  audio_file=handle_file(AUDIO_FILE),
56
  huggingface_token="hf_xxx",
 
 
57
  api_name="/run_complete_pipeline",
58
  )
59
 
app.py CHANGED
@@ -109,12 +109,7 @@ def _gpu_infer_pyannote_chunk(audio_file: str, model_options: dict[str, Any]):
109
  def run_complete_pipeline(
110
  audio_file: str,
111
  huggingface_token: str,
112
- openai_api_key: str,
113
- executive_names_csv: str,
114
  ):
115
- # Kept in signature for compatibility with existing clients; not used on Space.
116
- _ = openai_api_key
117
- _ = executive_names_csv
118
  _parse_main_request(audio_file, huggingface_token)
119
  _raise_preload_error_if_any(PARAKEET_V3)
120
 
@@ -213,15 +208,12 @@ with gr.Blocks(title="Parakeet + Pyannote Pipeline") as demo:
213
  label="HuggingFace token",
214
  type="password",
215
  )
216
- openai_api_key = gr.Textbox(label="OpenAI API key (unused in Space)", type="password")
217
- executive_names_csv = gr.Textbox(label="Executive names / terms (unused in Space)")
218
-
219
  run_btn = gr.Button("Run full pipeline")
220
  output = gr.JSON(label="Combined transcript JSON")
221
 
222
  run_btn.click(
223
  fn=run_complete_pipeline,
224
- inputs=[audio_file, huggingface_token, openai_api_key, executive_names_csv],
225
  outputs=output,
226
  api_name="run_complete_pipeline",
227
  )
 
109
  def run_complete_pipeline(
110
  audio_file: str,
111
  huggingface_token: str,
 
 
112
  ):
 
 
 
113
  _parse_main_request(audio_file, huggingface_token)
114
  _raise_preload_error_if_any(PARAKEET_V3)
115
 
 
208
  label="HuggingFace token",
209
  type="password",
210
  )
 
 
 
211
  run_btn = gr.Button("Run full pipeline")
212
  output = gr.JSON(label="Combined transcript JSON")
213
 
214
  run_btn.click(
215
  fn=run_complete_pipeline,
216
+ inputs=[audio_file, huggingface_token],
217
  outputs=output,
218
  api_name="run_complete_pipeline",
219
  )
src/openai_cleanup_service.py DELETED
@@ -1,396 +0,0 @@
1
- import json
2
- from typing import Any
3
-
4
-
5
- def _dumps_compact(payload: Any) -> str:
6
- return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
7
-
8
-
9
- def _response_to_dict(response: Any) -> dict[str, Any]:
10
- if hasattr(response, "model_dump") and callable(response.model_dump):
11
- return response.model_dump()
12
- if hasattr(response, "to_dict") and callable(response.to_dict):
13
- return response.to_dict()
14
- return {"raw_response": str(response)}
15
-
16
-
17
- def _response_text(response: Any) -> str:
18
- output_text = getattr(response, "output_text", None)
19
- if isinstance(output_text, str) and output_text.strip():
20
- return output_text
21
-
22
- data = _response_to_dict(response)
23
- if isinstance(data, dict):
24
- for key in ("output_text", "text"):
25
- val = data.get(key)
26
- if isinstance(val, str) and val.strip():
27
- return val
28
- return ""
29
-
30
-
31
- def _extract_json_object(text: str) -> dict[str, Any]:
32
- text = text.strip()
33
- if not text:
34
- raise ValueError("Model returned empty text.")
35
-
36
- try:
37
- parsed = json.loads(text)
38
- if isinstance(parsed, dict):
39
- return parsed
40
- except Exception:
41
- pass
42
-
43
- start = text.find("{")
44
- while start >= 0:
45
- depth = 0
46
- for idx in range(start, len(text)):
47
- ch = text[idx]
48
- if ch == "{":
49
- depth += 1
50
- elif ch == "}":
51
- depth -= 1
52
- if depth == 0:
53
- candidate = text[start : idx + 1]
54
- try:
55
- parsed = json.loads(candidate)
56
- if isinstance(parsed, dict):
57
- return parsed
58
- except Exception:
59
- break
60
- start = text.find("{", start + 1)
61
- raise ValueError("Could not parse a JSON object from model output.")
62
-
63
-
64
- def _usage_from_response_dict(payload: dict[str, Any]) -> dict[str, int | None]:
65
- usage = payload.get("usage")
66
- if not isinstance(usage, dict):
67
- return {
68
- "input_tokens": None,
69
- "output_tokens": None,
70
- "total_tokens": None,
71
- "cached_input_tokens": None,
72
- "reasoning_tokens": None,
73
- }
74
-
75
- input_details = usage.get("input_tokens_details", {})
76
- output_details = usage.get("output_tokens_details", {})
77
- return {
78
- "input_tokens": usage.get("input_tokens"),
79
- "output_tokens": usage.get("output_tokens"),
80
- "total_tokens": usage.get("total_tokens"),
81
- "cached_input_tokens": input_details.get("cached_tokens") if isinstance(input_details, dict) else None,
82
- "reasoning_tokens": output_details.get("reasoning_tokens") if isinstance(output_details, dict) else None,
83
- }
84
-
85
-
86
- def _sum_usage(
87
- first: dict[str, int | None],
88
- second: dict[str, int | None],
89
- ) -> dict[str, int | None]:
90
- def _sum_key(key: str) -> int | None:
91
- a = first.get(key)
92
- b = second.get(key)
93
- if isinstance(a, int) and isinstance(b, int):
94
- return a + b
95
- if isinstance(a, int):
96
- return a
97
- if isinstance(b, int):
98
- return b
99
- return None
100
-
101
- total = _sum_key("total_tokens")
102
- input_tokens = _sum_key("input_tokens")
103
- output_tokens = _sum_key("output_tokens")
104
- if total is None and isinstance(input_tokens, int) and isinstance(output_tokens, int):
105
- total = input_tokens + output_tokens
106
-
107
- return {
108
- "input_tokens": input_tokens,
109
- "output_tokens": output_tokens,
110
- "total_tokens": total,
111
- "cached_input_tokens": _sum_key("cached_input_tokens"),
112
- "reasoning_tokens": _sum_key("reasoning_tokens"),
113
- }
114
-
115
-
116
- def _parse_executive_names(names_csv: str | None) -> list[str]:
117
- out: list[str] = []
118
- if names_csv:
119
- for item in names_csv.split(","):
120
- name = item.strip().strip('"').strip("'")
121
- if name:
122
- out.append(name)
123
- seen = set()
124
- deduped: list[str] = []
125
- for name in out:
126
- k = name.lower()
127
- if k in seen:
128
- continue
129
- seen.add(k)
130
- deduped.append(name)
131
- return deduped
132
-
133
-
134
- def _build_chunk_plan(
135
- turns: list[dict[str, Any]],
136
- max_turns_per_chunk: int,
137
- max_chars_per_chunk: int,
138
- ) -> list[dict[str, int]]:
139
- if max_turns_per_chunk <= 0:
140
- max_turns_per_chunk = 1
141
- if max_chars_per_chunk <= 0:
142
- max_chars_per_chunk = 12000
143
-
144
- plan: list[dict[str, int]] = []
145
- n = len(turns)
146
- start = 0
147
- while start < n:
148
- end = start
149
- turns_count = 0
150
- chars_count = 0
151
- while end < n:
152
- t = turns[end]
153
- text_len = len(str(t.get("text", "")))
154
- est = text_len + 60
155
- if turns_count > 0 and (turns_count >= max_turns_per_chunk or chars_count + est > max_chars_per_chunk):
156
- break
157
- turns_count += 1
158
- chars_count += est
159
- end += 1
160
- if end == start:
161
- end = min(n, start + 1)
162
- plan.append({"start": start, "end": end})
163
- start = end
164
- return plan
165
-
166
-
167
- def _normalize_final_label(final_label: str, source_label: str) -> str:
168
- label = str(final_label or "").strip()
169
- if not label:
170
- return source_label
171
- if "|" in label:
172
- left = label.split("|", 1)[0].strip()
173
- if left:
174
- label = left
175
- suffix = f"({source_label})"
176
- if label.endswith(suffix):
177
- label = label[: -len(suffix)].strip()
178
- if not label:
179
- return source_label
180
- return label
181
-
182
-
183
- def _extract_map_updates(parsed: dict[str, Any]) -> list[dict[str, str]]:
184
- candidates = parsed.get("speaker_label_map_updates")
185
- if not isinstance(candidates, list):
186
- candidates = parsed.get("speaker_mapping_final")
187
- if not isinstance(candidates, list):
188
- return []
189
-
190
- updates: list[dict[str, str]] = []
191
- for item in candidates:
192
- if not isinstance(item, dict):
193
- continue
194
- source = str(item.get("source_label") or item.get("speaker_label") or "").strip()
195
- final = str(item.get("final_label") or item.get("inferred_name") or "").strip()
196
- if not source:
197
- continue
198
- updates.append({"source_label": source, "final_label": final})
199
- return updates
200
-
201
-
202
- def _coerce_turns(
203
- source_turns: list[dict[str, Any]],
204
- parsed_turns: Any,
205
- speaker_label_map: dict[str, str],
206
- ) -> list[dict[str, Any]]:
207
- out: list[dict[str, Any]] = []
208
- parsed_list = parsed_turns if isinstance(parsed_turns, list) else []
209
-
210
- for idx, source in enumerate(source_turns):
211
- source_speaker = str(source.get("speaker", "SPEAKER_XX"))
212
- mapped_default = speaker_label_map.get(source_speaker, source_speaker)
213
-
214
- parsed_item = parsed_list[idx] if idx < len(parsed_list) and isinstance(parsed_list[idx], dict) else {}
215
- candidate_speaker = _normalize_final_label(str(parsed_item.get("speaker", "")), source_speaker)
216
- final_speaker = candidate_speaker or mapped_default
217
- if final_speaker == source_speaker:
218
- final_speaker = mapped_default
219
-
220
- text = str(parsed_item.get("text", "")).strip() or str(source.get("text", "")).strip()
221
- start = parsed_item.get("start", source.get("start"))
222
- end = parsed_item.get("end", source.get("end"))
223
-
224
- out.append(
225
- {
226
- "speaker": final_speaker,
227
- "start": start,
228
- "end": end,
229
- "text": text,
230
- }
231
- )
232
- return out
233
-
234
-
235
- def run_openai_cleanup_pipeline(
236
- merged_transcript: dict[str, Any],
237
- openai_api_key: str,
238
- executive_names_csv: str | None,
239
- *,
240
- cleanup_model: str = "gpt-5",
241
- timeout_seconds: float = 600.0,
242
- max_turns_per_chunk: int = 80,
243
- max_chars_per_chunk: int = 22000,
244
- ) -> dict[str, Any]:
245
- """
246
- Single-pass per chunk: each OpenAI call does both speaker naming and transcript cleanup.
247
- Avoids a separate full-document speaker inference pass for long audio reliability.
248
- """
249
- try:
250
- from openai import OpenAI
251
- except ImportError as exc:
252
- raise RuntimeError("Missing dependency: openai. Install with `pip install openai`.") from exc
253
-
254
- turns = merged_transcript.get("turns")
255
- if not isinstance(turns, list) or not turns:
256
- raise ValueError("Merged transcript must contain a non-empty `turns` list.")
257
-
258
- executive_names = _parse_executive_names(executive_names_csv)
259
- chunk_plan = _build_chunk_plan(
260
- turns=turns,
261
- max_turns_per_chunk=max_turns_per_chunk,
262
- max_chars_per_chunk=max_chars_per_chunk,
263
- )
264
-
265
- client = OpenAI(api_key=openai_api_key, timeout=timeout_seconds, max_retries=0)
266
-
267
- # Global mapping across chunks.
268
- speaker_label_map: dict[str, str] = {}
269
- for turn in turns:
270
- source = str(turn.get("speaker", "")).strip()
271
- if source:
272
- speaker_label_map.setdefault(source, source)
273
-
274
- combined_usage = {
275
- "input_tokens": 0,
276
- "output_tokens": 0,
277
- "total_tokens": 0,
278
- "cached_input_tokens": 0,
279
- "reasoning_tokens": 0,
280
- }
281
- per_chunk_usage: list[dict[str, Any]] = []
282
- cleaned_turns: list[dict[str, Any]] = []
283
- chunk_notes: list[str] = []
284
- chunk_raw_responses: list[dict[str, Any]] = []
285
-
286
- for i, chunk in enumerate(chunk_plan):
287
- start = chunk["start"]
288
- end = chunk["end"]
289
- source_chunk_turns = turns[start:end]
290
-
291
- payload = {
292
- "task": "For this chunk only: infer speaker names and clean transcript text in one pass.",
293
- "rules": [
294
- "Keep turn order and count exactly the same as input chunk.",
295
- "Keep start/end timestamps aligned to input turns.",
296
- "Correct misspellings and punctuation/casing.",
297
- "Only remove filler words (uh, um, you know, like) and clear false-start words/phrases.",
298
- "Do not aggressively summarize, compress, or paraphrase full sentences.",
299
- "Preserve substantive wording and as much original content as possible.",
300
- "If uncertain whether text is filler, keep it.",
301
- "Infer speaker names from this chunk context only; do not guess beyond evidence.",
302
- "If first name matches in `executive_names` but last name is uncertain, first name alone is allowed.",
303
- "If speaker is call-control voice, label as Operator.",
304
- "If speaker name is unknown, keep generic label SPEAKER_XX.",
305
- "Never output combined labels like Name|SPEAKER_XX.",
306
- "Use `existing_speaker_label_map` as source of truth for labels already resolved in prior chunks.",
307
- ],
308
- "output_schema": {
309
- "speaker_label_map_updates": [
310
- {"source_label": "SPEAKER_XX", "final_label": "Name or SPEAKER_XX", "reason": "short"}
311
- ],
312
- "turns": [
313
- {
314
- "source_speaker": "SPEAKER_XX",
315
- "speaker": "Name or SPEAKER_XX",
316
- "start": "float",
317
- "end": "float",
318
- "text": "cleaned text",
319
- }
320
- ],
321
- "notes": ["string"],
322
- },
323
- "executive_names": executive_names,
324
- "existing_speaker_label_map": speaker_label_map,
325
- "chunk_index": i,
326
- "chunk_start_turn_index": start,
327
- "chunk_turns": source_chunk_turns,
328
- }
329
-
330
- response = client.responses.create(
331
- model=cleanup_model,
332
- input=[
333
- {
334
- "role": "system",
335
- "content": "You are a transcript cleanup and speaker-label assistant. Return strict JSON only.",
336
- },
337
- {"role": "user", "content": _dumps_compact(payload)},
338
- ],
339
- )
340
-
341
- raw = _response_to_dict(response)
342
- parsed = _extract_json_object(_response_text(response))
343
- usage = _usage_from_response_dict(raw)
344
- for k in combined_usage:
345
- combined_usage[k] += int(usage.get(k) or 0)
346
- per_chunk_usage.append({"chunk_index": i, "usage": usage, "turn_range": [start, end]})
347
- chunk_raw_responses.append({"chunk_index": i, "raw_response": raw})
348
-
349
- for upd in _extract_map_updates(parsed):
350
- source_label = upd["source_label"]
351
- final_label = _normalize_final_label(upd["final_label"], source_label)
352
- speaker_label_map[source_label] = final_label
353
-
354
- notes = parsed.get("notes", [])
355
- if isinstance(notes, list):
356
- chunk_notes.extend([str(n) for n in notes if str(n).strip()])
357
-
358
- cleaned_chunk_turns = _coerce_turns(
359
- source_turns=source_chunk_turns,
360
- parsed_turns=parsed.get("turns"),
361
- speaker_label_map=speaker_label_map,
362
- )
363
- cleaned_turns.extend(cleaned_chunk_turns)
364
-
365
- final_mapping = [
366
- {"source_label": source, "final_label": final}
367
- for source, final in sorted(speaker_label_map.items(), key=lambda x: x[0])
368
- ]
369
-
370
- summary = {
371
- "turn_count": len(cleaned_turns),
372
- "speaker_count": len({str(t.get("speaker", "")) for t in cleaned_turns}),
373
- "chunk_count": len(chunk_plan),
374
- "notes": chunk_notes[:200],
375
- }
376
- cleaned_json = {
377
- "speaker_mapping_final": final_mapping,
378
- "turns": cleaned_turns,
379
- "summary": summary,
380
- "openai_token_usage": {
381
- "combined": combined_usage,
382
- "per_chunk": per_chunk_usage,
383
- },
384
- }
385
-
386
- return {
387
- "cleaned_transcript": cleaned_json,
388
- "debug": {
389
- "cleanup_model": cleanup_model,
390
- "executive_names": executive_names,
391
- "chunk_plan": chunk_plan,
392
- "speaker_label_map_final": speaker_label_map,
393
- "openai_token_usage": cleaned_json["openai_token_usage"],
394
- "openai_raw_responses": chunk_raw_responses,
395
- },
396
- }