Ratnesh-dev committed on
Commit
035bf47
·
1 Parent(s): 4ea7fc4

Remove Unused Code From OpenAI Pipeline Stage

Browse files
Files changed (3) hide show
  1. README.md +0 -4
  2. app.py +1 -9
  3. src/openai_cleanup_service.py +0 -396
README.md CHANGED
@@ -29,8 +29,6 @@ Model setup is global/outside `@spaces.GPU` so setup time is not billed to ZeroG
29
  ## `/run_complete_pipeline` inputs
30
  - `audio_file` (file path from Gradio client upload)
31
  - `huggingface_token`
32
- - `openai_api_key` (accepted for compatibility, unused in Space)
33
- - `executive_names_csv` (accepted for compatibility, unused in Space)
34
 
35
  Returns: merged transcript JSON only.
36
 
@@ -56,8 +54,6 @@ client = Client(SPACE)
56
  merged_transcript = client.predict(
57
  audio_file=handle_file(AUDIO_FILE),
58
  huggingface_token="hf_xxx",
59
- openai_api_key="", # unused
60
- executive_names_csv="", # unused
61
  api_name="/run_complete_pipeline",
62
  )
63
 
 
29
  ## `/run_complete_pipeline` inputs
30
  - `audio_file` (file path from Gradio client upload)
31
  - `huggingface_token`
 
 
32
 
33
  Returns: merged transcript JSON only.
34
 
 
54
  merged_transcript = client.predict(
55
  audio_file=handle_file(AUDIO_FILE),
56
  huggingface_token="hf_xxx",
 
 
57
  api_name="/run_complete_pipeline",
58
  )
59
 
app.py CHANGED
@@ -109,12 +109,7 @@ def _gpu_infer_pyannote_chunk(audio_file: str, model_options: dict[str, Any]):
109
  def run_complete_pipeline(
110
  audio_file: str,
111
  huggingface_token: str,
112
- openai_api_key: str,
113
- executive_names_csv: str,
114
  ):
115
- # Kept in signature for compatibility with existing clients; not used on Space.
116
- _ = openai_api_key
117
- _ = executive_names_csv
118
  _parse_main_request(audio_file, huggingface_token)
119
  _raise_preload_error_if_any(PARAKEET_V3)
120
 
@@ -213,15 +208,12 @@ with gr.Blocks(title="Parakeet + Pyannote Pipeline") as demo:
213
  label="HuggingFace token",
214
  type="password",
215
  )
216
- openai_api_key = gr.Textbox(label="OpenAI API key (unused in Space)", type="password")
217
- executive_names_csv = gr.Textbox(label="Executive names / terms (unused in Space)")
218
-
219
  run_btn = gr.Button("Run full pipeline")
220
  output = gr.JSON(label="Combined transcript JSON")
221
 
222
  run_btn.click(
223
  fn=run_complete_pipeline,
224
- inputs=[audio_file, huggingface_token, openai_api_key, executive_names_csv],
225
  outputs=output,
226
  api_name="run_complete_pipeline",
227
  )
 
109
  def run_complete_pipeline(
110
  audio_file: str,
111
  huggingface_token: str,
 
 
112
  ):
 
 
 
113
  _parse_main_request(audio_file, huggingface_token)
114
  _raise_preload_error_if_any(PARAKEET_V3)
115
 
 
208
  label="HuggingFace token",
209
  type="password",
210
  )
 
 
 
211
  run_btn = gr.Button("Run full pipeline")
212
  output = gr.JSON(label="Combined transcript JSON")
213
 
214
  run_btn.click(
215
  fn=run_complete_pipeline,
216
+ inputs=[audio_file, huggingface_token],
217
  outputs=output,
218
  api_name="run_complete_pipeline",
219
  )
src/openai_cleanup_service.py DELETED
@@ -1,396 +0,0 @@
1
- import json
2
- from typing import Any
3
-
4
-
5
- def _dumps_compact(payload: Any) -> str:
6
- return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
7
-
8
-
9
- def _response_to_dict(response: Any) -> dict[str, Any]:
10
- if hasattr(response, "model_dump") and callable(response.model_dump):
11
- return response.model_dump()
12
- if hasattr(response, "to_dict") and callable(response.to_dict):
13
- return response.to_dict()
14
- return {"raw_response": str(response)}
15
-
16
-
17
- def _response_text(response: Any) -> str:
18
- output_text = getattr(response, "output_text", None)
19
- if isinstance(output_text, str) and output_text.strip():
20
- return output_text
21
-
22
- data = _response_to_dict(response)
23
- if isinstance(data, dict):
24
- for key in ("output_text", "text"):
25
- val = data.get(key)
26
- if isinstance(val, str) and val.strip():
27
- return val
28
- return ""
29
-
30
-
31
- def _extract_json_object(text: str) -> dict[str, Any]:
32
- text = text.strip()
33
- if not text:
34
- raise ValueError("Model returned empty text.")
35
-
36
- try:
37
- parsed = json.loads(text)
38
- if isinstance(parsed, dict):
39
- return parsed
40
- except Exception:
41
- pass
42
-
43
- start = text.find("{")
44
- while start >= 0:
45
- depth = 0
46
- for idx in range(start, len(text)):
47
- ch = text[idx]
48
- if ch == "{":
49
- depth += 1
50
- elif ch == "}":
51
- depth -= 1
52
- if depth == 0:
53
- candidate = text[start : idx + 1]
54
- try:
55
- parsed = json.loads(candidate)
56
- if isinstance(parsed, dict):
57
- return parsed
58
- except Exception:
59
- break
60
- start = text.find("{", start + 1)
61
- raise ValueError("Could not parse a JSON object from model output.")
62
-
63
-
64
- def _usage_from_response_dict(payload: dict[str, Any]) -> dict[str, int | None]:
65
- usage = payload.get("usage")
66
- if not isinstance(usage, dict):
67
- return {
68
- "input_tokens": None,
69
- "output_tokens": None,
70
- "total_tokens": None,
71
- "cached_input_tokens": None,
72
- "reasoning_tokens": None,
73
- }
74
-
75
- input_details = usage.get("input_tokens_details", {})
76
- output_details = usage.get("output_tokens_details", {})
77
- return {
78
- "input_tokens": usage.get("input_tokens"),
79
- "output_tokens": usage.get("output_tokens"),
80
- "total_tokens": usage.get("total_tokens"),
81
- "cached_input_tokens": input_details.get("cached_tokens") if isinstance(input_details, dict) else None,
82
- "reasoning_tokens": output_details.get("reasoning_tokens") if isinstance(output_details, dict) else None,
83
- }
84
-
85
-
86
- def _sum_usage(
87
- first: dict[str, int | None],
88
- second: dict[str, int | None],
89
- ) -> dict[str, int | None]:
90
- def _sum_key(key: str) -> int | None:
91
- a = first.get(key)
92
- b = second.get(key)
93
- if isinstance(a, int) and isinstance(b, int):
94
- return a + b
95
- if isinstance(a, int):
96
- return a
97
- if isinstance(b, int):
98
- return b
99
- return None
100
-
101
- total = _sum_key("total_tokens")
102
- input_tokens = _sum_key("input_tokens")
103
- output_tokens = _sum_key("output_tokens")
104
- if total is None and isinstance(input_tokens, int) and isinstance(output_tokens, int):
105
- total = input_tokens + output_tokens
106
-
107
- return {
108
- "input_tokens": input_tokens,
109
- "output_tokens": output_tokens,
110
- "total_tokens": total,
111
- "cached_input_tokens": _sum_key("cached_input_tokens"),
112
- "reasoning_tokens": _sum_key("reasoning_tokens"),
113
- }
114
-
115
-
116
- def _parse_executive_names(names_csv: str | None) -> list[str]:
117
- out: list[str] = []
118
- if names_csv:
119
- for item in names_csv.split(","):
120
- name = item.strip().strip('"').strip("'")
121
- if name:
122
- out.append(name)
123
- seen = set()
124
- deduped: list[str] = []
125
- for name in out:
126
- k = name.lower()
127
- if k in seen:
128
- continue
129
- seen.add(k)
130
- deduped.append(name)
131
- return deduped
132
-
133
-
134
- def _build_chunk_plan(
135
- turns: list[dict[str, Any]],
136
- max_turns_per_chunk: int,
137
- max_chars_per_chunk: int,
138
- ) -> list[dict[str, int]]:
139
- if max_turns_per_chunk <= 0:
140
- max_turns_per_chunk = 1
141
- if max_chars_per_chunk <= 0:
142
- max_chars_per_chunk = 12000
143
-
144
- plan: list[dict[str, int]] = []
145
- n = len(turns)
146
- start = 0
147
- while start < n:
148
- end = start
149
- turns_count = 0
150
- chars_count = 0
151
- while end < n:
152
- t = turns[end]
153
- text_len = len(str(t.get("text", "")))
154
- est = text_len + 60
155
- if turns_count > 0 and (turns_count >= max_turns_per_chunk or chars_count + est > max_chars_per_chunk):
156
- break
157
- turns_count += 1
158
- chars_count += est
159
- end += 1
160
- if end == start:
161
- end = min(n, start + 1)
162
- plan.append({"start": start, "end": end})
163
- start = end
164
- return plan
165
-
166
-
167
- def _normalize_final_label(final_label: str, source_label: str) -> str:
168
- label = str(final_label or "").strip()
169
- if not label:
170
- return source_label
171
- if "|" in label:
172
- left = label.split("|", 1)[0].strip()
173
- if left:
174
- label = left
175
- suffix = f"({source_label})"
176
- if label.endswith(suffix):
177
- label = label[: -len(suffix)].strip()
178
- if not label:
179
- return source_label
180
- return label
181
-
182
-
183
- def _extract_map_updates(parsed: dict[str, Any]) -> list[dict[str, str]]:
184
- candidates = parsed.get("speaker_label_map_updates")
185
- if not isinstance(candidates, list):
186
- candidates = parsed.get("speaker_mapping_final")
187
- if not isinstance(candidates, list):
188
- return []
189
-
190
- updates: list[dict[str, str]] = []
191
- for item in candidates:
192
- if not isinstance(item, dict):
193
- continue
194
- source = str(item.get("source_label") or item.get("speaker_label") or "").strip()
195
- final = str(item.get("final_label") or item.get("inferred_name") or "").strip()
196
- if not source:
197
- continue
198
- updates.append({"source_label": source, "final_label": final})
199
- return updates
200
-
201
-
202
- def _coerce_turns(
203
- source_turns: list[dict[str, Any]],
204
- parsed_turns: Any,
205
- speaker_label_map: dict[str, str],
206
- ) -> list[dict[str, Any]]:
207
- out: list[dict[str, Any]] = []
208
- parsed_list = parsed_turns if isinstance(parsed_turns, list) else []
209
-
210
- for idx, source in enumerate(source_turns):
211
- source_speaker = str(source.get("speaker", "SPEAKER_XX"))
212
- mapped_default = speaker_label_map.get(source_speaker, source_speaker)
213
-
214
- parsed_item = parsed_list[idx] if idx < len(parsed_list) and isinstance(parsed_list[idx], dict) else {}
215
- candidate_speaker = _normalize_final_label(str(parsed_item.get("speaker", "")), source_speaker)
216
- final_speaker = candidate_speaker or mapped_default
217
- if final_speaker == source_speaker:
218
- final_speaker = mapped_default
219
-
220
- text = str(parsed_item.get("text", "")).strip() or str(source.get("text", "")).strip()
221
- start = parsed_item.get("start", source.get("start"))
222
- end = parsed_item.get("end", source.get("end"))
223
-
224
- out.append(
225
- {
226
- "speaker": final_speaker,
227
- "start": start,
228
- "end": end,
229
- "text": text,
230
- }
231
- )
232
- return out
233
-
234
-
235
- def run_openai_cleanup_pipeline(
236
- merged_transcript: dict[str, Any],
237
- openai_api_key: str,
238
- executive_names_csv: str | None,
239
- *,
240
- cleanup_model: str = "gpt-5",
241
- timeout_seconds: float = 600.0,
242
- max_turns_per_chunk: int = 80,
243
- max_chars_per_chunk: int = 22000,
244
- ) -> dict[str, Any]:
245
- """
246
- Single-pass per chunk: each OpenAI call does both speaker naming and transcript cleanup.
247
- Avoids a separate full-document speaker inference pass for long audio reliability.
248
- """
249
- try:
250
- from openai import OpenAI
251
- except ImportError as exc:
252
- raise RuntimeError("Missing dependency: openai. Install with `pip install openai`.") from exc
253
-
254
- turns = merged_transcript.get("turns")
255
- if not isinstance(turns, list) or not turns:
256
- raise ValueError("Merged transcript must contain a non-empty `turns` list.")
257
-
258
- executive_names = _parse_executive_names(executive_names_csv)
259
- chunk_plan = _build_chunk_plan(
260
- turns=turns,
261
- max_turns_per_chunk=max_turns_per_chunk,
262
- max_chars_per_chunk=max_chars_per_chunk,
263
- )
264
-
265
- client = OpenAI(api_key=openai_api_key, timeout=timeout_seconds, max_retries=0)
266
-
267
- # Global mapping across chunks.
268
- speaker_label_map: dict[str, str] = {}
269
- for turn in turns:
270
- source = str(turn.get("speaker", "")).strip()
271
- if source:
272
- speaker_label_map.setdefault(source, source)
273
-
274
- combined_usage = {
275
- "input_tokens": 0,
276
- "output_tokens": 0,
277
- "total_tokens": 0,
278
- "cached_input_tokens": 0,
279
- "reasoning_tokens": 0,
280
- }
281
- per_chunk_usage: list[dict[str, Any]] = []
282
- cleaned_turns: list[dict[str, Any]] = []
283
- chunk_notes: list[str] = []
284
- chunk_raw_responses: list[dict[str, Any]] = []
285
-
286
- for i, chunk in enumerate(chunk_plan):
287
- start = chunk["start"]
288
- end = chunk["end"]
289
- source_chunk_turns = turns[start:end]
290
-
291
- payload = {
292
- "task": "For this chunk only: infer speaker names and clean transcript text in one pass.",
293
- "rules": [
294
- "Keep turn order and count exactly the same as input chunk.",
295
- "Keep start/end timestamps aligned to input turns.",
296
- "Correct misspellings and punctuation/casing.",
297
- "Only remove filler words (uh, um, you know, like) and clear false-start words/phrases.",
298
- "Do not aggressively summarize, compress, or paraphrase full sentences.",
299
- "Preserve substantive wording and as much original content as possible.",
300
- "If uncertain whether text is filler, keep it.",
301
- "Infer speaker names from this chunk context only; do not guess beyond evidence.",
302
- "If first name matches in `executive_names` but last name is uncertain, first name alone is allowed.",
303
- "If speaker is call-control voice, label as Operator.",
304
- "If speaker name is unknown, keep generic label SPEAKER_XX.",
305
- "Never output combined labels like Name|SPEAKER_XX.",
306
- "Use `existing_speaker_label_map` as source of truth for labels already resolved in prior chunks.",
307
- ],
308
- "output_schema": {
309
- "speaker_label_map_updates": [
310
- {"source_label": "SPEAKER_XX", "final_label": "Name or SPEAKER_XX", "reason": "short"}
311
- ],
312
- "turns": [
313
- {
314
- "source_speaker": "SPEAKER_XX",
315
- "speaker": "Name or SPEAKER_XX",
316
- "start": "float",
317
- "end": "float",
318
- "text": "cleaned text",
319
- }
320
- ],
321
- "notes": ["string"],
322
- },
323
- "executive_names": executive_names,
324
- "existing_speaker_label_map": speaker_label_map,
325
- "chunk_index": i,
326
- "chunk_start_turn_index": start,
327
- "chunk_turns": source_chunk_turns,
328
- }
329
-
330
- response = client.responses.create(
331
- model=cleanup_model,
332
- input=[
333
- {
334
- "role": "system",
335
- "content": "You are a transcript cleanup and speaker-label assistant. Return strict JSON only.",
336
- },
337
- {"role": "user", "content": _dumps_compact(payload)},
338
- ],
339
- )
340
-
341
- raw = _response_to_dict(response)
342
- parsed = _extract_json_object(_response_text(response))
343
- usage = _usage_from_response_dict(raw)
344
- for k in combined_usage:
345
- combined_usage[k] += int(usage.get(k) or 0)
346
- per_chunk_usage.append({"chunk_index": i, "usage": usage, "turn_range": [start, end]})
347
- chunk_raw_responses.append({"chunk_index": i, "raw_response": raw})
348
-
349
- for upd in _extract_map_updates(parsed):
350
- source_label = upd["source_label"]
351
- final_label = _normalize_final_label(upd["final_label"], source_label)
352
- speaker_label_map[source_label] = final_label
353
-
354
- notes = parsed.get("notes", [])
355
- if isinstance(notes, list):
356
- chunk_notes.extend([str(n) for n in notes if str(n).strip()])
357
-
358
- cleaned_chunk_turns = _coerce_turns(
359
- source_turns=source_chunk_turns,
360
- parsed_turns=parsed.get("turns"),
361
- speaker_label_map=speaker_label_map,
362
- )
363
- cleaned_turns.extend(cleaned_chunk_turns)
364
-
365
- final_mapping = [
366
- {"source_label": source, "final_label": final}
367
- for source, final in sorted(speaker_label_map.items(), key=lambda x: x[0])
368
- ]
369
-
370
- summary = {
371
- "turn_count": len(cleaned_turns),
372
- "speaker_count": len({str(t.get("speaker", "")) for t in cleaned_turns}),
373
- "chunk_count": len(chunk_plan),
374
- "notes": chunk_notes[:200],
375
- }
376
- cleaned_json = {
377
- "speaker_mapping_final": final_mapping,
378
- "turns": cleaned_turns,
379
- "summary": summary,
380
- "openai_token_usage": {
381
- "combined": combined_usage,
382
- "per_chunk": per_chunk_usage,
383
- },
384
- }
385
-
386
- return {
387
- "cleaned_transcript": cleaned_json,
388
- "debug": {
389
- "cleanup_model": cleanup_model,
390
- "executive_names": executive_names,
391
- "chunk_plan": chunk_plan,
392
- "speaker_label_map_final": speaker_label_map,
393
- "openai_token_usage": cleaned_json["openai_token_usage"],
394
- "openai_raw_responses": chunk_raw_responses,
395
- },
396
- }