Add non-local Whisper API support (remote transcription via an OpenAI-compatible endpoint)
Browse files
lec2note/ingestion/whisper_runner.py
CHANGED
|
@@ -8,9 +8,9 @@ logger = logging.getLogger(__name__)
|
|
| 8 |
|
| 9 |
from typing import List, Dict, Optional, Any
|
| 10 |
|
| 11 |
-
import torch
|
| 12 |
from whisper import load_model # type: ignore
|
| 13 |
-
import
|
| 14 |
|
| 15 |
__all__ = ["WhisperRunner"]
|
| 16 |
|
|
@@ -31,13 +31,29 @@ class WhisperRunner: # noqa: D101
|
|
| 31 |
if not audio_path.exists():
|
| 32 |
raise FileNotFoundError(audio_path)
|
| 33 |
|
| 34 |
-
|
| 35 |
-
logger.info("[Whisper] loading model %s on %s", cls.model_name, device)
|
| 36 |
-
model = load_model(cls.model_name, device=device)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# convert to our schema
|
| 43 |
logger.info("[Whisper] got %d segments", len(segments))
|
|
|
|
| 8 |
|
| 9 |
from typing import List, Dict, Optional, Any
|
| 10 |
|
| 11 |
+
import torch, json, os
|
| 12 |
from whisper import load_model # type: ignore
|
| 13 |
+
from openai import OpenAI
|
| 14 |
|
| 15 |
__all__ = ["WhisperRunner"]
|
| 16 |
|
|
|
|
| 31 |
if not audio_path.exists():
|
| 32 |
raise FileNotFoundError(audio_path)
|
| 33 |
|
| 34 |
+
use_local = os.getenv("AUDIO2TEXT_LOCAL", "true").lower() != "false"
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
if use_local:
|
| 37 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 38 |
+
logger.info("[Whisper] loading model %s on %s", cls.model_name, device)
|
| 39 |
+
model = load_model(cls.model_name, device=device)
|
| 40 |
+
|
| 41 |
+
logger.info("[Whisper] transcribing %s (local)", audio_path.name)
|
| 42 |
+
result = model.transcribe(str(audio_path), language=lang)
|
| 43 |
+
else:
|
| 44 |
+
# remote API mode
|
| 45 |
+
api_base = os.getenv("AIHUB_API_BASE")
|
| 46 |
+
api_key = os.getenv("AIHUB_API_KEY")
|
| 47 |
+
if not api_key:
|
| 48 |
+
raise EnvironmentError("AIHUB_API_KEY not set")
|
| 49 |
+
|
| 50 |
+
client = OpenAI(api_key=api_key, base_url=api_base)
|
| 51 |
+
logger.info("[Whisper] uploading %s to API (whisper-large-v3)", audio_path.name)
|
| 52 |
+
with audio_path.open("rb") as f:
|
| 53 |
+
resp = client.audio.transcriptions.create(model="whisper-large-v3", file=f, language=lang)
|
| 54 |
+
# NOTE(review): resp.text is only the full transcript; resp.segments is present only when the
# request sets response_format="verbose_json" — this call does not, so the single untimed-segment
# fallback below will always fire and all timing info is lost. Confirm and pass response_format explicitly.
|
| 55 |
+
segments = resp.segments if hasattr(resp, "segments") else [{"start": 0.0, "end": 0.0, "text": resp.text}]
|
| 56 |
+
result = {"segments": segments}
|
| 57 |
|
| 58 |
# convert to our schema
|
| 59 |
logger.info("[Whisper] got %d segments", len(segments))
|