from __future__ import annotations

import re

# Speech-to-text input handling: transcribe an audio file with Whisper and
# rewrite spoken math phrases ("x squared plus two") into symbols.

# Most recently loaded Whisper model, and the model name it was loaded under.
# ``_whisper_model_name`` stays ``None`` until ``_get_whisper`` loads a model
# itself, so a model assigned to ``_whisper_model`` from outside is reused
# as-is regardless of the requested name.
_whisper_model = None
_whisper_model_name = None

# Spoken phrase -> symbol. Matching is case-insensitive and longest-phrase
# first, so the order of the entries here does not affect the result.
MATH_PHRASE_MAP = {
    "square root of": "sqrt(",
    "squared": "^2",
    "cubed": "^3",
    "raised to the power of": "^",
    "raised to": "^",
    "to the power of": "^",
    "divided by": "/",
    "multiplied by": "*",
    "times": "*",
    "plus": "+",
    "minus": "-",
    "equals": "=",
    "greater than or equal to": ">=",
    "less than or equal to": "<=",
    "greater than": ">",
    "less than": "<",
    "pi": "π",
    "theta": "θ",
    "alpha": "α",
    "beta": "β",
    "sigma": "σ",
    "infinity": "∞",
    "integral of": "∫",
    "summation of": "Σ",
    "x squared": "x^2",
    "x cubed": "x^3",
}


def _get_whisper(model_name: str = "base"):
    """Return a Whisper model for *model_name*, loading it on first use.

    The loaded model is cached at module level. Requesting a different
    ``model_name`` than the one this function last loaded replaces the cached
    model instead of silently returning the old one.

    Args:
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        The loaded model object.

    Raises:
        RuntimeError: If the ``whisper`` package cannot be imported, or if
            loading the model fails for any other reason. The original
            exception is attached as ``__cause__``.
    """
    global _whisper_model, _whisper_model_name

    cached_for_other_name = (
        _whisper_model_name is not None and _whisper_model_name != model_name
    )
    if _whisper_model is None or cached_for_other_name:
        # Imported lazily so this module can be imported without Whisper.
        try:
            import whisper
        except ImportError as err:
            raise RuntimeError(
                "Whisper is not installed. Install it with: pip install openai-whisper"
            ) from err
        try:
            model = whisper.load_model(model_name, device="cpu")
        except Exception as err:
            raise RuntimeError(
                f"Failed to load Whisper model: {err}"
            ) from err
        # Only update the cache once loading has fully succeeded.
        _whisper_model = model
        _whisper_model_name = model_name
    return _whisper_model


def _post_process_math(text: str) -> str:
    """Rewrite spoken math phrases in *text* using ``MATH_PHRASE_MAP``.

    Phrases are matched case-insensitively in a single pass, longest phrase
    first, and only when they are not embedded in a longer word (so "spike"
    and "sometimes" are left alone while "2pi" still becomes "2π"). After
    substitution, closing parentheses are appended for ``sqrt(`` calls that
    were left open.

    Args:
        text: Raw transcript text.

    Returns:
        The transcript with recognised phrases replaced by symbols.
    """
    phrases = sorted(
        (phrase for phrase in MATH_PHRASE_MAP if phrase),
        key=len,
        reverse=True,  # longest first: "x squared" must win over "squared"
    )
    if phrases:
        symbols = {phrase.lower(): MATH_PHRASE_MAP[phrase] for phrase in phrases}
        alternation = "|".join(re.escape(phrase) for phrase in phrases)
        # ``[^\W\d_]`` is "any letter": a phrase must not touch a letter on
        # either side, but digits and punctuation are allowed neighbours.
        pattern = re.compile(
            rf"(?<![^\W\d_])(?:{alternation})(?![^\W\d_])",
            re.IGNORECASE,
        )
        # A function replacement inserts the symbol literally, so symbols
        # containing backslashes are never interpreted as regex templates.
        result = pattern.sub(
            lambda match: symbols.get(match.group(0).lower(), match.group(0)),
            text,
        )
    else:
        result = text

    # Close sqrt( calls that are still open, judged by the overall paren
    # balance so unrelated, already-closed parentheses do not mask them.
    sqrt_opens = result.count("sqrt(")
    unclosed = result.count("(") - result.count(")")
    if sqrt_opens and unclosed > 0:
        result += ")" * min(sqrt_opens, unclosed)
    return result


def handle_audio_input(audio_path: str, model_name: str = "base") -> dict:
    """Transcribe an audio file and convert spoken math into symbols.

    Args:
        audio_path: Path handed to the model's ``transcribe`` method.
        model_name: Whisper model name to load (see ``_get_whisper``).

    Returns:
        A dict with the keys:
            ``text``: transcript after math post-processing,
            ``raw_transcript``: stripped transcript as returned by the model,
            ``confidence``: heuristic score in [0, 1], rounded to 3 places,
            ``input_type``: always ``"audio"``.

    Raises:
        RuntimeError: Propagated from ``_get_whisper`` when the model cannot
            be loaded.
    """
    model = _get_whisper(model_name)
    result = model.transcribe(audio_path)

    # Tolerate a missing or None "text"/"segments" entry.
    raw_text = (result.get("text") or "").strip()
    processed_text = _post_process_math(raw_text)

    segments = result.get("segments") or []
    if segments:
        mean_logprob = sum(
            seg.get("avg_logprob", -1.0) for seg in segments
        ) / len(segments)
        # Heuristic: shift the mean per-segment ``avg_logprob`` by 1 and clamp
        # to [0, 1]. This is a rough score, not a calibrated probability.
        confidence = max(0.0, min(1.0, 1.0 + mean_logprob))
    else:
        confidence = 0.0

    return {
        "text": processed_text,
        "raw_transcript": raw_text,
        "confidence": round(float(confidence), 3),
        "input_type": "audio",
    }