Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| _whisper_model = None | |
| MATH_PHRASE_MAP = { | |
| "square root of": "sqrt(", | |
| "squared": "^2", | |
| "cubed": "^3", | |
| "raised to the power of": "^", | |
| "raised to": "^", | |
| "to the power of": "^", | |
| "divided by": "/", | |
| "multiplied by": "*", | |
| "times": "*", | |
| "plus": "+", | |
| "minus": "-", | |
| "equals": "=", | |
| "greater than or equal to": ">=", | |
| "less than or equal to": "<=", | |
| "greater than": ">", | |
| "less than": "<", | |
| "pi": "π", | |
| "theta": "θ", | |
| "alpha": "α", | |
| "beta": "β", | |
| "sigma": "σ", | |
| "infinity": "∞", | |
| "integral of": "∫", | |
| "summation of": "Σ", | |
| "x squared": "x^2", | |
| "x cubed": "x^3", | |
| } | |
def _get_whisper(model_name: str = "base"):
    """Return a Whisper model, loading it lazily and caching it.

    The loaded model is stored in the module-level ``_whisper_model`` global.
    The name it was loaded under is remembered on this function, so a later
    request for a *different* ``model_name`` loads that model instead of
    silently returning the previously cached one.

    Args:
        model_name: Model identifier passed to ``whisper.load_model``.

    Returns:
        The object returned by ``whisper.load_model``.

    Raises:
        RuntimeError: If the ``whisper`` package cannot be imported, or if
            loading the model fails. The original exception is chained as
            ``__cause__``.
    """
    global _whisper_model

    # `cached_name` is None when the global was populated by something other
    # than this function; in that case the existing model is trusted as-is.
    cached_name = getattr(_get_whisper, "_loaded_name", None)
    if _whisper_model is not None and cached_name in (None, model_name):
        return _whisper_model

    # Imported here so the module can be imported without whisper installed.
    try:
        import whisper
    except ImportError as exc:
        raise RuntimeError(
            "Whisper is not installed. Install it with: pip install openai-whisper"
        ) from exc

    try:
        model = whisper.load_model(model_name, device="cpu")
    except Exception as exc:
        raise RuntimeError(
            f"Failed to load Whisper model: {exc}"
        ) from exc

    # Only replace the cache once the new model has loaded successfully.
    _whisper_model = model
    _get_whisper._loaded_name = model_name
    return model
def _post_process_math(text: str) -> str:
    """Replace spoken math phrases in *text* with their symbolic form.

    Phrases and replacements come from ``MATH_PHRASE_MAP``. Matching is
    case-insensitive and restricted to whole words, so "pi" does not rewrite
    "spin" and "times" does not rewrite "sometimes". All phrases are matched
    in a single pass with the longest phrase tried first, which makes the
    result independent of the mapping's insertion order and keeps replacement
    output from being re-scanned.

    After substitution, closing parentheses are appended for ``sqrt(``
    openings that are left unclosed.

    Args:
        text: Raw transcript text.

    Returns:
        The transcript with math phrases substituted.
    """
    result = text

    # Longest first: "x squared" must win over "squared", and
    # "raised to the power of" over "raised to".
    phrases = sorted(MATH_PHRASE_MAP, key=len, reverse=True)
    if phrases:
        replacements = [MATH_PHRASE_MAP[phrase] for phrase in phrases]
        # One capturing group per phrase; `lastindex` then identifies which
        # phrase matched without re-deriving the key from the matched text
        # (case folding makes that lookup unreliable).
        pattern = re.compile(
            r"\b(?:"
            + "|".join(f"({re.escape(phrase)})" for phrase in phrases)
            + r")\b",
            flags=re.IGNORECASE,
        )
        result = pattern.sub(
            lambda match: replacements[match.lastindex - 1], result
        )

    # Count parentheses that are still open at the end of the string, so that
    # unrelated, already balanced "(...)" pairs do not mask an unclosed sqrt(.
    depth = 0
    for char in result:
        if char == "(":
            depth += 1
        elif char == ")" and depth:
            depth -= 1

    # Never append more closers than there are sqrt( openings.
    missing = min(depth, result.count("sqrt("))
    if missing:
        result += ")" * missing
    return result
def handle_audio_input(audio_path: str, model_name: str = "base") -> dict:
    """Transcribe an audio file and convert spoken math into symbols.

    Args:
        audio_path: Path handed to the Whisper model's ``transcribe`` method.
        model_name: Whisper model name forwarded to ``_get_whisper``.

    Returns:
        A dict with keys:
            ``text``: transcript after ``_post_process_math``.
            ``raw_transcript``: stripped transcript as returned by the model.
            ``confidence``: heuristic score in [0, 1], rounded to 3 places.
            ``input_type``: always ``"audio"``.
    """
    transcription = _get_whisper(model_name).transcribe(audio_path)

    transcript = transcription.get("text", "").strip()
    math_text = _post_process_math(transcript)

    segment_list = transcription.get("segments", [])
    if not segment_list:
        # No segments to score.
        score = 0.0
    else:
        # Heuristic: mean per-segment avg_logprob, shifted by 1 and clamped
        # to [0, 1]. Segments without the key count as -1.0.
        logprob_total = sum(
            segment.get("avg_logprob", -1.0) for segment in segment_list
        )
        mean_logprob = logprob_total / len(segment_list)
        capped = min(1.0, 1.0 + mean_logprob)
        score = max(0.0, capped)

    payload = {}
    payload["text"] = math_text
    payload["raw_transcript"] = transcript
    payload["confidence"] = round(float(score), 3)
    payload["input_type"] = "audio"
    return payload