# Multimodal Math Mentor — audio input handling (initial commit 3c25c17, Amit-kr26)
from __future__ import annotations
import re
_whisper_model = None
MATH_PHRASE_MAP = {
"square root of": "sqrt(",
"squared": "^2",
"cubed": "^3",
"raised to the power of": "^",
"raised to": "^",
"to the power of": "^",
"divided by": "/",
"multiplied by": "*",
"times": "*",
"plus": "+",
"minus": "-",
"equals": "=",
"greater than or equal to": ">=",
"less than or equal to": "<=",
"greater than": ">",
"less than": "<",
"pi": "π",
"theta": "θ",
"alpha": "α",
"beta": "β",
"sigma": "σ",
"infinity": "∞",
"integral of": "∫",
"summation of": "Σ",
"x squared": "x^2",
"x cubed": "x^3",
}
def _get_whisper(model_name: str = "base"):
    """Lazily load and cache a Whisper ASR model on CPU.

    Args:
        model_name: Whisper model size to load (e.g. "base"). NOTE: only
            honored on the first call — later calls return the already
            cached model regardless of the name passed.

    Returns:
        The loaded Whisper model instance.

    Raises:
        RuntimeError: if whisper is not installed, or the model fails to load.
    """
    global _whisper_model
    if _whisper_model is None:
        try:
            import whisper
            _whisper_model = whisper.load_model(model_name, device="cpu")
        except ImportError:
            # `from None` suppresses the noisy implicit-chaining traceback;
            # the message already tells the user exactly what to do.
            raise RuntimeError(
                "Whisper is not installed. Install it with: pip install openai-whisper"
            ) from None
        except Exception as e:
            # Chain the original failure so the root cause stays debuggable.
            raise RuntimeError(
                f"Failed to load Whisper model: {e}"
            ) from e
    return _whisper_model
def _post_process_math(text: str) -> str:
    """Rewrite spoken math phrases in *text* as symbolic notation.

    Args:
        text: A raw speech transcript.

    Returns:
        The transcript with MATH_PHRASE_MAP phrases replaced by symbols and
        any parentheses opened by "sqrt(" insertions closed at the end.
    """
    result = text
    # Apply longest phrases first so e.g. "x squared" wins over "squared"
    # regardless of the map's insertion order.
    for phrase in sorted(MATH_PHRASE_MAP, key=len, reverse=True):
        # \b guards keep short phrases like "pi" from matching inside
        # unrelated words such as "spinning".
        pattern = r"\b" + re.escape(phrase) + r"\b"
        result = re.sub(pattern, MATH_PHRASE_MAP[phrase], result, flags=re.IGNORECASE)
    # Balance parentheses: compare ALL opens against ALL closes, so a ")"
    # already present in the transcript doesn't cancel out an unclosed
    # "sqrt(" that still needs a terminator.
    deficit = result.count("(") - result.count(")")
    if deficit > 0:
        result += ")" * deficit
    return result
def handle_audio_input(audio_path: str, model_name: str = "base") -> dict:
    """Transcribe an audio file and normalize spoken math phrasing.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_name: Whisper model size passed through to _get_whisper.

    Returns:
        A dict with the math-normalized "text", the untouched
        "raw_transcript", a [0, 1] "confidence" score rounded to 3 places,
        and "input_type" fixed to "audio".
    """
    transcription = _get_whisper(model_name).transcribe(audio_path)
    raw_text = transcription.get("text", "").strip()

    segments = transcription.get("segments", [])
    if not segments:
        confidence = 0.0
    else:
        mean_logprob = sum(
            seg.get("avg_logprob", -1.0) for seg in segments
        ) / len(segments)
        # Map the average log-probability (roughly [-1, 0]) onto [0, 1].
        confidence = min(1.0, max(0.0, 1.0 + mean_logprob))

    return {
        "text": _post_process_math(raw_text),
        "raw_transcript": raw_text,
        "confidence": round(float(confidence), 3),
        "input_type": "audio",
    }