"""
VoiceNote AI - Utilities
WER calculation, VIPS formatting, evaluation export
"""
import json
import logging
from datetime import datetime
from config import Config
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def calculate_wer(reference: str, hypothesis: str) -> float:
    """
    Calculate Word Error Rate (WER) of a hypothesis against a reference.

    WER = (Substitutions + Deletions + Insertions) / N
    where N = total words in reference. Both texts are lower-cased and
    split on whitespace before comparison.

    Args:
        reference: Ground truth transcription
        hypothesis: Whisper output. ``None`` or a blank string is treated
            as an empty transcription (every reference word is a deletion).

    Returns:
        WER as a percentage, rounded to two decimals. Returns 0.0 if the
        reference is empty. The value can exceed 100 when the hypothesis
        contains more extra words than the reference has words.
    """
    ref_words = (reference or "").lower().split()
    if not ref_words:
        return 0.0
    hyp_words = (hypothesis or "").lower().split()

    # Word-level edit distance. Only the previous row of the DP table is
    # needed to compute the next one, so keep a rolling row instead of the
    # full (len(ref) + 1) x (len(hyp) + 1) matrix.
    prev_row = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        curr_row = [i]  # distance from i reference words to an empty hypothesis
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                curr_row.append(prev_row[j - 1])
            else:
                curr_row.append(
                    1 + min(
                        prev_row[j - 1],  # substitution
                        curr_row[j - 1],  # insertion
                        prev_row[j],      # deletion
                    )
                )
        prev_row = curr_row

    wer = (prev_row[-1] / len(ref_words)) * 100
    return round(wer, 2)
def format_vips_output(vips: dict) -> str:
    """Render the four VIPS sections as newline-separated Swedish text.

    Args:
        vips: Mapping looked up by the keys "V", "I", "P" and "S". A
            missing key is rendered with a default placeholder sentence;
            any other keys are ignored.

    Returns:
        One "<label>: <value>" line per section, in V, I, P, S order.
    """
    placeholder = "Ingen relevant information."
    headings = (
        ("V", "V (Välbefinnande)"),
        ("I", "I (Integritet)"),
        ("P", "P (Prevention)"),
        ("S", "S (Säkerhet)"),
    )
    rendered = []
    for key, heading in headings:
        rendered.append(f"{heading}: {vips.get(key, placeholder)}")
    return "\n".join(rendered)
def save_evaluation(entry: dict) -> str:
    """
    Append an evaluation entry to the JSONL file at ``Config.EVAL_FILE``.

    A ``timestamp`` key holding the current UTC time (ISO 8601 with a
    trailing "Z") is added to ``entry`` in place, then the entry is
    written as a single JSON line.

    Args:
        entry: Dict with evaluation answers; must be JSON-serializable.

    Returns:
        Status message: a success text, or a warning text that includes
        the error when stamping, serializing or writing failed. Such
        failures are logged and reported rather than raised.
    """
    # Local import: the module-level import only brings in ``datetime``.
    from datetime import timezone

    try:
        # datetime.utcnow() is deprecated (Python 3.12+). Take an aware UTC
        # time and drop tzinfo so isoformat() keeps the "...Z" shape instead
        # of emitting a "+00:00" offset.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        entry["timestamp"] = now_utc.isoformat() + "Z"
        # Serialize before opening the file so a serialization error never
        # touches (or creates) the evaluation file.
        line = json.dumps(entry, ensure_ascii=False) + "\n"
        with open(Config.EVAL_FILE, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception as e:
        # Best-effort boundary: the caller gets a status string, the log
        # gets the full traceback.
        logger.exception("Failed to save evaluation: %s", e)
        return f"⚠️ Kunde inte spara: {e}"

    logger.info("Evaluation saved: %s", entry)
    return "✅ Utvärdering sparad!"
def format_timestamp() -> str:
    """Return the current local time formatted as ``HH:MM:SS``."""
    return datetime.now().strftime("%H:%M:%S")