Vurtnec committed on
Commit
79c6cb6
·
verified ·
1 Parent(s): f8bdcc7

Upload compare_eot_models.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. compare_eot_models.py +32 -57
compare_eot_models.py CHANGED
@@ -33,7 +33,7 @@ import torch
33
  # ============================================================
34
 
35
  def normalize_text_multilingual(text: str) -> str:
36
- """Normalize text for multilingual model (from base.py:56-67)"""
37
  if not text:
38
  return ""
39
  text = unicodedata.normalize("NFKC", text.lower())
@@ -45,7 +45,7 @@ def normalize_text_multilingual(text: str) -> str:
45
  return text
46
 
47
  def format_chat_for_livekit(messages: list, tokenizer) -> str:
48
- """Format chat context for LiveKit model (from base.py:69-93)"""
49
  new_chat_ctx = []
50
  last_msg = None
51
 
@@ -55,7 +55,6 @@ def format_chat_for_livekit(messages: list, tokenizer) -> str:
55
 
56
  content = normalize_text_multilingual(msg["content"])
57
 
58
- # Combine adjacent turns
59
  if last_msg and last_msg["role"] == msg["role"]:
60
  last_msg["content"] += f" {content}"
61
  else:
@@ -70,7 +69,6 @@ def format_chat_for_livekit(messages: list, tokenizer) -> str:
70
  tokenize=False
71
  )
72
 
73
- # Remove the EOU token from current utterance
74
  ix = convo_text.rfind("<|im_end|>")
75
  text = convo_text[:ix]
76
  return text
@@ -83,7 +81,7 @@ def predict_livekit(session, tokenizer, messages: list) -> float:
83
  text,
84
  add_special_tokens=False,
85
  return_tensors="np",
86
- max_length=128, # MAX_HISTORY_TOKENS from base.py
87
  truncation=True,
88
  )
89
 
@@ -97,7 +95,6 @@ def predict_livekit(session, tokenizer, messages: list) -> float:
97
 
98
  def predict_finetuned(model, tokenizer, messages: list, device: str) -> float:
99
  """Run inference with fine-tuned model"""
100
- # Format as ChatML
101
  formatted = ""
102
  for msg in messages:
103
  role = msg["role"]
@@ -117,14 +114,13 @@ def predict_finetuned(model, tokenizer, messages: list, device: str) -> float:
117
 
118
  generated = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
119
 
120
- # Parse prediction
121
  generated_lower = generated.strip().lower()
122
  if "<|eot|>" in generated_lower or "eot" in generated_lower:
123
- return 1.0 # Complete turn
124
  elif "<|continue|>" in generated_lower or "continue" in generated_lower:
125
- return 0.0 # Incomplete turn
126
  else:
127
- return 0.5 # Uncertain
128
 
129
  # ============================================================
130
  # Main Evaluation
@@ -139,6 +135,7 @@ def main():
139
  print("\n[1/4] Loading test dataset...")
140
  dataset = load_dataset("Vurtnec/eot-detection-testset", split="train")
141
  print(f" Loaded {len(dataset)} test samples")
 
142
 
143
  # Load fine-tuned model
144
  print("\n[2/4] Loading fine-tuned model (Vurtnec/eot-detector-smollm2)...")
@@ -155,8 +152,7 @@ def main():
155
  # Load LiveKit model
156
  print("\n[3/4] Loading LiveKit model (livekit/turn-detector)...")
157
 
158
- # Download ONNX model (using multilingual version)
159
- revision = "v0.4.1-intl" # multilingual
160
  onnx_path = hf_hub_download(
161
  repo_id="livekit/turn-detector",
162
  filename="model_q8.onnx",
@@ -180,34 +176,14 @@ def main():
180
  ground_truth = []
181
 
182
  for i, sample in enumerate(dataset):
183
- text = sample["text"]
 
 
184
 
185
- # Parse ground truth from text
186
- if "<|eot|>" in text:
187
- label = 1 # Complete
188
- else:
189
- label = 0 # Incomplete
190
  ground_truth.append(label)
191
 
192
- # Extract conversation from text
193
- messages = []
194
- parts = text.split("<|im_end|>")
195
- for part in parts[:-1]: # Skip the label part
196
- if "<|im_start|>" in part:
197
- idx = part.find("<|im_start|>")
198
- content_part = part[idx + len("<|im_start|>"):]
199
- if "\n" in content_part:
200
- role, content = content_part.split("\n", 1)
201
- role = role.strip()
202
- content = content.strip()
203
- if role in ["user", "assistant"] and content:
204
- messages.append({"role": role, "content": content})
205
-
206
- if not messages:
207
- # Fallback: treat as user message
208
- clean_text = text.split("<|eot|>")[0].split("<|continue|>")[0].strip()
209
- messages = [{"role": "user", "content": clean_text}]
210
-
211
  # Fine-tuned prediction
212
  try:
213
  ft_prob = predict_finetuned(ft_model, ft_tokenizer, messages, device)
@@ -220,11 +196,10 @@ def main():
220
  # LiveKit prediction
221
  try:
222
  lk_prob = predict_livekit(lk_session, lk_tokenizer, messages)
223
- # LiveKit uses 0.5 as default threshold
224
  lk_pred = 1 if lk_prob >= 0.5 else 0
225
  except Exception as e:
226
  print(f" Warning: LiveKit model error on sample {i}: {e}")
227
- lk_pred = 1 # Default to complete on error
228
  lk_predictions.append(lk_pred)
229
 
230
  if (i + 1) % 10 == 0:
@@ -268,25 +243,25 @@ def main():
268
  print(f"Actual Incomplete {lk_cm[0][0]:3d} {lk_cm[0][1]:3d}")
269
  print(f" Complete {lk_cm[1][0]:3d} {lk_cm[1][1]:3d}")
270
 
271
- # Save results
272
- results = {
273
- "fine_tuned": {
274
- "accuracy": accuracy_score(ground_truth, ft_predictions),
275
- "precision": precision_score(ground_truth, ft_predictions, zero_division=0),
276
- "recall": recall_score(ground_truth, ft_predictions, zero_division=0),
277
- "f1": f1_score(ground_truth, ft_predictions, zero_division=0),
278
- },
279
- "livekit": {
280
- "accuracy": accuracy_score(ground_truth, lk_predictions),
281
- "precision": precision_score(ground_truth, lk_predictions, zero_division=0),
282
- "recall": recall_score(ground_truth, lk_predictions, zero_division=0),
283
- "f1": f1_score(ground_truth, lk_predictions, zero_division=0),
284
- }
285
- }
286
 
287
- with open("comparison_results.json", "w") as f:
288
- json.dump(results, f, indent=2)
289
- print("\nResults saved to comparison_results.json")
290
 
291
  if __name__ == "__main__":
292
  main()
 
33
  # ============================================================
34
 
35
  def normalize_text_multilingual(text: str) -> str:
36
+ """Normalize text for multilingual model"""
37
  if not text:
38
  return ""
39
  text = unicodedata.normalize("NFKC", text.lower())
 
45
  return text
46
 
47
  def format_chat_for_livekit(messages: list, tokenizer) -> str:
48
+ """Format chat context for LiveKit model"""
49
  new_chat_ctx = []
50
  last_msg = None
51
 
 
55
 
56
  content = normalize_text_multilingual(msg["content"])
57
 
 
58
  if last_msg and last_msg["role"] == msg["role"]:
59
  last_msg["content"] += f" {content}"
60
  else:
 
69
  tokenize=False
70
  )
71
 
 
72
  ix = convo_text.rfind("<|im_end|>")
73
  text = convo_text[:ix]
74
  return text
 
81
  text,
82
  add_special_tokens=False,
83
  return_tensors="np",
84
+ max_length=128,
85
  truncation=True,
86
  )
87
 
 
95
 
96
  def predict_finetuned(model, tokenizer, messages: list, device: str) -> float:
97
  """Run inference with fine-tuned model"""
 
98
  formatted = ""
99
  for msg in messages:
100
  role = msg["role"]
 
114
 
115
  generated = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
116
 
 
117
  generated_lower = generated.strip().lower()
118
  if "<|eot|>" in generated_lower or "eot" in generated_lower:
119
+ return 1.0
120
  elif "<|continue|>" in generated_lower or "continue" in generated_lower:
121
+ return 0.0
122
  else:
123
+ return 0.5
124
 
125
  # ============================================================
126
  # Main Evaluation
 
135
  print("\n[1/4] Loading test dataset...")
136
  dataset = load_dataset("Vurtnec/eot-detection-testset", split="train")
137
  print(f" Loaded {len(dataset)} test samples")
138
+ print(f" Columns: {dataset.column_names}")
139
 
140
  # Load fine-tuned model
141
  print("\n[2/4] Loading fine-tuned model (Vurtnec/eot-detector-smollm2)...")
 
152
  # Load LiveKit model
153
  print("\n[3/4] Loading LiveKit model (livekit/turn-detector)...")
154
 
155
+ revision = "v0.4.1-intl"
 
156
  onnx_path = hf_hub_download(
157
  repo_id="livekit/turn-detector",
158
  filename="model_q8.onnx",
 
176
  ground_truth = []
177
 
178
  for i, sample in enumerate(dataset):
179
+ # Dataset structure: messages (list), is_complete (bool)
180
+ messages = sample["messages"]
181
+ is_complete = sample["is_complete"]
182
 
183
+ # Ground truth: 1 = complete, 0 = incomplete
184
+ label = 1 if is_complete else 0
 
 
 
185
  ground_truth.append(label)
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  # Fine-tuned prediction
188
  try:
189
  ft_prob = predict_finetuned(ft_model, ft_tokenizer, messages, device)
 
196
  # LiveKit prediction
197
  try:
198
  lk_prob = predict_livekit(lk_session, lk_tokenizer, messages)
 
199
  lk_pred = 1 if lk_prob >= 0.5 else 0
200
  except Exception as e:
201
  print(f" Warning: LiveKit model error on sample {i}: {e}")
202
+ lk_pred = 1
203
  lk_predictions.append(lk_pred)
204
 
205
  if (i + 1) % 10 == 0:
 
243
  print(f"Actual Incomplete {lk_cm[0][0]:3d} {lk_cm[0][1]:3d}")
244
  print(f" Complete {lk_cm[1][0]:3d} {lk_cm[1][1]:3d}")
245
 
246
+ # Final summary
247
+ print("\n" + "=" * 60)
248
+ print("FINAL COMPARISON SUMMARY")
249
+ print("=" * 60)
250
+
251
+ ft_acc = accuracy_score(ground_truth, ft_predictions)
252
+ ft_f1 = f1_score(ground_truth, ft_predictions, zero_division=0)
253
+ lk_acc = accuracy_score(ground_truth, lk_predictions)
254
+ lk_f1 = f1_score(ground_truth, lk_predictions, zero_division=0)
255
+
256
+ print(f"\nFine-tuned Model: Accuracy={ft_acc*100:.2f}%, F1={ft_f1*100:.2f}%")
257
+ print(f"LiveKit Official: Accuracy={lk_acc*100:.2f}%, F1={lk_f1*100:.2f}%")
258
+
259
+ diff_acc = (lk_acc - ft_acc) * 100
260
+ diff_f1 = (lk_f1 - ft_f1) * 100
261
 
262
+ print(f"\nDifference (LiveKit - Fine-tuned):")
263
+ print(f" Accuracy: {'+' if diff_acc >= 0 else ''}{diff_acc:.2f}%")
264
+ print(f" F1 Score: {'+' if diff_f1 >= 0 else ''}{diff_f1:.2f}%")
265
 
266
  if __name__ == "__main__":
267
  main()