Spaces:
Sleeping
Sleeping
Commit
·
915cc29
1
Parent(s):
b0a3faf
Redundant conversationals
Browse files- test_conversational_cleaning.py +164 -0
- utils/augment.py +82 -4
- utils/processor.py +10 -1
test_conversational_cleaning.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test conversational element cleaning and failed response handling
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import logging
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Add the project root to Python path
|
| 12 |
+
project_root = Path(__file__).parent
|
| 13 |
+
sys.path.insert(0, str(project_root))
|
| 14 |
+
|
| 15 |
+
from utils import augment as A
|
| 16 |
+
|
| 17 |
+
# Set up logging
|
| 18 |
+
logging.basicConfig(level=logging.INFO)
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
def test_conversational_cleaning():
    """Check that ``A.clean_conversational_elements`` strips chit-chat only.

    Each case is a tuple
    ``(input, expected_contains, expected_not_contains, description)``.
    ``expected_contains`` and ``expected_not_contains`` may each be a single
    phrase (``str``) or a list of phrases.

    Returns:
        bool: True when every case keeps all required phrases and drops all
        unwanted ones, False otherwise. Results are also logged per case.
    """
    logger.info("Testing conversational element cleaning...")

    test_cases = [
        # (input, expected_contains, expected_not_contains, description)
        ("Hi, I'm a doctor. Diabetes symptoms include...", "Diabetes symptoms", ["Hi", "I'm a doctor"], "English greeting + doctor intro"),
        ("Xin chào, tôi là bác sĩ. Triệu chứng tiểu đường...", "Triệu chứng tiểu đường", ["Xin chào", "tôi là bác sĩ"], "Vietnamese greeting + doctor intro"),
        ("If you are a doctor, please answer...", "answer", ["If you are a doctor", "please"], "Doctor conditional"),
        ("Thank you for your question. The symptoms are...", "The symptoms are", ["Thank you", "for your question"], "Thank you prefix"),
        ("I hope this helps. Best regards!", "helps", ["I hope this", "Best regards"], "Thank you suffix"),
        ("Nếu bạn là bác sĩ, vui lòng trả lời...", "trả lời", ["Nếu bạn là bác sĩ", "vui lòng"], "Vietnamese doctor conditional"),
        ("As a medical professional, I can tell you...", "I can tell you", ["As a medical professional"], "Medical professional intro"),
        ("From a medical perspective, the answer is...", "the answer is", ["From a medical perspective"], "Medical perspective intro"),
        ("Medically speaking, this condition...", "this condition", ["Medically speaking"], "Medically speaking intro"),
        ("I'm here to help. The treatment is...", "The treatment is", ["I'm here to help"], "Helpful intro"),
    ]

    all_passed = True
    for input_text, expected_contains, expected_not_contains, description in test_cases:
        cleaned = A.clean_conversational_elements(input_text)

        # A bare string must be treated as ONE phrase. Iterating over it would
        # compare individual characters and make the check almost vacuous.
        required = [expected_contains] if isinstance(expected_contains, str) else expected_contains
        forbidden = [expected_not_contains] if isinstance(expected_not_contains, str) else expected_not_contains

        # Check that expected content is preserved
        contains_expected = all(phrase in cleaned for phrase in required)

        # Check that conversational elements are removed
        not_contains_expected = all(phrase not in cleaned for phrase in forbidden)

        case_ok = contains_expected and not_contains_expected
        status = "✅" if case_ok else "❌"
        if not case_ok:
            all_passed = False

        logger.info(f"{status} {description}")
        logger.info(f"  Input: '{input_text}'")
        logger.info(f"  Cleaned: '{cleaned}'")
        logger.info(f"  Contains expected: {contains_expected}, Removes unwanted: {not_contains_expected}")
        logger.info("")

    return all_passed
|
| 60 |
+
|
| 61 |
+
def test_invalid_response_detection():
    """Compare ``A.is_invalid_response`` against a table of expected verdicts.

    Every case is logged with a pass/fail marker.

    Returns:
        bool: True if the detector agreed with the expectation for every case.
    """
    logger.info("Testing invalid response detection...")

    # (text, expected_invalid, description)
    test_cases = (
        ("FAIL", True, "Simple fail response"),
        ("I can't help you", True, "Can't help response"),
        ("I don't know", True, "Don't know response"),
        ("Sorry, I'm unable to", True, "Unable response"),
        ("Diabetes symptoms include...", False, "Valid medical response"),
        ("The treatment is...", False, "Valid treatment response"),
        ("", True, "Empty response"),
        ("Hi", True, "Too short response"),
        ("I'm sorry, I cannot determine", True, "Cannot determine response"),
    )

    outcomes = []
    for text, expected_invalid, description in test_cases:
        is_invalid = A.is_invalid_response(text)
        agrees = is_invalid == expected_invalid
        outcomes.append(agrees)

        status = "✅" if agrees else "❌"
        logger.info(f"{status} {description}: '{text}' -> {is_invalid} (expected {expected_invalid})")

    # Passes only when no case disagreed.
    return all(outcomes)
|
| 88 |
+
|
| 89 |
+
def test_retry_logic():
    """Smoke-check the two building blocks used when handling failed responses.

    First verifies that a handful of obviously bad responses are flagged by
    ``A.is_invalid_response``, then that ``A.clean_conversational_elements``
    keeps the medical content of a sample sentence while dropping the greeting.

    Returns:
        bool: False at the first failed check, True if all checks pass.
    """
    logger.info("Testing retry logic...")

    # Test that invalid responses are detected
    for response in ("FAIL", "I can't help", "Sorry", ""):
        if not A.is_invalid_response(response):
            logger.error(f"❌ Failed to detect invalid response: '{response}'")
            return False
        logger.info(f"✅ Correctly detected invalid response: '{response}'")

    # Test conversational cleaning
    conversational_text = "Hi, I'm a doctor. Diabetes symptoms include increased thirst."
    cleaned = A.clean_conversational_elements(conversational_text)

    content_kept = "Diabetes symptoms include increased thirst" in cleaned
    if not content_kept or "Hi" in cleaned:
        logger.error("❌ Conversational cleaning failed")
        return False

    logger.info("✅ Conversational cleaning working correctly")
    return True
|
| 115 |
+
|
| 116 |
+
def main():
    """Run every test function in this script and log a summary.

    A test that raises is logged as an error and counted as failed.

    Returns:
        bool: True when all tests passed.
    """
    separator = "=" * 70

    def verdict(ok):
        # Shared label for the per-test line and the summary table.
        return "✅ PASSED" if ok else "❌ FAILED"

    logger.info("Testing conversational cleaning and failed response handling...")
    logger.info(separator)

    tests = [
        ("Conversational Cleaning", test_conversational_cleaning),
        ("Invalid Response Detection", test_invalid_response_detection),
        ("Retry Logic", test_retry_logic),
    ]

    results = {}
    for test_name, test_func in tests:
        logger.info(f"\n--- {test_name} ---")
        try:
            result = test_func()
            results[test_name] = result
            status = verdict(result)
            logger.info(f"{test_name}: {status}")
        except Exception as e:
            logger.error(f"{test_name}: ❌ ERROR - {e}")
            results[test_name] = False

    # Summary
    logger.info("\n" + separator)
    logger.info("CONVERSATIONAL CLEANING TEST SUMMARY")
    logger.info(separator)

    total = len(results)
    passed = sum(1 for result in results.values() if result)

    for test_name, result in results.items():
        status = verdict(result)
        logger.info(f"{test_name}: {status}")

    logger.info(f"\nOverall: {passed}/{total} tests passed")

    everything_passed = passed == total
    if not everything_passed:
        logger.warning("⚠️ Some tests failed. Please check the logs above.")
    else:
        logger.info("🎉 All tests passed! Conversational cleaning is working correctly.")
        logger.info("✅ Failed responses will be retried, not recorded!")
        logger.info("✅ Conversational elements are properly cleaned!")

    return everything_passed
|
| 161 |
+
|
| 162 |
+
if __name__ == "__main__":
    # Script entry point: exit status 0 when every test passed, 1 otherwise.
    success = main()
    raise SystemExit(0 if success else 1)
|
utils/augment.py
CHANGED
|
@@ -142,6 +142,64 @@ def is_invalid_response(text: str) -> bool:
|
|
| 142 |
|
| 143 |
return False
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
def clean_invalid_response(text: str, fallback: str = "") -> str:
|
| 146 |
"""Clean invalid responses by returning fallback or empty string"""
|
| 147 |
if is_invalid_response(text):
|
|
@@ -153,14 +211,34 @@ def retry_invalid_response(text: str, paraphraser, max_retries: int = 3) -> str:
|
|
| 153 |
if not is_invalid_response(text):
|
| 154 |
return text
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
for attempt in range(max_retries):
|
| 157 |
try:
|
| 158 |
-
# Try
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
if retry_text and not is_invalid_response(retry_text):
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
except Exception as e:
|
| 165 |
logger.warning(f"Retry attempt {attempt + 1} failed: {e}")
|
| 166 |
continue
|
|
|
|
| 142 |
|
| 143 |
return False
|
| 144 |
|
| 145 |
+
# Phrases removed from the START of a text (English + Vietnamese).
# Matching is case-insensitive and only succeeds on whole phrases.
_CONVERSATIONAL_PREFIXES = (
    "hi", "hello", "hey", "greeting", "greetings",
    "xin chào", "chào bạn", "chào",
    "if you are a doctor", "if you're a doctor", "as a doctor",
    "nếu bạn là bác sĩ", "nếu bạn là doctor",
    "please", "vui lòng",
    "thank you for your question", "thanks for your question",
    "thank you", "thanks", "cảm ơn",
    "regards", "best regards", "cheers",
    "i hope this helps", "hy vọng điều này giúp ích",
    "i'm sorry", "tôi xin lỗi",
    "let me help", "để tôi giúp",
    "i understand", "tôi hiểu",
    "i can help", "tôi có thể giúp",
    "i'll be happy to", "tôi sẽ vui lòng",
    "i would be glad to", "tôi sẽ rất vui",
    "i'm here to help", "tôi ở đây để giúp",
    "i'm a doctor", "tôi là bác sĩ",
    "as a medical professional", "như một chuyên gia y tế",
    "from a medical perspective", "từ góc độ y tế",
    "medically speaking", "nói về mặt y tế",
)

# Phrases that start a closing remark; the phrase and everything after it on
# the final line of the text are removed.
_CONVERSATIONAL_SUFFIXES = (
    "hope this helps", "hy vọng điều này giúp ích",
    "let me know if you need more", "hãy cho tôi biết nếu bạn cần thêm",
    "feel free to ask", "đừng ngại hỏi",
    "if you have any questions", "nếu bạn có câu hỏi",
    "please let me know", "vui lòng cho tôi biết",
    "i'm here to help", "tôi ở đây để giúp",
    "best regards", "trân trọng",
    "take care", "chúc sức khỏe",
    "good luck", "chúc may mắn",
    "wishing you well", "chúc bạn khỏe mạnh",
)

# Filled lazily by _conversational_regexes() so the patterns are built once.
_CONVERSATIONAL_REGEX_CACHE = {}


def _conversational_regexes():
    """Return the compiled ``(prefix, suffix)`` regexes, building them once."""
    if not _CONVERSATIONAL_REGEX_CACHE:
        import re  # local import keeps this block independent of file-level imports

        def alternation(phrases):
            # Longest phrase first, otherwise "chào" would shadow "chào bạn".
            ordered = sorted(set(phrases), key=len, reverse=True)
            return "|".join(re.escape(phrase) for phrase in ordered)

        # The lookahead demands whitespace, sentence punctuation or end of text
        # after the phrase, so "hi" no longer eats the start of "High" and
        # "please" no longer eats the start of "Pleased". Trailing separators
        # (including the period in "I'm a doctor. ...") are consumed as well.
        _CONVERSATIONAL_REGEX_CACHE["prefix"] = re.compile(
            r"^\s*(?:" + alternation(_CONVERSATIONAL_PREFIXES) + r")(?=[\s,.!?:;]|$)[\s,.!?:;]*",
            re.IGNORECASE,
        )
        # '.' does not cross newlines and '$' is not in MULTILINE mode, so only
        # a closing remark on the last line is cut (same as the per-pattern
        # regexes this replaces). Separators left in front of the cut are
        # removed by the caller's final trim.
        _CONVERSATIONAL_REGEX_CACHE["suffix"] = re.compile(
            r"\b(?:" + alternation(_CONVERSATIONAL_SUFFIXES) + r")\b.*$",
            re.IGNORECASE,
        )
    return _CONVERSATIONAL_REGEX_CACHE["prefix"], _CONVERSATIONAL_REGEX_CACHE["suffix"]


def clean_conversational_elements(text: str) -> str:
    """Remove conversational openers and closers, keeping the actual content.

    Leading phrases such as greetings, thanks or "I'm a doctor" are stripped
    repeatedly until none is left, so their order does not matter. A closing
    remark such as "hope this helps" is cut together with everything after it.
    Whitespace is then collapsed and stray leading/trailing commas removed.

    Args:
        text: The text to clean. Falsy or non-string values are returned as is.

    Returns:
        The cleaned text, or the original ``text`` unchanged when cleaning
        would leave nothing (i.e. the input was purely conversational).
    """
    if not text or not isinstance(text, str):
        return text

    prefix_re, suffix_re = _conversational_regexes()

    cleaned_text = text
    # Each successful match shortens the string, so this loop terminates.
    match = prefix_re.match(cleaned_text)
    while match:
        cleaned_text = cleaned_text[match.end():]
        match = prefix_re.match(cleaned_text)

    cleaned_text = suffix_re.sub("", cleaned_text)

    # Collapse whitespace runs, then trim leftover commas/spaces at both ends.
    cleaned_text = " ".join(cleaned_text.split()).strip(", ")

    return cleaned_text if cleaned_text else text
|
| 202 |
+
|
| 203 |
def clean_invalid_response(text: str, fallback: str = "") -> str:
|
| 204 |
"""Clean invalid responses by returning fallback or empty string"""
|
| 205 |
if is_invalid_response(text):
|
|
|
|
| 211 |
if not is_invalid_response(text):
|
| 212 |
return text
|
| 213 |
|
| 214 |
+
# Clean conversational elements first
|
| 215 |
+
cleaned_text = clean_conversational_elements(text)
|
| 216 |
+
if cleaned_text != text and not is_invalid_response(cleaned_text):
|
| 217 |
+
return cleaned_text
|
| 218 |
+
|
| 219 |
for attempt in range(max_retries):
|
| 220 |
try:
|
| 221 |
+
# Try different strategies based on attempt
|
| 222 |
+
if attempt == 0:
|
| 223 |
+
# First try: Simple paraphrasing
|
| 224 |
+
retry_text = paraphraser.paraphrase(text, difficulty="easy")
|
| 225 |
+
elif attempt == 1:
|
| 226 |
+
# Second try: More aggressive paraphrasing with medical focus
|
| 227 |
+
medical_prompt = f"Rewrite this medical response to be more professional and accurate:\n\n{text}"
|
| 228 |
+
retry_text = paraphraser.paraphrase(text, difficulty="hard", custom_prompt=medical_prompt)
|
| 229 |
+
else:
|
| 230 |
+
# Third try: Direct medical content generation
|
| 231 |
+
medical_prompt = f"Provide a professional medical response to this question:\n\n{text}"
|
| 232 |
+
retry_text = paraphraser.paraphrase(text, difficulty="hard", custom_prompt=medical_prompt)
|
| 233 |
|
| 234 |
if retry_text and not is_invalid_response(retry_text):
|
| 235 |
+
# Clean conversational elements from retry
|
| 236 |
+
cleaned_retry = clean_conversational_elements(retry_text)
|
| 237 |
+
if cleaned_retry and not is_invalid_response(cleaned_retry):
|
| 238 |
+
return cleaned_retry
|
| 239 |
+
elif retry_text: # Use original retry if cleaning fails
|
| 240 |
+
return retry_text
|
| 241 |
+
|
| 242 |
except Exception as e:
|
| 243 |
logger.warning(f"Retry attempt {attempt + 1} failed: {e}")
|
| 244 |
continue
|
utils/processor.py
CHANGED
|
@@ -141,6 +141,8 @@ def _build_enriched_variants(user: str, out: str, paraphraser, opts: Dict, stats
|
|
| 141 |
enhanced_out = paraphraser.paraphrase(out, difficulty="hard", custom_prompt=style_prompt)
|
| 142 |
|
| 143 |
if enhanced_out and not A.is_invalid_response(enhanced_out):
|
|
|
|
|
|
|
| 144 |
if opts.get("style_standardize", True):
|
| 145 |
enhanced_out = A.style_standardize_answer(enhanced_out)
|
| 146 |
enhanced_out = A.ensure_terminal_punct(enhanced_out)
|
|
@@ -170,6 +172,8 @@ def _build_enriched_variants(user: str, out: str, paraphraser, opts: Dict, stats
|
|
| 170 |
enhanced_user = paraphraser.paraphrase(user, difficulty="hard", custom_prompt=style_prompt)
|
| 171 |
|
| 172 |
if enhanced_user and not A.is_invalid_response(enhanced_user):
|
|
|
|
|
|
|
| 173 |
enhanced_user = A.ensure_terminal_punct(enhanced_user)
|
| 174 |
question_variants.append((enhanced_user, tags))
|
| 175 |
stats["paraphrased_input"] += 1
|
|
@@ -237,6 +241,10 @@ def _apply_aug(instr: str, user: str, out: str, source: str, opts: Dict, paraphr
|
|
| 237 |
# Stack list of entries that has been applied augmentation and stylings
|
| 238 |
applied = []
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
# Clean invalid responses with retry logic
|
| 241 |
if A.is_invalid_response(out):
|
| 242 |
out = A.retry_invalid_response(out, paraphraser, max_retries=3)
|
|
@@ -306,9 +314,10 @@ def _proc_med_dialog(source, path, writer, paraphraser, opts, sample_limit, stat
|
|
| 306 |
try:
|
| 307 |
instr, user, out, applied = _apply_aug(instr, user, out, source, opts, paraphraser, stats)
|
| 308 |
|
| 309 |
-
# Skip if retry failed (empty output)
|
| 310 |
if not out:
|
| 311 |
stats["dropped_invalid"] = stats.get("dropped_invalid", 0) + 1
|
|
|
|
| 312 |
continue
|
| 313 |
|
| 314 |
# 1) ALWAYS write the original (cleaned/style-standardised only)
|
|
|
|
| 141 |
enhanced_out = paraphraser.paraphrase(out, difficulty="hard", custom_prompt=style_prompt)
|
| 142 |
|
| 143 |
if enhanced_out and not A.is_invalid_response(enhanced_out):
|
| 144 |
+
# Clean conversational elements
|
| 145 |
+
enhanced_out = A.clean_conversational_elements(enhanced_out)
|
| 146 |
if opts.get("style_standardize", True):
|
| 147 |
enhanced_out = A.style_standardize_answer(enhanced_out)
|
| 148 |
enhanced_out = A.ensure_terminal_punct(enhanced_out)
|
|
|
|
| 172 |
enhanced_user = paraphraser.paraphrase(user, difficulty="hard", custom_prompt=style_prompt)
|
| 173 |
|
| 174 |
if enhanced_user and not A.is_invalid_response(enhanced_user):
|
| 175 |
+
# Clean conversational elements
|
| 176 |
+
enhanced_user = A.clean_conversational_elements(enhanced_user)
|
| 177 |
enhanced_user = A.ensure_terminal_punct(enhanced_user)
|
| 178 |
question_variants.append((enhanced_user, tags))
|
| 179 |
stats["paraphrased_input"] += 1
|
|
|
|
| 241 |
# Stack list of entries that has been applied augmentation and stylings
|
| 242 |
applied = []
|
| 243 |
|
| 244 |
+
# Clean conversational elements first
|
| 245 |
+
out = A.clean_conversational_elements(out)
|
| 246 |
+
user = A.clean_conversational_elements(user)
|
| 247 |
+
|
| 248 |
# Clean invalid responses with retry logic
|
| 249 |
if A.is_invalid_response(out):
|
| 250 |
out = A.retry_invalid_response(out, paraphraser, max_retries=3)
|
|
|
|
| 314 |
try:
|
| 315 |
instr, user, out, applied = _apply_aug(instr, user, out, source, opts, paraphraser, stats)
|
| 316 |
|
| 317 |
+
# Skip if retry failed (empty output) - DO NOT RECORD FAILED RESPONSES
|
| 318 |
if not out:
|
| 319 |
stats["dropped_invalid"] = stats.get("dropped_invalid", 0) + 1
|
| 320 |
+
logger.warning(f"[PROC] {source} dropped invalid response for item {i} - will retry in next batch")
|
| 321 |
continue
|
| 322 |
|
| 323 |
# 1) ALWAYS write the original (cleaned/style-standardised only)
|