File size: 12,319 Bytes
8e9e85e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
"""Unit tests for AudioRefiner agent."""

import pytest
from unittest.mock import AsyncMock, Mock, patch

from src.agents.audio_refiner import AudioRefiner, refine_text_for_audio


class TestAudioRefiner:
    """Test suite for AudioRefiner functionality."""

    @pytest.fixture
    def refiner(self):
        """Build an AudioRefiner with default constructor arguments for a test."""
        instance = AudioRefiner()
        return instance

    def test_remove_markdown_headers(self, refiner):
        """Header markers (#, ##, ###) are stripped while heading text survives."""
        source_lines = ["# Main Title", "## Subtitle", "### Section", "Content here"]
        cleaned = refiner._remove_markdown_syntax("\n".join(source_lines))

        # No hash characters may remain anywhere in the output.
        assert "#" not in cleaned
        for heading in ("Main Title", "Subtitle"):
            assert heading in cleaned

    def test_remove_bold_italic(self, refiner):
        """Emphasis markers are dropped and the emphasised words are kept."""
        sample = "**Bold text** and *italic text* and __another bold__"
        cleaned = refiner._remove_markdown_syntax(sample)

        for marker in ("**", "*", "__"):
            assert marker not in cleaned
        for phrase in ("Bold text", "italic text"):
            assert phrase in cleaned

    def test_remove_links(self, refiner):
        """Link brackets and the target URL are stripped; the anchor text is kept."""
        sample = "Check [this link](https://example.com) for details"
        cleaned = refiner._remove_markdown_syntax(sample)

        for fragment in ("[", "]", "https://"):
            assert fragment not in cleaned
        assert "this link" in cleaned

    def test_remove_citations_numbered(self, refiner):
        """Bracketed numeric citations (single, list and range) are stripped."""
        sentence = "Research shows [1] that metformin [2,3] works [4-6]."
        stripped = refiner._remove_citations(sentence)

        for marker in ("[1]", "[2,3]", "[4-6]"):
            assert marker not in stripped
        assert "Research shows" in stripped

    def test_remove_citations_author_year(self, refiner):
        """Parenthesised author-year citations are stripped; prose is kept."""
        sentence = "Studies (Smith et al., 2023) and (Jones, 2022) confirm this."
        stripped = refiner._remove_citations(sentence)

        for citation in ("(Smith et al., 2023)", "(Jones, 2022)"):
            assert citation not in stripped
        for phrase in ("Studies", "confirm this"):
            assert phrase in stripped

    def test_remove_first_references_section(self, refiner):
        """Every References section is dropped; content between them is kept."""
        document = "\n".join(
            [
                "Main content here.",
                "",
                "# References",
                "[1] First reference",
                "[2] Second reference",
                "",
                "# More Content",
                "This should remain.",
                "",
                "## References",
                "This second References should also be removed.",
            ]
        )

        cleaned = refiner._remove_references_sections(document)

        # (fragment, whether it must still be present after cleanup)
        expectations = [
            ("Main content here", True),
            ("References", False),
            ("First reference", False),
            ("More Content", True),  # section following the first References block
            ("This should remain", True),
            ("second References should also be removed", False),  # second block
        ]
        for fragment, should_be_present in expectations:
            assert (fragment in cleaned) is should_be_present

    def test_roman_to_int_conversion(self, refiner):
        """_roman_to_int maps each sample numeral to its integer value."""
        expected_values = {
            "I": 1,
            "II": 2,
            "III": 3,
            "IV": 4,
            "V": 5,
            "IX": 9,
            "X": 10,
            "XII": 12,
            "XX": 20,
        }
        for numeral, value in expected_values.items():
            assert refiner._roman_to_int(numeral) == value

    def test_int_to_word_conversion(self, refiner):
        """_int_to_word spells out known numbers and falls back to digits."""
        expected_words = {
            1: "One",
            2: "Two",
            3: "Three",
            10: "Ten",
            20: "Twenty",
            25: "25",  # no word form expected: digits are returned as a string
        }
        for number, word in expected_words.items():
            assert refiner._int_to_word(number) == word

    def test_convert_roman_numerals_with_context(self, refiner):
        """Numerals following a context word (Phase, Type, Stage, Trial) are spelled out."""
        expected_by_input = {
            "Phase I trial": "Phase One trial",
            "Phase II study": "Phase Two study",
            "Phase III data": "Phase Three data",
            "Type I diabetes": "Type One diabetes",
            "Type II error": "Type Two error",
            "Stage IV cancer": "Stage Four cancer",
            "Trial I results": "Trial One results",
        }

        for input_text, expected in expected_by_input.items():
            converted = refiner._convert_roman_numerals(input_text)
            assert expected in converted, f"Failed for: {input_text}"

    def test_convert_standalone_roman_numerals(self, refiner):
        """At least one of the standalone numerals I, II, III is spelled out."""
        sentence = "Results for I, II, and III are positive."
        converted = refiner._convert_roman_numerals(sentence)
        # Only requires that some conversion happened, not that all three did.
        assert any(word in converted for word in ("One", "Two", "Three"))

    def test_dont_convert_roman_in_words(self, refiner):
        """Numeral letters embedded in a longer word must not be rewritten."""
        sentence = "INVALID data fromIXIN compound"
        converted = refiner._convert_roman_numerals(sentence)
        # The word containing I, V and L must survive; a case-normalised
        # spelling is accepted as well.
        assert any(spelling in converted for spelling in ("INVALID", "Invalid"))

    def test_clean_special_characters(self, refiner):
        """Typographic punctuation is replaced by plain ASCII equivalents.

        The input deliberately contains no ASCII hyphen, so finding one in the
        output shows the em-dash was converted rather than the assertion
        passing on a hyphen that was already there. Both the opening and the
        closing form of each smart quote are checked.
        """
        # Using unicode escapes to avoid syntax issues
        text = "Text with \u2014 a dash and \u201csmart quotes\u201d and \u2018apostrophes\u2019."
        # Guard: keeps the hyphen assertion below meaningful if the input is edited.
        assert "-" not in text

        result = refiner._clean_special_characters(text)

        typographic = {
            "\u2014": "em-dash",
            "\u201c": "smart quote open",
            "\u201d": "smart quote close",
            "\u2018": "smart apostrophe open",
            "\u2019": "smart apostrophe close",
        }
        for char, label in typographic.items():
            assert char not in result, f"{label} (U+{ord(char):04X}) survived cleanup"
        assert "-" in result

    def test_normalize_whitespace(self, refiner):
        """Runs of spaces and runs of blank lines are collapsed."""
        messy = "Text  with   multiple    spaces\n\n\n\nand many newlines"
        tidy = refiner._normalize_whitespace(messy)
        # Neither a doubled space nor three consecutive newlines may remain.
        for run in ("  ", "\n\n\n"):
            assert run not in tidy

    async def test_full_refine_workflow(self, refiner):
        """Test complete refinement workflow."""
        markdown_text = """# Summary

**Metformin** shows promise for *long COVID* treatment [1].

## Phase I Trials

Research (Smith et al., 2023) indicates [2,3]:
- 50% improvement
- Low side effects

Check [this study](https://example.com) for details.

# References
[1] Smith, J. et al. (2023)
[2] Jones, K. (2022)
"""

        result = await refiner.refine_for_audio(markdown_text)

        # Check markdown removed
        assert "#" not in result
        assert "**" not in result
        assert "*" not in result

        # Check citations removed
        assert "[1]" not in result
        assert "(Smith et al., 2023)" not in result

        # Check roman numerals converted
        assert "Phase One" in result

        # Check references section removed
        assert "References" not in result
        assert "Smith, J. et al." not in result

        # Check content preserved
        assert "Metformin" in result
        assert "long COVID" in result

    async def test_convenience_function(self):
        """The module-level helper strips bold and link markup from its input."""
        refined = await refine_text_for_audio("**Bold** text with [link](url)")

        for markup in ("**", "[link]"):
            assert markup not in refined
        assert "Bold" in refined

    async def test_empty_text(self, refiner):
        """Empty and whitespace-only input both refine to the empty string."""
        for blank in ("", "   "):
            assert await refiner.refine_for_audio(blank) == ""

    async def test_no_references_section(self, refiner):
        """Input with no References section keeps its content."""
        refined = await refiner.refine_for_audio("Main content without references.")
        assert "Main content without references" in refined

    def test_multiple_reference_formats(self, refiner):
        """Check which References heading styles trigger section removal.

        Each case is a (heading block, should_remove) pair. For cases marked
        True the word "References" must be gone from the result. The
        unmarked "References:" form is listed for documentation only: it is a
        known unhandled edge case, so nothing is asserted about whether it
        stays or goes.
        """
        formats = [
            ("# References\nContent", True),  # Markdown header - will be removed
            ("## References\nContent", True),  # Markdown header - will be removed
            ("**References**\nContent", True),  # Bold heading - will be removed
            ("References:\nContent", False),  # Standalone without markers - NOT removed (edge case)
        ]

        for format_text, should_remove in formats:
            text = f"Main content\n{format_text}"
            result = refiner._remove_references_sections(text)
            # Messages name the failing case, since all cases share one test.
            assert "Main content" in result, f"Main content lost for: {format_text!r}"
            if should_remove:
                assert "References" not in result, f"References kept for: {format_text!r}"

    def test_preserve_paragraph_structure(self, refiner):
        """Blank-line paragraph breaks survive whitespace normalization."""
        paragraphs = ["First paragraph.", "Second paragraph.", "Third paragraph."]
        normalized = refiner._normalize_whitespace("\n\n".join(paragraphs))

        # A double newline (paragraph break) is still present...
        assert "\n\n" in normalized
        # ...but never three or more in a row.
        assert "\n\n\n" not in normalized

    @patch('src.agents.audio_refiner.get_pydantic_ai_model')
    async def test_llm_polish_disabled_by_default(self, mock_get_model, refiner):
        """LLM polish stays off unless the caller asks for it.

        The first call omits ``use_llm_polish`` so the parameter's default is
        what is exercised; the second passes ``False`` explicitly. In both
        cases the patched model factory must never be invoked and the input
        text must come through.
        """
        text = "Test text"

        # Default: no use_llm_polish argument at all.
        default_result = await refiner.refine_for_audio(text)
        mock_get_model.assert_not_called()
        assert "Test text" in default_result

        # Explicit opt-out behaves the same way.
        explicit_result = await refiner.refine_for_audio(text, use_llm_polish=False)
        mock_get_model.assert_not_called()
        assert "Test text" in explicit_result

    @patch('src.agents.audio_refiner.Agent')
    @patch('src.agents.audio_refiner.get_pydantic_ai_model')
    async def test_llm_polish_enabled(self, mock_get_model, mock_agent_class, refiner):
        """With use_llm_polish=True the patched agent's output is returned."""
        # Stand-ins: the model factory returns a dummy model, and the Agent
        # class returns an instance whose async run() yields the polished text.
        mock_get_model.return_value = Mock()
        polished = Mock(output="Polished text")
        fake_agent = Mock()
        fake_agent.run = AsyncMock(return_value=polished)
        mock_agent_class.return_value = fake_agent

        result = await refiner.refine_for_audio("**Test** text", use_llm_polish=True)

        # Model lookup, Agent construction and run() each happened exactly once.
        for collaborator in (mock_get_model, mock_agent_class, fake_agent.run):
            collaborator.assert_called_once()

        assert result == "Polished text"

    @patch('src.agents.audio_refiner.Agent')
    @patch('src.agents.audio_refiner.get_pydantic_ai_model')
    async def test_llm_polish_graceful_fallback(self, mock_get_model, mock_agent_class, refiner):
        """A failing agent run must not lose the text when polish is enabled."""
        # The patched Agent instance raises from its async run().
        failing_agent = Mock()
        failing_agent.run = AsyncMock(side_effect=Exception("API Error"))
        mock_agent_class.return_value = failing_agent
        mock_get_model.return_value = Mock()

        result = await refiner.refine_for_audio("Test text", use_llm_polish=True)

        # The original text still comes back and the result is non-empty.
        assert "Test text" in result
        assert result != ""

    async def test_convenience_function_with_llm_polish(self):
        """refine_text_for_audio forwards use_llm_polish to the refiner.

        AudioRefiner.refine_for_audio is replaced by an AsyncMock whose awaited
        result is a plain string. The return value must not be a pre-built
        coroutine object: one coroutine can only be awaited once, so sharing
        it across both calls either leaves it un-awaited or fails on reuse.
        """
        with patch.object(
            AudioRefiner,
            'refine_for_audio',
            new_callable=AsyncMock,
            return_value="Refined text",
        ) as mock_refine:
            # Without, then with, LLM polish.
            for flag in (False, True):
                result = await refine_text_for_audio("Test", use_llm_polish=flag)
                mock_refine.assert_called_with("Test", use_llm_polish=flag)
                # The helper hands back whatever the refiner produced.
                assert result == "Refined text"