File size: 6,361 Bytes
94ec243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import pytest
from unittest.mock import patch, MagicMock

from scrapling.parser import Selector
from scrapling.core.shell import CustomShell, CurlParser, Convertor


class TestCurlParser:
    """Exercise CurlParser against a range of curl command lines."""

    @pytest.fixture
    def parser(self):
        """Provide a newly constructed CurlParser to each test."""
        curl_parser = CurlParser()
        return curl_parser

    def test_basic_curl_parse(self, parser):
        """A bare ``curl <url>`` is expected to parse as a body-less GET."""
        parsed = parser.parse('curl https://example.com')

        assert parsed.url == 'https://example.com'
        assert parsed.method == 'get'
        assert parsed.data is None

    def test_curl_with_headers(self, parser):
        """Each ``-H`` option is expected to show up in the parsed headers."""
        # Backslash-newline inside the literal joins the lines into one command.
        command = '''curl https://example.com \
            -H "User-Agent: Mozilla/5.0" \
            -H "Accept: application/json"'''

        parsed = parser.parse(command)

        expected_headers = (
            ('User-Agent', 'Mozilla/5.0'),
            ('Accept', 'application/json'),
        )
        for header_name, header_value in expected_headers:
            assert parsed.headers[header_name] == header_value

    def test_curl_with_data(self, parser):
        """Form bodies are expected on ``data``; JSON bodies on ``json_data``."""
        # URL-encoded form payload sent with -d
        form_request = parser.parse(
            'curl https://example.com -X POST -d "key=value&foo=bar"'
        )

        assert form_request.method == 'post'
        assert form_request.data == 'key=value&foo=bar'

        # JSON payload sent with --data-raw
        json_request = parser.parse(
            """curl https://example.com -X POST --data-raw '{"key": "value"}'"""
        )

        assert json_request.json_data == {"key": "value"}

    def test_curl_with_cookies(self, parser):
        """Cookies from a ``Cookie`` header and from ``-b`` are both expected."""
        command = '''curl https://example.com \
            -H "Cookie: session=abc123; user=john" \
            -b "extra=cookie"'''

        parsed = parser.parse(command)

        expected_cookies = (
            ('session', 'abc123'),
            ('user', 'john'),
            ('extra', 'cookie'),
        )
        for cookie_name, cookie_value in expected_cookies:
            assert parsed.cookies[cookie_name] == cookie_value

    def test_curl_with_proxy(self, parser):
        """``-x`` and ``-U`` are expected to combine into one proxy URL."""
        parsed = parser.parse(
            'curl https://example.com -x http://proxy:8080 -U user:pass'
        )

        http_proxy = parsed.proxy['http']
        assert 'http://user:pass@proxy:8080' in http_proxy

    def test_curl2fetcher(self, parser):
        """Converting a plain GET command should call ``Fetcher.get`` once."""
        with patch('scrapling.fetchers.Fetcher.get') as fake_get:
            # Replace the real fetcher call so the test never hits the network.
            fake_get.return_value = MagicMock()

            parser.convert2fetcher('curl https://example.com')

            fake_get.assert_called_once()

    def test_invalid_curl_commands(self, parser):
        """Text that is not a curl command is expected to raise AttributeError."""
        not_a_curl_command = 'not a curl command'

        with pytest.raises(AttributeError):
            parser.parse(not_a_curl_command)


class TestConvertor:
    """Exercise Convertor's content extraction and file-writing helpers."""

    @pytest.fixture
    def sample_html(self):
        """Return a small HTML document with a heading and a paragraph."""
        return """
        <html>
            <body>
                <div class="content">
                    <h1>Title</h1>
                    <p>Some text content</p>
                </div>
            </body>
        </html>
        """

    @staticmethod
    def _extract(html, extraction_type, **options):
        """Parse *html* and collect ``Convertor._extract_content`` output in a list."""
        return list(
            Convertor._extract_content(Selector(html), extraction_type, **options)
        )

    def test_extract_markdown(self, sample_html):
        """Markdown extraction should render the heading in Markdown form."""
        chunks = self._extract(sample_html, "markdown")

        assert chunks
        assert "Title\n=====" in chunks[0]  # heading text with an '=' underline

    def test_extract_html(self, sample_html):
        """HTML extraction should keep the original heading markup."""
        chunks = self._extract(sample_html, "html")

        assert chunks
        assert "<h1>Title</h1>" in chunks[0]

    def test_extract_text(self, sample_html):
        """Plain-text extraction should contain the heading and paragraph text."""
        chunks = self._extract(sample_html, "text")

        assert chunks
        first_chunk = chunks[0]
        assert "Title" in first_chunk
        assert "Some text content" in first_chunk

    def test_extract_with_selector(self, sample_html):
        """Extraction narrowed by a CSS selector should still yield content."""
        chunks = self._extract(sample_html, "text", css_selector=".content")

        assert chunks

    def test_write_to_file(self, sample_html, tmp_path):
        """Writing to .md, .html and .txt targets should create each file."""
        page = Selector(sample_html)

        # Same order as the formats are listed: Markdown, HTML, plain text.
        for filename in ("output.md", "output.html", "output.txt"):
            target = tmp_path / filename
            Convertor.write_content_to_file(page, str(target))
            assert target.exists()

    def test_invalid_operations(self, sample_html):
        """Bad extraction types and bad filenames are expected to raise ValueError."""
        page = Selector(sample_html)

        # Unsupported extraction type
        with pytest.raises(ValueError):
            list(Convertor._extract_content(page, "invalid"))

        # Empty filename first, then a filename with an unknown extension
        for bad_filename in ("", "output.xyz"):
            with pytest.raises(ValueError):
                Convertor.write_content_to_file(page, bad_filename)


class TestCustomShell:
    """Exercise construction of CustomShell and the namespace it exposes."""

    def test_shell_initialization(self):
        """A new shell should start with a numeric log level and no pages."""
        debug_shell = CustomShell(code="", log_level="debug")

        # "debug" is expected to be stored as the numeric level 10 (DEBUG).
        assert debug_shell.log_level == 10
        assert debug_shell.page is None
        assert len(debug_shell.pages) == 0

    def test_shell_namespace(self):
        """The shell namespace should expose the expected helper names."""
        namespace = CustomShell(code="").get_namespace()

        # Every one of these functions/classes must be present in the namespace.
        expected_names = (
            'get',
            'post',
            'Fetcher',
            'DynamicFetcher',
            'view',
            'uncurl',
        )
        for expected_name in expected_names:
            assert expected_name in namespace