""" Tests for the scraper module. These run offline by mocking `requests.Session.get` — we test parsing logic against fixture HTML, never hit Amazon/Flipkart in CI. """ from unittest.mock import patch, MagicMock import pytest from src.scraper import scrape_url, _amazon, _flipkart, _generic from bs4 import BeautifulSoup AMAZON_HTML = """ Samsung Galaxy S24 Ultra 5G

A flagship Android phone with titanium build.

RAM12 GB
Battery5000 mAh
""" FLIPKART_HTML = """

OnePlus 12R 5G (Cool Blue, 256 GB)

""" GENERIC_HTML = """ Widget Pro

Widget Pro

This paragraph has enough length to be captured by the generic extractor.

Another long paragraph describing how the widget works in practice.

""" def _mock_response(html: str, status: int = 200): """Build a mock response that behaves like requests.Response enough for scrape_url.""" resp = MagicMock() resp.status_code = status resp.encoding = "utf-8" resp.text = html resp.content = html.encode("utf-8") resp.raw.read.return_value = html.encode("utf-8") return resp def test_amazon_parser_extracts_title_features_specs(): soup = BeautifulSoup(AMAZON_HTML, "html.parser") data = _amazon(soup) assert "Samsung Galaxy S24 Ultra" in data["title"] assert "6.8-inch" in data["features"] assert "200 MP" in data["features"] assert "hidden bullet" not in data["features"] assert "titanium" in data["description"] assert "RAM: 12 GB" in data["specs"] assert "Battery: 5000 mAh" in data["specs"] def test_flipkart_parser_extracts_title_and_highlights(): soup = BeautifulSoup(FLIPKART_HTML, "html.parser") data = _flipkart(soup) assert "OnePlus 12R" in data["title"] assert "Snapdragon" in data["features"] assert "5500 mAh" in data["features"] def test_generic_parser_uses_og_metadata(): soup = BeautifulSoup(GENERIC_HTML, "html.parser") data = _generic(soup) assert "Widget Pro" in data["title"] assert "premium widget" in data["description"] def test_scrape_url_amazon_integration(): with patch("src.scraper._get_session") as mock_session: sess = MagicMock() sess.get.return_value = _mock_response(AMAZON_HTML) mock_session.return_value = sess result = scrape_url("https://www.amazon.in/dp/B0TEST") assert result["source"] == "amazon" assert "Samsung" in result["title"] assert result["char_count"] > 50 assert "error" not in result assert "Product:" in result["context"] assert "Features:" in result["context"] def test_scrape_url_handles_http_error(): with patch("src.scraper._get_session") as mock_session: sess = MagicMock() sess.get.return_value = _mock_response("", status=403) mock_session.return_value = sess result = scrape_url("https://example.com/blocked") assert "error" in result assert "403" in result["error"] or "blocked" in result["error"].lower() def test_scrape_url_rejects_empty_input(): assert "error" in scrape_url("") assert "error" in scrape_url(" ") def test_scrape_url_adds_https_scheme(): with patch("src.scraper._get_session") as mock_session: sess = MagicMock() sess.get.return_value = _mock_response(GENERIC_HTML) mock_session.return_value = sess scrape_url("example.com/product") called_url = sess.get.call_args[0][0] assert called_url.startswith("https://") def test_scrape_url_returns_warning_on_sparse_content(): with patch("src.scraper._get_session") as mock_session: sess = MagicMock() sess.get.return_value = _mock_response("") mock_session.return_value = sess result = scrape_url("https://example.com/empty") # Either an error or a warning is acceptable for an empty page assert result.get("warning") or result.get("error") if __name__ == "__main__": pytest.main([__file__, "-v"])