Spaces:

LaurentBlanc
/

PDF-summarizer-backend

Sleeping

App Files Files Community

Laurenc Kaefer committed on Nov 6, 2025

Commit

ec7ebdd

1 Parent(s): 3ec642e

ADD: tests for backend files

Browse files

Files changed (3) hide show

backend/test_extract_pdf_text.py +47 -0
backend/test_generate_summary.py +17 -0
backend/test_main.py +47 -0

backend/test_extract_pdf_text.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import pytest
+import fitz
+from .extract_pdf_text import extract_pdf_text, _clean_pages
+def test_extract_pdf_text():
+    """Exercise ``extract_pdf_text`` on empty input and on each rejected input."""
+    # Empty content is expected to come back as an empty string, not an error.
+    assert extract_pdf_text(content="") == ""
+    # Each row: (argument, expected exception type, regex searched in the message).
+    rejected_inputs = [
+        ("plain text", ValueError, "Input corrupted, missing ','"),
+        (2, ValueError, r"Input has wrong type. Should be str, was .*"),
+        (",JVBERi0xLjMgC", ValueError, r"Invalid base64-encoded string:.*"),
+        ("data,", fitz.FileDataError, "Cannot open empty stream"),
+        ("data, asdf", fitz.FileDataError, "Failed to open stream"),
+        ("data,dd", ValueError, "Incorrect padding"),
+    ]
+    for argument, expected_error, message_pattern in rejected_inputs:
+        with pytest.raises(expected_error, match=message_pattern):
+            extract_pdf_text(argument)
+@pytest.mark.parametrize(
+    "pages, output_expected",
+    [
+        (["H"], [""]),  # 2 or fewer characters (usually not content)
+        (["hh"], [""]),  # 2 or fewer characters (usually not content)
+        (["44A"], [""]),  # more digits than letters (usually not content)
+        (["to."], ["to."]),  # possible end of a sentence (with a minimal number of letters)
+        (
+            ["header\ncontent1\nfooter", "header\ncontent2\nfooter"],
+            ["content1", "content2"],
+        ),  # header and footer removed
+        (
+            ["header\ncontent1 and footer in line", "header\ncontent2\nfooter"],
+            ["content1 and footer in line", "content2\nfooter"],
+        ),  # header removed; "footer" stays on the second page because the first page has no footer line
+        (
+            ["content1", "header\ncontent2", "header\ncontent3"],
+            ["content1", "content2", "content3"],
+        ),  # header removed (present on most pages)
+    ],
+)
+def test__clean_pages(pages, output_expected):
+    """Check that ``_clean_pages`` turns each list of page texts into the expected list.
+
+    ``pages`` is the per-page input handed to ``_clean_pages``; ``output_expected`` is the
+    list it must return. The argument is called ``pages`` (not ``input``) so the builtin
+    ``input`` is not shadowed inside the test.
+    """
+    assert output_expected == _clean_pages(pages)

backend/test_generate_summary.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import pytest
+from generate_summary import generate_summary
+def test_generate_summary():
+    """Cover ``generate_summary`` for empty text, a short article, and a non-string argument."""
+    # Empty text is expected to produce an empty summary.
+    empty_summary = generate_summary(text="")
+    assert empty_summary == ""
+    # For real text only the result type is pinned, not the wording of the summary.
+    article = """The mayor of the city announced new environmental policies today aimed at reducing emissions
+    by 40% by 2035. The initiative includes expanded public transit, incentives for electric vehicles, and stricter
+    regulations on industrial polluters."""
+    article_summary = generate_summary(text=article)
+    assert type(article_summary) is str
+    # A non-string argument must be rejected with a ValueError naming the wrong type.
+    wrong_type_pattern = r"Input has wrong type. Should be str, was .*"
+    with pytest.raises(ValueError, match=wrong_type_pattern):
+        generate_summary(2)

backend/test_main.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import pytest
+from .main import api
+@pytest.mark.parametrize(
+    "payload, status_code, error_msg",
+    [
+        (4, 415, "input was not a json, was <class 'int'>"),
+        ({"wrongKey": None}, 400, "missing field 'content'"),
+        ({"content": None}, 400, "missing field 'content'"),
+        ({"content": 1}, 415, "content must be of type str, was <class 'int'>"),
+        ({"content": "", "second_key": None}, 400, "unexpected additional JSON fields"),
+        ({"content": ""}, 415, "content must be a 'data:application/pdf;' URI"),
+        ({"content": "wrong formatted"}, 415, "content must be a 'data:application/pdf;' URI"),
+        ({"content": "data:application/pdf;wrongEncodingScheme"}, 415, "content must be base64-encoded"),
+        ({"content": "data:application/pdf;base64,"}, 422, "Invalid or empty PDF"),
+        ({"content": "data:application/pdf;base64,dd"}, 422, "Incorrect padding"),
+    ],
+)
+def test_extract_text(payload, status_code, error_msg):
+    """POST each invalid body to ``/extract-text`` and check the error response.
+
+    ``payload`` is sent as the JSON body; the response must carry ``status_code`` and a JSON
+    body equal to ``{"error": error_msg}``. The argument is called ``payload`` (not ``input``)
+    so the builtin ``input`` is not shadowed inside the test.
+    """
+    client = api.test_client()
+    response = client.post("/extract-text", json=payload)
+    assert response.status_code == status_code
+    assert response.get_json() == {"error": error_msg}
+@pytest.mark.parametrize(
+    "payload, status_code, msg_type, msg",
+    [
+        (4, 415, "error", "input was not a json, was <class 'int'>"),
+        ({"wrongKey": None}, 400, "error", "missing field 'text'"),
+        ({"text": None}, 400, "error", "missing field 'text'"),
+        ({"text": 1}, 415, "error", "text must be of type str, was <class 'int'>"),
+        ({"text": "", "second_key": None}, 400, "error", "unexpected additional JSON fields"),
+        ({"text": ""}, 200, "summary", ""),
+        ({"text": "This is a test article to summarize."}, 200, "summary", "test"),
+    ],
+)
+def test_summarize_text(payload, status_code, msg_type, msg):
+    """POST each body to ``/summarize-text`` and check status code and response content.
+
+    ``payload`` is sent as the JSON body. The response must carry ``status_code``, its JSON
+    body must contain the key ``msg_type`` ("error" or "summary"), and ``msg`` must occur
+    as a substring of the value stored under that key. The argument is called ``payload``
+    (not ``input``) so the builtin ``input`` is not shadowed inside the test.
+    """
+    client = api.test_client()
+    response = client.post("/summarize-text", json=payload)
+    data = response.get_json()
+    assert response.status_code == status_code
+    assert msg_type in data
+    # NOTE(review): this is a substring check, so the row with msg == "" passes for any
+    # string value — confirm whether an exact comparison is wanted for that case.
+    assert msg in data[msg_type]