Laurenc Kaefer committed on
Commit
ec7ebdd
·
1 Parent(s): 3ec642e

ADD: tests for backend files

Browse files
backend/test_extract_pdf_text.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ import fitz
3
+ from .extract_pdf_text import extract_pdf_text, _clean_pages
4
+
5
+
6
def test_extract_pdf_text():
    """Check that an empty input string yields an empty extraction result."""
    assert extract_pdf_text(content="") == ""


@pytest.mark.parametrize(
    "content, expected_exception, expected_match",
    [
        # No ',' separator in the input.
        ("plain text", ValueError, "Input corrupted, missing ','"),
        # Non-string input.
        (2, ValueError, r"Input has wrong type. Should be str, was .*"),
        # Payload after the ',' is not valid base64.
        (",JVBERi0xLjMgC", ValueError, r"Invalid base64-encoded string:.*"),
        # Nothing after the ',' separator.
        ("data,", fitz.FileDataError, "Cannot open empty stream"),
        # Payload decodes, but the result cannot be opened as a document.
        ("data, asdf", fitz.FileDataError, "Failed to open stream"),
        # Base64 payload with missing padding characters.
        ("data,dd", ValueError, "Incorrect padding"),
    ],
)
def test_extract_pdf_text_error_paths(content, expected_exception, expected_match):
    """Check that malformed inputs raise the expected exception and message.

    Each case is a separate parametrized test so that one failing error path
    does not hide the outcome of the remaining ones.

    Args:
        content: The value passed to ``extract_pdf_text``.
        expected_exception: Exception type the call is expected to raise.
        expected_match: Regular expression searched for in the exception text.
    """
    with pytest.raises(expected_exception, match=expected_match):
        extract_pdf_text(content)
23
+
24
+
25
@pytest.mark.parametrize(
    "pages, expected",
    [
        (["H"], [""]),  # 2 or fewer characters (usually not content)
        (["hh"], [""]),  # 2 or fewer characters (usually not content)
        (["44A"], [""]),  # more digits than letters (usually not content)
        (["to."], ["to."]),  # possible end of a sentence (minimal amount of letters)
        (
            ["header\ncontent1\nfooter", "header\ncontent2\nfooter"],
            ["content1", "content2"],
        ),  # header and footer removed
        (
            ["header\ncontent1 and footer in line", "header\ncontent2\nfooter"],
            ["content1 and footer in line", "content2\nfooter"],
        ),  # header removed; footer kept (not detected on page 2 since it is not a separate line on page 1)
        (
            ["content1", "header\ncontent2", "header\ncontent3"],
            ["content1", "content2", "content3"],
        ),  # header removed (present on most pages)
    ],
)
def test__clean_pages(pages, expected):
    """Check that ``_clean_pages`` returns the expected cleaned page texts.

    Args:
        pages: List of raw page strings handed to ``_clean_pages``.
        expected: List of page strings the call is expected to return.
    """
    assert _clean_pages(pages) == expected
backend/test_generate_summary.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from generate_summary import generate_summary
3
+
4
+
5
def test_generate_summary():
    """Exercise ``generate_summary`` with empty, regular and wrongly typed input."""
    # Empty input is expected to produce an empty summary.
    assert generate_summary(text="") == ""

    # Regular input: only the result type is checked, not the summary text.
    article = """The mayor of the city announced new environmental policies today aimed at reducing emissions
by 40% by 2035. The initiative includes expanded public transit, incentives for electric vehicles, and stricter
regulations on industrial polluters."""
    assert isinstance(generate_summary(text=article), str)

    # Non-string input is expected to be rejected with a ValueError.
    with pytest.raises(ValueError, match=r"Input has wrong type. Should be str, was .*"):
        generate_summary(2)
backend/test_main.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from .main import api
3
+
4
+
5
@pytest.mark.parametrize(
    "payload, status_code, error_msg",
    [
        (4, 415, "input was not a json, was <class 'int'>"),
        ({"wrongKey": None}, 400, "missing field 'content'"),
        ({"content": None}, 400, "missing field 'content'"),
        ({"content": 1}, 415, "content must be of type str, was <class 'int'>"),
        ({"content": "", "second_key": None}, 400, "unexpected additional JSON fields"),
        ({"content": ""}, 415, "content must be a 'data:application/pdf;' URI"),
        ({"content": "wrong formatted"}, 415, "content must be a 'data:application/pdf;' URI"),
        ({"content": "data:application/pdf;wrongEncodingScheme"}, 415, "content must be base64-encoded"),
        ({"content": "data:application/pdf;base64,"}, 422, "Invalid or empty PDF"),
        ({"content": "data:application/pdf;base64,dd"}, 422, "Incorrect padding"),
    ],
)
def test_extract_text(payload, status_code, error_msg):
    """Check the error responses of the ``/extract-text`` endpoint.

    Args:
        payload: Value sent as the JSON body of the POST request.
        status_code: HTTP status code the response is expected to carry.
        error_msg: Expected value of the ``"error"`` field in the JSON response.
    """
    client = api.test_client()
    response = client.post("/extract-text", json=payload)

    assert response.status_code == status_code
    assert response.get_json() == {"error": error_msg}
26
+
27
+
28
@pytest.mark.parametrize(
    "payload, status_code, msg_type, msg",
    [
        (4, 415, "error", "input was not a json, was <class 'int'>"),
        ({"wrongKey": None}, 400, "error", "missing field 'text'"),
        ({"text": None}, 400, "error", "missing field 'text'"),
        ({"text": 1}, 415, "error", "text must be of type str, was <class 'int'>"),
        ({"text": "", "second_key": None}, 400, "error", "unexpected additional JSON fields"),
        ({"text": ""}, 200, "summary", ""),
        ({"text": "This is a test article to summarize."}, 200, "summary", "test"),
    ],
)
def test_summarize_text(payload, status_code, msg_type, msg):
    """Check the responses of the ``/summarize-text`` endpoint.

    Args:
        payload: Value sent as the JSON body of the POST request.
        status_code: HTTP status code the response is expected to carry.
        msg_type: Key expected in the JSON response (``"error"`` or ``"summary"``).
        msg: Text expected to be contained in the value stored under ``msg_type``.
    """
    client = api.test_client()
    response = client.post("/summarize-text", json=payload)
    data = response.get_json()

    assert response.status_code == status_code
    assert msg_type in data
    # Substring containment rather than equality: the expected text only has
    # to appear somewhere in the returned message.
    assert msg in data[msg_type]