Spaces:
Sleeping
Sleeping
Laurenc Kaefer committed on
Commit ·
ec7ebdd
1
Parent(s): 3ec642e
ADD: tests for backend files
Browse files- backend/test_extract_pdf_text.py +47 -0
- backend/test_generate_summary.py +17 -0
- backend/test_main.py +47 -0
backend/test_extract_pdf_text.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
import fitz
|
| 3 |
+
from .extract_pdf_text import extract_pdf_text, _clean_pages
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_extract_pdf_text():
    """Check extract_pdf_text on empty input and on a series of malformed inputs."""
    # An empty string is expected to come back as an empty string.
    assert extract_pdf_text(content="") == ""

    # Each entry: (malformed input, expected exception type, pattern its message must match).
    failing_inputs = [
        ("plain text", ValueError, "Input corrupted, missing ','"),
        (2, ValueError, r"Input has wrong type. Should be str, was .*"),
        (",JVBERi0xLjMgC", ValueError, r"Invalid base64-encoded string:.*"),
        ("data,", fitz.FileDataError, "Cannot open empty stream"),
        ("data, asdf", fitz.FileDataError, "Failed to open stream"),
        ("data,dd", ValueError, "Incorrect padding"),
    ]
    for bad_input, error_type, message_pattern in failing_inputs:
        with pytest.raises(error_type, match=message_pattern):
            extract_pdf_text(bad_input)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@pytest.mark.parametrize(
    "input, output_expected",
    [
        # Two or fewer characters (usually not content).
        (["H"], [""]),
        (["hh"], [""]),
        # More numbers than characters (usually not content).
        (["44A"], [""]),
        # Possible ending of a sentence (with a minimal amount of letters).
        (["to."], ["to."]),
        # Header and footer removed.
        (
            ["header\ncontent1\nfooter", "header\ncontent2\nfooter"],
            ["content1", "content2"],
        ),
        # Header removed, but the in-line footer is kept; the footer on page 2
        # is not detected because it is not present as its own line on page 1.
        (
            ["header\ncontent1 and footer in line", "header\ncontent2\nfooter"],
            ["content1 and footer in line", "content2\nfooter"],
        ),
        # Header removed (present on most pages).
        (
            ["content1", "header\ncontent2", "header\ncontent3"],
            ["content1", "content2", "content3"],
        ),
    ],
)
def test__clean_pages(input, output_expected):
    """Check that _clean_pages turns each list of page texts into the expected list."""
    cleaned_pages = _clean_pages(input)
    assert cleaned_pages == output_expected
|
backend/test_generate_summary.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from generate_summary import generate_summary
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_generate_summary():
    """Exercise generate_summary with empty, regular, and wrongly typed input."""
    # Empty input is expected to yield an empty summary.
    assert generate_summary(text="") == ""

    # Regular input: only the result type is checked here, not the summary wording.
    article = (
        "The mayor of the city announced new environmental policies today aimed at reducing emissions\n"
        "by 40% by 2035. The initiative includes expanded public transit, incentives for electric vehicles, and stricter\n"
        "regulations on industrial polluters."
    )
    # isinstance is the idiomatic type check and also accepts str subclasses.
    assert isinstance(generate_summary(text=article), str)

    # A non-string argument must be rejected with a ValueError.
    with pytest.raises(ValueError, match=r"Input has wrong type. Should be str, was .*"):
        generate_summary(2)
|
backend/test_main.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from .main import api
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@pytest.mark.parametrize(
    "input, status_code, error_msg",
    [
        # Payload shape problems.
        (4, 415, "input was not a json, was <class 'int'>"),
        ({"wrongKey": None}, 400, "missing field 'content'"),
        ({"content": None}, 400, "missing field 'content'"),
        ({"content": 1}, 415, "content must be of type str, was <class 'int'>"),
        ({"content": "", "second_key": None}, 400, "unexpected additional JSON fields"),
        # Problems with the data URI carried in 'content'.
        ({"content": ""}, 415, "content must be a 'data:application/pdf;' URI"),
        ({"content": "wrong formatted"}, 415, "content must be a 'data:application/pdf;' URI"),
        ({"content": "data:application/pdf;wrongEncodingScheme"}, 415, "content must be base64-encoded"),
        # Problems with the base64 payload itself.
        ({"content": "data:application/pdf;base64,"}, 422, "Invalid or empty PDF"),
        ({"content": "data:application/pdf;base64,dd"}, 422, "Incorrect padding"),
    ],
)
def test_extract_text(input, status_code, error_msg):
    """POST an invalid payload to /extract-text and check status code and error body."""
    response = api.test_client().post("/extract-text", json=input)
    expected_body = {"error": error_msg}

    assert response.status_code == status_code
    assert response.get_json() == expected_body
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@pytest.mark.parametrize(
    "input, status_code, msg_type, msg",
    [
        # Invalid payloads: the response carries an "error" entry.
        (4, 415, "error", "input was not a json, was <class 'int'>"),
        ({"wrongKey": None}, 400, "error", "missing field 'text'"),
        ({"text": None}, 400, "error", "missing field 'text'"),
        ({"text": 1}, 415, "error", "text must be of type str, was <class 'int'>"),
        ({"text": "", "second_key": None}, 400, "error", "unexpected additional JSON fields"),
        # Valid payloads: the response carries a "summary" entry.
        ({"text": ""}, 200, "summary", ""),
        ({"text": "This is a test article to summarize."}, 200, "summary", "test"),
    ],
)
def test_summarize_text(input, status_code, msg_type, msg):
    """POST a payload to /summarize-text and check the status code and response entry.

    ``msg_type`` is the key expected in the JSON response ("error" or "summary")
    and ``msg`` is a string that must occur within the value stored under it.
    """
    client = api.test_client()
    response = client.post("/summarize-text", json=input)
    data = response.get_json()

    assert response.status_code == status_code
    # Membership on the mapping itself; no need to go through .keys().
    assert msg_type in data
    # Substring check, so a summary only has to contain the given fragment.
    assert msg in data[msg_type]
|