Spaces:
Running on Zero
Running on Zero
File size: 3,129 Bytes
354b37e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | import sys
import tempfile
from pathlib import Path
import fitz
from PIL import Image
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from src.document_processing import document_intake_metadata, document_to_payload_parts, validate_upload
def test_png_upload_returns_image_url_part():
    """Check that a PNG upload becomes a single ``image_url`` payload part.

    A 32x32 white RGB image is saved to a temporary ``.png`` file and passed
    to ``document_to_payload_parts``. The file is created with
    ``delete=False`` so it can be reopened by path, and is removed explicitly
    once the assertions have run.
    """
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        path = tmp.name
    try:
        Image.new("RGB", (32, 32), color="white").save(path)
        parts = document_to_payload_parts(path)
        assert len(parts) == 1
        assert parts[0]["type"] == "image_url"
        # NOTE(review): a PNG input is expected to yield a JPEG data URL --
        # presumably the image is re-encoded on intake; confirm against
        # document_to_payload_parts.
        assert parts[0]["image_url"]["url"].startswith("data:image/jpeg;base64,")
    finally:
        # delete=False leaves the file on disk; remove it so repeated runs do
        # not accumulate temp files.
        Path(path).unlink(missing_ok=True)
def test_jpeg_upload_returns_image_url_part():
    """Check that a JPEG upload becomes a single ``image_url`` payload part.

    A 24x24 red RGB image is saved as JPEG to a temporary ``.jpg`` file and
    passed to ``document_to_payload_parts``. The temporary file is removed
    explicitly because it is created with ``delete=False``.
    """
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        path = tmp.name
    try:
        Image.new("RGB", (24, 24), color="red").save(path, format="JPEG")
        parts = document_to_payload_parts(path)
        assert len(parts) == 1
        assert parts[0]["type"] == "image_url"
    finally:
        # delete=False leaves the file on disk; clean it up ourselves.
        Path(path).unlink(missing_ok=True)
def test_pdf_upload_renders_pages_to_images():
    """Check that a one-page PDF is returned as a single PNG ``image_url`` part.

    A PDF with one page of text is written to a temporary ``.pdf`` file and
    passed to ``document_to_payload_parts`` with ``max_pages=1``. The
    temporary file is removed explicitly because it is created with
    ``delete=False``.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        path = tmp.name
    try:
        document = fitz.open()
        try:
            page = document.new_page()
            page.insert_text((72, 72), "Hemoglobin 12.5 g/dL")
            document.save(path)
        finally:
            # Close the document even if building or saving the page fails.
            document.close()

        parts = document_to_payload_parts(path, max_pages=1)
        assert len(parts) == 1
        assert parts[0]["type"] == "image_url"
        assert parts[0]["image_url"]["url"].startswith("data:image/png;base64,")
    finally:
        # delete=False leaves the file on disk; clean it up ourselves.
        Path(path).unlink(missing_ok=True)
def test_text_upload_still_returns_text_part():
    """Check that a ``.txt`` upload is returned as a single ``text`` part.

    A short UTF-8 text file is written to a temporary ``.txt`` file and passed
    to ``document_to_payload_parts``; the returned part must carry the
    written content. The temporary file is removed explicitly because it is
    created with ``delete=False``.
    """
    with tempfile.NamedTemporaryFile(
        suffix=".txt", delete=False, mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write("Hemoglobin 13.1 g/dL")
        path = tmp.name
    # The file is closed (and therefore flushed) before it is read back.
    try:
        parts = document_to_payload_parts(path)
        assert len(parts) == 1
        assert parts[0]["type"] == "text"
        assert "Hemoglobin" in parts[0]["text"]
    finally:
        # delete=False leaves the file on disk; clean it up ourselves.
        Path(path).unlink(missing_ok=True)
def test_validate_upload_rejects_unknown_extension():
    """Check that ``validate_upload`` raises ``ValueError`` for a ``.docx`` file.

    An empty temporary ``.docx`` file is created and passed to
    ``validate_upload``; the raised error message must mention
    "Unsupported file type". The temporary file is removed explicitly because
    it is created with ``delete=False``.
    """
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
        path = tmp.name
    try:
        try:
            validate_upload(path)
        except ValueError as error:
            assert "Unsupported file type" in str(error)
        else:
            # No exception means the unsupported extension was accepted.
            raise AssertionError("expected ValueError")
    finally:
        # delete=False leaves the file on disk; clean it up ourselves.
        Path(path).unlink(missing_ok=True)
def test_document_intake_metadata_for_pdf():
    """Check the intake metadata reported for a one-page PDF.

    A single-page PDF is written to a temporary file, converted with
    ``document_to_payload_parts(path, max_pages=1)``, and the resulting parts
    are passed to ``document_intake_metadata``. The metadata must report the
    ``"vision"`` input modality with one rendered page and one image. The
    temporary file is removed explicitly because it is created with
    ``delete=False``.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        path = tmp.name
    try:
        document = fitz.open()
        try:
            page = document.new_page()
            page.insert_text((72, 72), "Sample")
            document.save(path)
        finally:
            # Close the document even if building or saving the page fails.
            document.close()

        parts = document_to_payload_parts(path, max_pages=1)
        metadata = document_intake_metadata(path, parts)
        assert metadata["input_modality"] == "vision"
        assert metadata["pages_rendered"] == 1
        assert metadata["image_count"] == 1
    finally:
        # delete=False leaves the file on disk; clean it up ourselves.
        Path(path).unlink(missing_ok=True)
if __name__ == "__main__":
    # Script entry point: run every test in definition order without a test
    # runner, then report success. Any failing assertion aborts the run.
    for run_test in (
        test_png_upload_returns_image_url_part,
        test_jpeg_upload_returns_image_url_part,
        test_pdf_upload_renders_pages_to_images,
        test_text_upload_still_returns_text_part,
        test_validate_upload_rejects_unknown_extension,
        test_document_intake_metadata_for_pdf,
    ):
        run_test()
    print("test_document_processing: ok")
|