# Spaces: remdms / mediastorm (Sleeping)
# File size: 8,850 Bytes

from mediastorm.ingest.parser import (
    strip_html,
    parse_transcript,
    extract_embed_codes,
    parse_credits,
    extract_poster_images,
    parse_recognition,
    parse_press_mentions,
    extract_director,
    extract_commissioned_by,
)


def test_strip_html_removes_tags():
    """Markup tags are dropped and only their text content remains."""
    markup = "<p>Hello <strong>world</strong>.</p>"
    plain = strip_html(markup)
    assert plain == "Hello world."


def test_strip_html_handles_empty():
    """Both the empty string and None come back as an empty string."""
    for blank in ("", None):
        assert strip_html(blank) == ""


def test_strip_html_preserves_whitespace():
    """The text of each adjacent paragraph is still present after stripping."""
    stripped = strip_html("<p>First paragraph.</p><p>Second paragraph.</p>")
    for sentence in ("First paragraph.", "Second paragraph."):
        assert sentence in stripped


def test_parse_transcript_extracts_speakers(sample_transcript_html):
    """The first two turns carry the expected speakers and text snippets."""
    parsed = parse_transcript(sample_transcript_html)
    assert len(parsed) >= 2
    opening, reply = parsed[0], parsed[1]
    assert opening.speaker == "Phillip Toledano"
    assert "photographing my father" in opening.text
    assert reply.speaker == "Brian Storm"
    assert "documentary stories" in reply.text


def test_parse_transcript_handles_narration(sample_transcript_html):
    """Turns with no attributed speaker must still appear in the result."""
    unattributed = []
    for turn in parse_transcript(sample_transcript_html):
        if turn.speaker is None:
            unattributed.append(turn)
    assert len(unattributed) >= 1
    first_text = unattributed[0].text
    assert "narration" in first_text.lower()


def test_parse_transcript_handles_empty():
    """An empty string or None parses to an empty list of turns."""
    for blank in ("", None):
        assert parse_transcript(blank) == []


def test_extract_embed_codes_from_structured_content(sample_structured_content):
    """The fixture content yields exactly the two expected embed codes."""
    found = extract_embed_codes(sample_structured_content)
    for expected_code in ("832l", "abc1"):
        assert expected_code in found
    assert len(found) == 2


def test_extract_embed_codes_handles_no_embeds():
    """Content made only of a text block produces no embed codes."""
    text_only = [{"block_type": "text", "content": "<p>Just text</p>"}]
    result = extract_embed_codes(text_only)
    assert result == []


def test_parse_credits():
    """Both raw credits are parsed; the first keeps its name and role."""
    director = {"name": "John Doe", "role": "Director", "type": "individual"}
    producer = {"name": "MediaStorm", "role": "Producer", "type": "organization"}
    parsed = parse_credits([director, producer])
    assert len(parsed) == 2
    first = parsed[0]
    assert first.name == "John Doe"
    assert first.role == "Director"


def test_extract_poster_images_returns_all_sizes(sample_page_data):
    """The fixture's preview images yield these three URLs, in this order."""
    expected = [
        "/media/abc123_600.jpg",
        "/media/abc123_900.jpg",
        "/media/abc123_1500.jpg",
    ]
    preview_images = sample_page_data["preview_images"]
    assert extract_poster_images(preview_images) == expected


def test_extract_poster_images_handles_missing_poster_frame():
    """Preview data lacking a "poster-frame" entry produces no URLs."""
    square_only = {"square": [{"url": "/media/x.jpg"}]}
    assert extract_poster_images(square_only) == []


def test_extract_poster_images_handles_none():
    """None in place of preview data yields an empty list."""
    result = extract_poster_images(None)
    assert result == []


def test_extract_poster_images_handles_empty_dict():
    """An empty preview mapping yields an empty list."""
    result = extract_poster_images({})
    assert result == []


def test_extract_poster_images_handles_empty_poster_frame_list():
    """A "poster-frame" entry with no frames yields an empty list."""
    no_frames = {"poster-frame": []}
    assert extract_poster_images(no_frames) == []


def test_extract_poster_images_sorts_by_width():
    """Frames supplied widest-first come back ordered narrowest-first."""
    wide = {"width": 1500, "height": 844, "url": "/media/h_1500.jpg"}
    narrow = {"width": 600, "height": 337, "url": "/media/h_600.jpg"}
    ordered = extract_poster_images({"poster-frame": [wide, narrow]})
    assert ordered == ["/media/h_600.jpg", "/media/h_1500.jpg"]


# --- parse_recognition ---

def test_parse_recognition_format1_structured():
    """Format 1: a bold festival name, then Year:/Place:/Category: lines."""
    # One paragraph per award; the two are concatenated into a single text block.
    nppa_html = (
        "<p><strong>NPPA's Best of Photojournalism</strong><br>"
        "Year: 2013<br>Place: First<br>"
        "Category: Multimedia Documentary</p>"
    )
    world_press_html = (
        "<p><strong>World Press Photo</strong><br>"
        "Year: 2014<br>Place: Second<br>"
        "Category: Multimedia</p>"
    )
    blocks = [
        {"block_type": "heading", "content": "Recognition"},
        {"block_type": "text", "content": nppa_html + world_press_html},
    ]
    parsed = parse_recognition(blocks)
    assert len(parsed) == 2
    first = parsed[0]
    assert first["festival"] == "NPPA's Best of Photojournalism"
    assert first["year"] == "2013"
    assert first["place"] == "First"
    assert first["category"] == "Multimedia Documentary"
    assert parsed[1]["festival"] == "World Press Photo"


def test_parse_recognition_format2_inline():
    """Format 2: a bold "Year:" prefix, then Festival, Place, Category inline."""
    # One paragraph per award; the two are concatenated into a single text block.
    poy_html = (
        '<p><strong>2016:</strong> <a href="#">Pictures of the Year International</a>,'
        " First Place, Multimedia Photographer of the Year</p>"
    )
    emmy_html = (
        '<p><strong>2015:</strong> <a href="#">Emmy Award</a>,'
        " Winner, New Approaches to Documentary</p>"
    )
    blocks = [
        {"block_type": "heading", "content": "Recognition"},
        {"block_type": "text", "content": poy_html + emmy_html},
    ]
    # Expected (festival, year, place) per award, in document order.
    expected = [
        ("Pictures of the Year International", "2016", "First Place"),
        ("Emmy Award", "2015", "Winner"),
    ]
    parsed = parse_recognition(blocks)
    assert len(parsed) == 2
    for index, (festival, year, place) in enumerate(expected):
        award = parsed[index]
        assert award["festival"] == festival
        assert award["year"] == year
        assert award["place"] == place


def test_parse_recognition_no_recognition_heading():
    """Content whose only heading is "Introduction" yields no awards."""
    heading = {"block_type": "heading", "content": "Introduction"}
    body = {"block_type": "text", "content": "<p>Just text.</p>"}
    assert parse_recognition([heading, body]) == []


def test_parse_recognition_empty():
    """None and an empty list both yield an empty list of awards."""
    for nothing in (None, []):
        assert parse_recognition(nothing) == []


# --- parse_press_mentions ---

def test_parse_press_mentions_extracts_links():
    """Each link under the "Press" heading becomes a mention with name and URL."""
    links_html = (
        '<p><a href="https://nytimes.com/article">The New York Times</a></p>'
        '<p><a href="https://washpost.com/story">The Washington Post</a></p>'
    )
    blocks = [
        {"block_type": "heading", "content": "Press"},
        {"block_type": "text", "content": links_html},
    ]
    found = parse_press_mentions(blocks)
    assert len(found) == 2
    nyt, wapo = found[0], found[1]
    assert nyt["name"] == "The New York Times"
    assert nyt["url"] == "https://nytimes.com/article"
    assert wapo["name"] == "The Washington Post"


def test_parse_press_mentions_no_press_heading():
    """Content whose only heading is "Credits" yields no press mentions."""
    heading = {"block_type": "heading", "content": "Credits"}
    body = {"block_type": "text", "content": "<p>Some text.</p>"}
    assert parse_press_mentions([heading, body]) == []


def test_parse_press_mentions_empty():
    """None and an empty list both yield an empty list of mentions."""
    for nothing in (None, []):
        assert parse_press_mentions(nothing) == []


# --- extract_director ---

def test_extract_director_finds_director_role():
    """The person credited as "Director & Editor" is returned as the director."""
    photographer = {"name": "Alice Smith", "role": "Photography", "type": "individual"}
    director = {"name": "Tim McLaughlin", "role": "Director & Editor", "type": "individual"}
    producer = {"name": "MediaStorm", "role": "Producer", "type": "organization"}
    crew = [photographer, director, producer]
    assert extract_director(crew) == "Tim McLaughlin"


def test_extract_director_case_insensitive():
    """A lowercase "director" role is still recognised."""
    lowercase_role = {"name": "Jane Doe", "role": "director", "type": "individual"}
    found = extract_director([lowercase_role])
    assert found == "Jane Doe"


def test_extract_director_no_director():
    """With only a Photography credit, the result is an empty string."""
    photographer = {"name": "Alice", "role": "Photography", "type": "individual"}
    found = extract_director([photographer])
    assert found == ""


def test_extract_director_empty():
    """None and an empty credit list both give an empty string."""
    for nothing in (None, []):
        assert extract_director(nothing) == ""


# --- extract_commissioned_by ---

def test_extract_commissioned_by_from_credits():
    """A credit with the role "Commissioned By" supplies the client name."""
    commissioner = {
        "name": "Yale Environment 360",
        "role": "Commissioned By",
        "type": "organization",
    }
    found = extract_commissioned_by([commissioner], "Some Story")
    assert found == "Yale Environment 360"


def test_extract_commissioned_by_from_name_pattern():
    """With no commissioning credit, the client comes from the "... for X" name."""
    director_only = [{"name": "Alice", "role": "Director", "type": "individual"}]
    story_name = "Leveling Appalachia for Yale Environment 360"
    assert extract_commissioned_by(director_only, story_name) == "Yale Environment 360"


def test_extract_commissioned_by_credits_takes_priority():
    """A commissioning credit wins over a client named in the story name."""
    commissioner = {"name": "UNICEF", "role": "Commissioned by", "type": "organization"}
    found = extract_commissioned_by([commissioner], "Story for Some Client")
    assert found == "UNICEF"


def test_extract_commissioned_by_none():
    """Neither the credits nor the name "Simple Title" identifies a client."""
    director_only = [{"name": "Alice", "role": "Director", "type": "individual"}]
    found = extract_commissioned_by(director_only, "Simple Title")
    assert found == ""


def test_extract_commissioned_by_empty():
    """None or empty credits with an empty name give an empty string."""
    for nothing in (None, []):
        assert extract_commissioned_by(nothing, "") == ""