from mediastorm.ingest.parser import (
    strip_html,
    parse_transcript,
    extract_embed_codes,
    parse_credits,
    extract_poster_images,
    parse_recognition,
    parse_press_mentions,
    extract_director,
    extract_commissioned_by,
)

# Unit tests for the parsing helpers imported above.
#
# NOTE(review): the HTML fixtures in this module were reconstructed from a
# tag-stripped copy of the file: "<p>" where the text was separated by blank
# lines, "<br>" where it was separated by a single line break, and "<a href>"
# for the press links. Confirm the exact markup against the parser's real
# input before relying on these fixtures.


# --- strip_html ---


def test_strip_html_removes_tags():
    """Markup is dropped and only the text content is returned."""
    html = "<p>Hello world.</p>"
    assert strip_html(html) == "Hello world."


def test_strip_html_handles_empty():
    """Both the empty string and ``None`` produce an empty string."""
    assert strip_html("") == ""
    assert strip_html(None) == ""


def test_strip_html_preserves_whitespace():
    """Text from every paragraph survives stripping."""
    html = "<p>First paragraph.</p><p>Second paragraph.</p>"
    result = strip_html(html)
    assert "First paragraph." in result
    assert "Second paragraph." in result


# --- parse_transcript ---


def test_parse_transcript_extracts_speakers(sample_transcript_html):
    """The first two turns carry the expected speaker names and text."""
    turns = parse_transcript(sample_transcript_html)
    assert len(turns) >= 2
    assert turns[0].speaker == "Phillip Toledano"
    assert "photographing my father" in turns[0].text
    assert turns[1].speaker == "Brian Storm"
    assert "documentary stories" in turns[1].text


def test_parse_transcript_handles_narration(sample_transcript_html):
    """Narration without speaker attribution should still be captured."""
    turns = parse_transcript(sample_transcript_html)
    narration = [t for t in turns if t.speaker is None]
    assert len(narration) >= 1
    assert "narration" in narration[0].text.lower()


def test_parse_transcript_handles_empty():
    """Both the empty string and ``None`` produce an empty list of turns."""
    assert parse_transcript("") == []
    assert parse_transcript(None) == []


# --- extract_embed_codes ---


def test_extract_embed_codes_from_structured_content(sample_structured_content):
    """Exactly the two embed codes present in the fixture are returned."""
    codes = extract_embed_codes(sample_structured_content)
    assert "832l" in codes
    assert "abc1" in codes
    assert len(codes) == 2


def test_extract_embed_codes_handles_no_embeds():
    """Content made only of text blocks yields no embed codes."""
    content = [{"block_type": "text", "content": "<p>Just text</p>"}]
    assert extract_embed_codes(content) == []


# --- parse_credits ---


def test_parse_credits():
    """Each credit dict becomes an object exposing ``name`` and ``role``."""
    credits_data = [
        {"name": "John Doe", "role": "Director", "type": "individual"},
        {"name": "MediaStorm", "role": "Producer", "type": "organization"},
    ]
    credits = parse_credits(credits_data)
    assert len(credits) == 2
    assert credits[0].name == "John Doe"
    assert credits[0].role == "Director"


# --- extract_poster_images ---


def test_extract_poster_images_returns_all_sizes(sample_page_data):
    """The fixture's preview images yield the three expected URLs in order."""
    urls = extract_poster_images(sample_page_data["preview_images"])
    assert urls == [
        "/media/abc123_600.jpg",
        "/media/abc123_900.jpg",
        "/media/abc123_1500.jpg",
    ]


def test_extract_poster_images_handles_missing_poster_frame():
    """A mapping without a ``poster-frame`` key yields no URLs."""
    urls = extract_poster_images({"square": [{"url": "/media/x.jpg"}]})
    assert urls == []


def test_extract_poster_images_handles_none():
    """``None`` yields an empty list."""
    assert extract_poster_images(None) == []


def test_extract_poster_images_handles_empty_dict():
    """An empty mapping yields an empty list."""
    assert extract_poster_images({}) == []


def test_extract_poster_images_handles_empty_poster_frame_list():
    """An empty ``poster-frame`` list yields an empty list."""
    assert extract_poster_images({"poster-frame": []}) == []


def test_extract_poster_images_sorts_by_width():
    """URLs come back ordered by ascending width, not by input order."""
    preview = {
        "poster-frame": [
            {"width": 1500, "height": 844, "url": "/media/h_1500.jpg"},
            {"width": 600, "height": 337, "url": "/media/h_600.jpg"},
        ]
    }
    urls = extract_poster_images(preview)
    assert urls == ["/media/h_600.jpg", "/media/h_1500.jpg"]


# --- parse_recognition ---


def test_parse_recognition_format1_structured():
    """Format 1: Festival followed by Year:/Place:/Category: lines."""
    structured_content = [
        {"block_type": "heading", "content": "Recognition"},
        {
            "block_type": "text",
            "content": (
                "<p>NPPA's Best of Photojournalism<br>"
                "Year: 2013<br>Place: First<br>"
                "Category: Multimedia Documentary</p>"
                "<p>World Press Photo<br>"
                "Year: 2014<br>Place: Second<br>"
                "Category: Multimedia</p>"
            ),
        },
    ]
    awards = parse_recognition(structured_content)
    assert len(awards) == 2
    assert awards[0]["festival"] == "NPPA's Best of Photojournalism"
    assert awards[0]["year"] == "2013"
    assert awards[0]["place"] == "First"
    assert awards[0]["category"] == "Multimedia Documentary"
    assert awards[1]["festival"] == "World Press Photo"


def test_parse_recognition_format2_inline():
    """Format 2: Year: Festival, Place, Category."""
    # NOTE(review): the tag-stripped copy used single-quoted literals for the
    # opening fragment of each entry, which suggests the lost markup held
    # double-quoted attributes (e.g. a link). Plain paragraphs are used here;
    # confirm against real content.
    structured_content = [
        {"block_type": "heading", "content": "Recognition"},
        {
            "block_type": "text",
            "content": (
                "<p>2016: Pictures of the Year International,"
                " First Place, Multimedia Photographer of the Year</p>"
                "<p>2015: Emmy Award,"
                " Winner, New Approaches to Documentary</p>"
            ),
        },
    ]
    awards = parse_recognition(structured_content)
    assert len(awards) == 2
    assert awards[0]["festival"] == "Pictures of the Year International"
    assert awards[0]["year"] == "2016"
    assert awards[0]["place"] == "First Place"
    assert awards[1]["festival"] == "Emmy Award"
    assert awards[1]["year"] == "2015"
    assert awards[1]["place"] == "Winner"


def test_parse_recognition_no_recognition_heading():
    """Without a "Recognition" heading no awards are returned."""
    structured_content = [
        {"block_type": "heading", "content": "Introduction"},
        {"block_type": "text", "content": "<p>Just text.</p>"},
    ]
    assert parse_recognition(structured_content) == []


def test_parse_recognition_empty():
    """``None`` and an empty list both yield an empty list."""
    assert parse_recognition(None) == []
    assert parse_recognition([]) == []


# --- parse_press_mentions ---


def test_parse_press_mentions_extracts_links():
    """Each linked outlet yields a dict with its ``name`` and ``url``."""
    # NOTE(review): only the first URL is asserted below; the second href is a
    # placeholder chosen during reconstruction - replace with the real value.
    structured_content = [
        {"block_type": "heading", "content": "Press"},
        {
            "block_type": "text",
            "content": (
                '<p><a href="https://nytimes.com/article">'
                "The New York Times</a></p>"
                '<p><a href="https://washingtonpost.com/article">'
                "The Washington Post</a></p>"
            ),
        },
    ]
    mentions = parse_press_mentions(structured_content)
    assert len(mentions) == 2
    assert mentions[0]["name"] == "The New York Times"
    assert mentions[0]["url"] == "https://nytimes.com/article"
    assert mentions[1]["name"] == "The Washington Post"


def test_parse_press_mentions_no_press_heading():
    """Without a "Press" heading no mentions are returned."""
    structured_content = [
        {"block_type": "heading", "content": "Credits"},
        {"block_type": "text", "content": "<p>Some text.</p>"},
    ]
    assert parse_press_mentions(structured_content) == []


def test_parse_press_mentions_empty():
    """``None`` and an empty list both yield an empty list."""
    assert parse_press_mentions(None) == []
    assert parse_press_mentions([]) == []


# --- extract_director ---


def test_extract_director_finds_director_role():
    """A compound role such as "Director & Editor" still counts as director."""
    credits = [
        {"name": "Alice Smith", "role": "Photography", "type": "individual"},
        {"name": "Tim McLaughlin", "role": "Director & Editor", "type": "individual"},
        {"name": "MediaStorm", "role": "Producer", "type": "organization"},
    ]
    assert extract_director(credits) == "Tim McLaughlin"


def test_extract_director_case_insensitive():
    """A lower-case "director" role is matched."""
    credits = [{"name": "Jane Doe", "role": "director", "type": "individual"}]
    assert extract_director(credits) == "Jane Doe"


def test_extract_director_no_director():
    """Credits without a director role yield an empty string."""
    credits = [{"name": "Alice", "role": "Photography", "type": "individual"}]
    assert extract_director(credits) == ""


def test_extract_director_empty():
    """``None`` and an empty list both yield an empty string."""
    assert extract_director(None) == ""
    assert extract_director([]) == ""


# --- extract_commissioned_by ---


def test_extract_commissioned_by_from_credits():
    """A "Commissioned By" credit supplies the commissioning organization."""
    credits = [
        {"name": "Yale Environment 360", "role": "Commissioned By", "type": "organization"},
    ]
    assert extract_commissioned_by(credits, "Some Story") == "Yale Environment 360"


def test_extract_commissioned_by_from_name_pattern():
    """With no matching credit, the "... for <client>" title suffix is used."""
    credits = [{"name": "Alice", "role": "Director", "type": "individual"}]
    result = extract_commissioned_by(credits, "Leveling Appalachia for Yale Environment 360")
    assert result == "Yale Environment 360"


def test_extract_commissioned_by_credits_takes_priority():
    """A commissioning credit wins over a client named in the title."""
    credits = [
        {"name": "UNICEF", "role": "Commissioned by", "type": "organization"},
    ]
    result = extract_commissioned_by(credits, "Story for Some Client")
    assert result == "UNICEF"


def test_extract_commissioned_by_none():
    """No commissioning credit and no title pattern yields an empty string."""
    credits = [{"name": "Alice", "role": "Director", "type": "individual"}]
    assert extract_commissioned_by(credits, "Simple Title") == ""


def test_extract_commissioned_by_empty():
    """Empty or ``None`` credits with an empty title yield an empty string."""
    assert extract_commissioned_by(None, "") == ""
    assert extract_commissioned_by([], "") == ""