Spaces:
Sleeping
Sleeping
| from mediastorm.ingest.parser import ( | |
| strip_html, | |
| parse_transcript, | |
| extract_embed_codes, | |
| parse_credits, | |
| extract_poster_images, | |
| parse_recognition, | |
| parse_press_mentions, | |
| extract_director, | |
| extract_commissioned_by, | |
| ) | |
def test_strip_html_removes_tags():
    """Markup is dropped and only the text content remains."""
    markup = "<p>Hello <strong>world</strong>.</p>"
    expected = "Hello world."
    assert strip_html(markup) == expected
def test_strip_html_handles_empty():
    """An empty string and ``None`` both come back as an empty string."""
    for blank in ("", None):
        assert strip_html(blank) == ""
def test_strip_html_preserves_whitespace():
    """Text from each of two adjacent paragraphs is present in the output."""
    stripped = strip_html("<p>First paragraph.</p><p>Second paragraph.</p>")
    for sentence in ("First paragraph.", "Second paragraph."):
        assert sentence in stripped
def test_parse_transcript_extracts_speakers(sample_transcript_html):
    """The first two parsed turns carry the expected speaker and text."""
    parsed = parse_transcript(sample_transcript_html)
    assert len(parsed) >= 2

    expected_turns = [
        ("Phillip Toledano", "photographing my father"),
        ("Brian Storm", "documentary stories"),
    ]
    for position, (speaker, snippet) in enumerate(expected_turns):
        turn = parsed[position]
        assert turn.speaker == speaker
        assert snippet in turn.text
def test_parse_transcript_handles_narration(sample_transcript_html):
    """Turns that have no speaker are still returned by the parser."""
    unattributed = []
    for turn in parse_transcript(sample_transcript_html):
        if turn.speaker is None:
            unattributed.append(turn)

    assert len(unattributed) >= 1
    first_text = unattributed[0].text
    assert "narration" in first_text.lower()
def test_parse_transcript_handles_empty():
    """Empty or missing transcript HTML parses to an empty list."""
    for missing in ("", None):
        assert parse_transcript(missing) == []
def test_extract_embed_codes_from_structured_content(sample_structured_content):
    """Both expected codes are found and the result has exactly two entries."""
    found = extract_embed_codes(sample_structured_content)
    for code in ("832l", "abc1"):
        assert code in found
    assert len(found) == 2
def test_extract_embed_codes_handles_no_embeds():
    """Content made up solely of a text block yields no embed codes."""
    text_only = [{"block_type": "text", "content": "<p>Just text</p>"}]
    result = extract_embed_codes(text_only)
    assert result == []
def test_parse_credits():
    """Raw credit dicts become objects exposing ``name`` and ``role``."""
    raw_entries = [
        {"name": "John Doe", "role": "Director", "type": "individual"},
        {"name": "MediaStorm", "role": "Producer", "type": "organization"},
    ]
    parsed = parse_credits(raw_entries)
    assert len(parsed) == 2

    lead = parsed[0]
    assert lead.name == "John Doe"
    assert lead.role == "Director"
def test_extract_poster_images_returns_all_sizes(sample_page_data):
    """The sample page's poster URLs come back as the three expected paths."""
    preview_images = sample_page_data["preview_images"]
    expected = [
        "/media/abc123_600.jpg",
        "/media/abc123_900.jpg",
        "/media/abc123_1500.jpg",
    ]
    assert extract_poster_images(preview_images) == expected
def test_extract_poster_images_handles_missing_poster_frame():
    """Preview data containing only a ``square`` entry yields no URLs."""
    square_only = {"square": [{"url": "/media/x.jpg"}]}
    assert extract_poster_images(square_only) == []
def test_extract_poster_images_handles_none():
    """``None`` in place of preview data yields an empty list."""
    result = extract_poster_images(None)
    assert result == []
def test_extract_poster_images_handles_empty_dict():
    """An empty preview mapping yields an empty list."""
    no_previews = {}
    assert extract_poster_images(no_previews) == []
def test_extract_poster_images_handles_empty_poster_frame_list():
    """A ``poster-frame`` key holding no entries yields an empty list."""
    empty_frames = {"poster-frame": []}
    assert extract_poster_images(empty_frames) == []
def test_extract_poster_images_sorts_by_width():
    """Entries supplied widest-first come back narrowest-first."""
    wide = {"width": 1500, "height": 844, "url": "/media/h_1500.jpg"}
    narrow = {"width": 600, "height": 337, "url": "/media/h_600.jpg"}
    ordered = extract_poster_images({"poster-frame": [wide, narrow]})
    assert ordered == ["/media/h_600.jpg", "/media/h_1500.jpg"]
# --- parse_recognition ---
def test_parse_recognition_format1_structured():
    """Format 1: a bold festival name, then Year:/Place:/Category: lines."""
    awards_html = (
        "<p><strong>NPPA's Best of Photojournalism</strong><br>"
        "Year: 2013<br>Place: First<br>"
        "Category: Multimedia Documentary</p>"
        "<p><strong>World Press Photo</strong><br>"
        "Year: 2014<br>Place: Second<br>"
        "Category: Multimedia</p>"
    )
    blocks = [
        {"block_type": "heading", "content": "Recognition"},
        {"block_type": "text", "content": awards_html},
    ]

    parsed = parse_recognition(blocks)
    assert len(parsed) == 2

    # Every field of the first award is checked; only the festival of the second.
    expected_first = {
        "festival": "NPPA's Best of Photojournalism",
        "year": "2013",
        "place": "First",
        "category": "Multimedia Documentary",
    }
    for field, value in expected_first.items():
        assert parsed[0][field] == value
    assert parsed[1]["festival"] == "World Press Photo"
def test_parse_recognition_format2_inline():
    """Format 2: a bold ``Year:`` prefix, then festival, place and category."""
    awards_html = (
        '<p><strong>2016:</strong> <a href="#">Pictures of the Year International</a>,'
        " First Place, Multimedia Photographer of the Year</p>"
        '<p><strong>2015:</strong> <a href="#">Emmy Award</a>,'
        " Winner, New Approaches to Documentary</p>"
    )
    blocks = [
        {"block_type": "heading", "content": "Recognition"},
        {"block_type": "text", "content": awards_html},
    ]

    parsed = parse_recognition(blocks)
    assert len(parsed) == 2

    expected = [
        ("Pictures of the Year International", "2016", "First Place"),
        ("Emmy Award", "2015", "Winner"),
    ]
    for position, (festival, year, place) in enumerate(expected):
        award = parsed[position]
        assert award["festival"] == festival
        assert award["year"] == year
        assert award["place"] == place
def test_parse_recognition_no_recognition_heading():
    """Content whose only heading is "Introduction" yields no awards."""
    heading = {"block_type": "heading", "content": "Introduction"}
    body = {"block_type": "text", "content": "<p>Just text.</p>"}
    assert parse_recognition([heading, body]) == []
def test_parse_recognition_empty():
    """``None`` and an empty list both yield no awards."""
    for nothing in (None, []):
        assert parse_recognition(nothing) == []
# --- parse_press_mentions ---
def test_parse_press_mentions_extracts_links():
    """Links under a "Press" heading surface as ``name`` / ``url`` entries."""
    links_html = (
        '<p><a href="https://nytimes.com/article">The New York Times</a></p>'
        '<p><a href="https://washpost.com/story">The Washington Post</a></p>'
    )
    blocks = [
        {"block_type": "heading", "content": "Press"},
        {"block_type": "text", "content": links_html},
    ]

    found = parse_press_mentions(blocks)
    assert len(found) == 2

    first, second = found[0], found[1]
    assert first["name"] == "The New York Times"
    assert first["url"] == "https://nytimes.com/article"
    assert second["name"] == "The Washington Post"
def test_parse_press_mentions_no_press_heading():
    """Content whose only heading is "Credits" yields no press mentions."""
    heading = {"block_type": "heading", "content": "Credits"}
    body = {"block_type": "text", "content": "<p>Some text.</p>"}
    assert parse_press_mentions([heading, body]) == []
def test_parse_press_mentions_empty():
    """``None`` and an empty list both yield no press mentions."""
    for nothing in (None, []):
        assert parse_press_mentions(nothing) == []
# --- extract_director ---
def test_extract_director_finds_director_role():
    """The person whose role contains "Director" is returned by name."""
    crew = [
        {"name": "Alice Smith", "role": "Photography", "type": "individual"},
        {"name": "Tim McLaughlin", "role": "Director & Editor", "type": "individual"},
        {"name": "MediaStorm", "role": "Producer", "type": "organization"},
    ]
    director = extract_director(crew)
    assert director == "Tim McLaughlin"
def test_extract_director_case_insensitive():
    """A lower-case "director" role is still recognised."""
    lowercase_role = {"name": "Jane Doe", "role": "director", "type": "individual"}
    assert extract_director([lowercase_role]) == "Jane Doe"
def test_extract_director_no_director():
    """With no directing credit present the result is an empty string."""
    photographer = {"name": "Alice", "role": "Photography", "type": "individual"}
    assert extract_director([photographer]) == ""
def test_extract_director_empty():
    """``None`` and an empty credit list both give an empty string."""
    for nothing in (None, []):
        assert extract_director(nothing) == ""
# --- extract_commissioned_by ---
def test_extract_commissioned_by_from_credits():
    """A "Commissioned By" credit supplies the commissioning client."""
    commissioning_credit = {
        "name": "Yale Environment 360",
        "role": "Commissioned By",
        "type": "organization",
    }
    client = extract_commissioned_by([commissioning_credit], "Some Story")
    assert client == "Yale Environment 360"
def test_extract_commissioned_by_from_name_pattern():
    """With no commissioning credit, the client is taken from the story name."""
    crew = [{"name": "Alice", "role": "Director", "type": "individual"}]
    story_name = "Leveling Appalachia for Yale Environment 360"
    assert extract_commissioned_by(crew, story_name) == "Yale Environment 360"
def test_extract_commissioned_by_credits_takes_priority():
    """A commissioning credit wins over a client named in the story name."""
    commissioning_credit = {
        "name": "UNICEF",
        "role": "Commissioned by",
        "type": "organization",
    }
    client = extract_commissioned_by([commissioning_credit], "Story for Some Client")
    assert client == "UNICEF"
def test_extract_commissioned_by_none():
    """Neither the credits nor the story name identifies a client."""
    crew = [{"name": "Alice", "role": "Director", "type": "individual"}]
    client = extract_commissioned_by(crew, "Simple Title")
    assert client == ""
def test_extract_commissioned_by_empty():
    """Missing or empty credits with a blank name give an empty string."""
    for nothing in (None, []):
        assert extract_commissioned_by(nothing, "") == ""