Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| # MISSING VALUES CHECK | |
| # we need to check two levels of missing values: | |
| # level 1 - csv field is empty. | |
| # since we are converting it to a pandas df a simple pd.isna() should do the trick. | |
| # level 2 - csv field is there but json field inside is empty or missing values or null. | |
| # we need to get more creative. | |
| # for example if titles field is there but the json inside is: | |
| # 2.a [] | |
| # 2.b [{"kind": "", "language": "", "text": ""}] | |
| # 2.c [{"kind": "local", "language": null, "text": "some title"}] | |
| # 2.d [{"kind": "local", "text": "some title"}] (no language key at all) | |
| # if not isinstance(titles, (list, tuple)) or len(titles) == 0: | |
| # handles 2.a | |
| # if not t.get("kind"): | |
| # catches None, '', and NaN | |
| # handles 2.b, 2.c and 2.d | |
| # ###################################################################################################################### | |
| # TITLES | |
def title_check(df, catalog_language: str, other_languages: list):
    """Validate the ``Titles`` cell of every row in *df*.

    Each cell is expected to hold a list/tuple of dicts with the keys
    ``kind``, ``language`` and ``text``.

    Args:
        df: DataFrame with a ``Titles`` column.
        catalog_language: Language code that is always accepted.
        other_languages: Additional accepted language codes (``None`` is
            treated as no additional languages).

    Returns:
        ``(issues, warnings)``; each is a list of
        ``{"type": <label>, "rows": [<index labels>]}`` dicts. A row is
        listed at most once per label.
    """

    def is_blank(value):
        # None, '' and empty containers are falsy; NaN is truthy, so it
        # needs an explicit self-inequality test.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        rows = bucket.setdefault(label, [])
        # Rows are visited one after another, so looking at the tail is
        # enough to report a row only once per label.
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}
    allowed_languages = set(other_languages or []) | {catalog_language}

    for row in df.index:
        titles = df.at[row, "Titles"]
        if not isinstance(titles, (list, tuple)) or len(titles) == 0:
            flag(issues, "Missing title list", row)
            continue

        # (text, language) pairs, collected for the cross checks below.
        local_titles = []
        original_titles = []

        for t in titles:
            if not isinstance(t, dict):
                # A malformed entry carries none of the required fields.
                t = {}
            kind = t.get("kind")
            language = t.get("language")
            text = t.get("text")

            # 1. Empty fields
            if is_blank(kind):
                flag(issues, "Empty title kind", row)
            if is_blank(text):
                flag(issues, "Empty title text", row)

            # 2. Unexpected languages (not in catalogue). A missing language
            #    is reported as empty only, not additionally as unwanted.
            if is_blank(language):
                flag(issues, "Empty title language", row)
            elif language not in allowed_languages:
                flag(warnings, f"unwanted_language: {language}", row)

            # 3. Save for the duplicate checks
            if kind == "local":
                local_titles.append((text, language))
            if kind == "original":
                original_titles.append((text, language))

        # 4. A local title identical (text and language) to an original one
        if any(pair in original_titles for pair in local_titles):
            flag(issues, "Original title same as local", row)

        # 5. More than one original language --> issue
        original_langs = {lang for _, lang in original_titles if not is_blank(lang)}
        if len(original_langs) > 1:
            flag(issues, "Multiple original languages for titles", row)

        # 6. Same local title more than once with the same language --> warning
        seen = set()
        for pair in local_titles:
            if pair in seen:
                flag(warnings, "Duplicate local title in same language", row)
            seen.add(pair)

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return issues, warnings
| # ###################################################################################################################### | |
| # DESCRIPTION | |
def description_check(df, catalog_language: str, other_languages: list, max_words: int = 1000):
    """Validate the ``Descriptions`` cell of every row in *df*.

    Each cell is expected to hold a list/tuple of dicts with the keys
    ``kind``, ``language`` and ``text``.

    Args:
        df: DataFrame with a ``Descriptions`` column.
        catalog_language: Language code that is always accepted.
        other_languages: Additional accepted language codes (``None`` is
            treated as no additional languages).
        max_words: Descriptions with more whitespace-separated words than
            this are reported as too long.

    Returns:
        ``(issues, warnings)``; each is a list of
        ``{"type": <label>, "rows": [<index labels>]}`` dicts. A row is
        listed at most once per label.
    """

    def is_blank(value):
        # None, '' and empty containers are falsy; NaN is truthy, so it
        # needs an explicit self-inequality test.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        rows = bucket.setdefault(label, [])
        # Rows are visited one after another, so looking at the tail is
        # enough to report a row only once per label.
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}
    allowed_languages = set(other_languages or []) | {catalog_language}

    for row in df.index:
        descriptions = df.at[row, "Descriptions"]
        if not isinstance(descriptions, (list, tuple)) or len(descriptions) == 0:
            flag(issues, "Missing description list", row)
            continue

        # (text, language) pairs, collected for the cross checks below.
        local_descriptions = []
        original_descriptions = []

        for d in descriptions:
            if not isinstance(d, dict):
                # A malformed entry carries none of the required fields.
                d = {}
            kind = d.get("kind")
            language = d.get("language")
            text = d.get("text")

            # 1. Empty fields
            if is_blank(kind):
                flag(issues, "Empty description kind", row)
            if is_blank(text):
                flag(issues, "Empty description text", row)

            # 2. Unexpected languages (not in catalogue). A missing language
            #    is reported as empty only, not additionally as unexpected.
            if is_blank(language):
                flag(issues, "Empty description language", row)
            elif language not in allowed_languages:
                flag(warnings, f"Unexpected languages: {language}", row)

            # 3. Save for the duplicate checks
            if kind == "local":
                local_descriptions.append((text, language))
            if kind == "original":
                original_descriptions.append((text, language))

            # 4. Too long descriptions (only strings can be word-counted)
            if isinstance(text, str) and len(text.split()) > max_words:
                flag(issues, f"description too long (>{max_words} words)", row)

        # 5. A local description identical to an original one
        if any(pair in original_descriptions for pair in local_descriptions):
            flag(issues, "Original description same as local", row)

        # 6. More than one original language --> issue
        original_langs = {lang for _, lang in original_descriptions if not is_blank(lang)}
        if len(original_langs) > 1:
            flag(issues, "Multiple original languages for descriptions", row)

        # 7. Same local description more than once in the same language --> warning
        seen = set()
        for pair in local_descriptions:
            if pair in seen:
                flag(warnings, "Duplicate local descriptions in same language", row)
            seen.add(pair)

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return issues, warnings
| # ###################################################################################################################### | |
| # CROSS-FIELD CHECKS | |
def language_alignment_check(df):
    """Warn when a row's title languages differ from its description languages.

    Reads the ``Titles`` and ``Descriptions`` cells of every row. A cell
    that is not a list/tuple (for example NaN for an empty CSV field)
    contributes no languages.

    Returns:
        A list of ``{"type": <label>, "rows": [<index labels>]}`` dicts. The
        label ends with the languages present on only one of the two sides.
    """

    def languages(entries):
        # NaN is truthy, so a plain ``if entries:`` would try to iterate a float.
        if not isinstance(entries, (list, tuple)):
            return set()
        found = set()
        for entry in entries:
            lang = entry.get("language") if isinstance(entry, dict) else None
            if lang and not (isinstance(lang, float) and lang != lang):
                found.add(lang)
        return found

    warnings = {}
    for row in df.index:
        title_langs = languages(df.at[row, "Titles"])
        desc_langs = languages(df.at[row, "Descriptions"])
        if title_langs == desc_langs:
            continue
        # Symmetric difference, rendered like a set literal but in sorted
        # order so the same mismatch always yields the same label.
        mismatch = sorted(title_langs ^ desc_langs, key=str)
        rendered = "{" + ", ".join(repr(lang) for lang in mismatch) + "}"
        label = f"Mismatch between title and description languages --> {rendered}"
        warnings.setdefault(label, []).append(row)

    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return warnings
| # ###################################################################################################################### | |
| # META / ATTRIBUTES | |
def meta_check(df):
    """Check the ``Meta - *`` columns of every row in *df*.

    Reads ``Kind``, ``Meta - Duration``, ``Meta - Release Date``,
    ``Meta - Release Year``, ``Meta - Genres``, ``Meta - Countries``,
    ``Meta - Season`` and ``Meta - Episode``.

    Returns:
        ``(issues, warnings)``; each is a list of
        ``{"type": <label>, "rows": [<index labels>]}`` dicts.
    """

    def is_missing(value):
        # pd.isna() answers element-wise for list-likes, which cannot be used
        # in an ``if``; a list/tuple cell is missing only when it is empty.
        if isinstance(value, (list, tuple)):
            return len(value) == 0
        flags = pd.isna(value)
        return bool(flags.all()) if hasattr(flags, "all") else bool(flags)

    issues = {}
    warnings = {}

    for row in df.index:
        kind = df.at[row, "Kind"]

        # DURATION: movies and episodes are expected to have one.
        if kind in ("movie", "episode") and is_missing(df.at[row, "Meta - Duration"]):
            warnings.setdefault("No duration", []).append(row)

        # RELEASE DATE AND RELEASE YEAR
        no_date = is_missing(df.at[row, "Meta - Release Date"])
        no_year = is_missing(df.at[row, "Meta - Release Year"])
        if no_date:
            warnings.setdefault("No release date", []).append(row)
        # NaN is truthy, so presence of the date has to be tested explicitly.
        if no_year and not no_date:
            warnings.setdefault("No release year, but release date is present", []).append(row)
        if no_year:
            warnings.setdefault("No release year", []).append(row)

        # GENRE
        genres = df.at[row, "Meta - Genres"]
        if is_missing(genres):
            warnings.setdefault("No genres", []).append(row)
        # A single genre given as a string is handled like a one-element list.
        if isinstance(genres, str):
            genres = [genres]
        genres_lower = []
        if isinstance(genres, (list, tuple)):
            genres_lower = [g.lower() for g in genres if isinstance(g, str)]
        if "sport" in genres_lower or "sports" in genres_lower:
            issues.setdefault("Sport is being parsed as genre", []).append(row)

        # COUNTRIES
        if is_missing(df.at[row, "Meta - Countries"]):
            warnings.setdefault("No country", []).append(row)

        # SEASON AND EPISODE NUMBER
        no_season = is_missing(df.at[row, "Meta - Season"])
        if kind == "season" and no_season:
            issues.setdefault("No season number", []).append(row)
        if kind == "episode" and is_missing(df.at[row, "Meta - Episode"]):
            issues.setdefault("No episode number", []).append(row)
        if kind == "episode" and no_season:
            issues.setdefault("No season number on episode level", []).append(row)

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return issues, warnings
| # ###################################################################################################################### | |
| # CREDITS | |
def credits_check(df):
    """Check the ``Credits`` cell of every row in *df*.

    Reads ``Credits``, ``Kind`` and ``External ID``; for episodes also
    ``External Season ID`` and ``External Parent ID``, which are looked up
    against the ``External ID`` of season and show rows.

    A movie without credits is an issue. An episode without credits is only
    a warning, and only when neither its season nor its show row has
    credits. For rows that do have credits, every entry is checked for
    ``role``, ``character_name``, ``name`` and an unexpected ``language``.

    Returns:
        ``(issues, warnings)``; each is a list of
        ``{"type": <label>, "rows": [<index labels>]}`` dicts. A row is
        listed at most once per label.
    """

    def has_entries(value):
        return isinstance(value, (list, tuple)) and len(value) > 0

    def is_blank(value):
        # None, '' and empty containers are falsy; NaN is truthy, so it
        # needs an explicit self-inequality test.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        rows = bucket.setdefault(label, [])
        # Report a row only once per label, however many credits trigger it.
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}

    # Look-up tables from External ID to row label, used by episodes to
    # find their parent season / show.
    season_by_id = {df.at[r, "External ID"]: r for r in df.index if df.at[r, "Kind"] == "season"}
    show_by_id = {df.at[r, "External ID"]: r for r in df.index if df.at[r, "Kind"] == "show"}

    for row in df.index:
        credit = df.at[row, "Credits"]
        kind = df.at[row, "Kind"]

        # 1. No credits on this row: decide by kind, then move on. Without
        #    this early exit the per-credit loop below would iterate a NaN.
        if not has_entries(credit):
            if kind == "movie":
                flag(issues, "Credits are not present in movies", row)
            elif kind == "episode":
                parents = (
                    (df.at[row, "External Season ID"], season_by_id),
                    (df.at[row, "External Parent ID"], show_by_id),
                )
                inherited = any(
                    parent_id in lookup and has_entries(df.at[lookup[parent_id], "Credits"])
                    for parent_id, lookup in parents
                )
                if not inherited:
                    flag(warnings, "Episode and parent Season/Show have no credits", row)
            continue

        # 2. Per-credit field checks; all credits are inspected.
        for c in credit:
            if not isinstance(c, dict):
                # A malformed entry carries none of the required fields.
                c = {}
            if is_blank(c.get("role")):
                flag(issues, "No role in credits", row)
            if is_blank(c.get("character_name")):
                flag(warnings, "No character name in credits", row)
            if is_blank(c.get("name")):
                flag(issues, "No name in credits", row)
            # 3. A language value on a credit is itself an issue.
            if not is_blank(c.get("language")):
                flag(issues, "Language field should not be present in credits", row)

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return issues, warnings
| # ###################################################################################################################### | |
| # QUALIFIERS | |
def qualifiers_check(df):
    """Warn about missing ``Qualifiers - *`` values for every row in *df*.

    A value counts as missing when it is NaN/None or an empty list/tuple.

    Returns:
        A list of ``{"type": <label>, "rows": [<index labels>]}`` dicts.
    """

    def is_missing(value):
        # pd.isna() answers element-wise for list-likes, which cannot be used
        # in an ``if``; a list/tuple cell is missing only when it is empty.
        if isinstance(value, (list, tuple)):
            return len(value) == 0
        flags = pd.isna(value)
        return bool(flags.all()) if hasattr(flags, "all") else bool(flags)

    # (column, warning label) pairs, in reporting order.
    checks = (
        ("Qualifiers - Aspect Ratio", "no_aspect_ratio is being parsed:"),
        ("Qualifiers - Definition", "no defintion is being parsed:"),
        ("Qualifiers - Subtitles", "no subtitles are being parsed:"),
        ("Qualifiers - Audio Languages", "no audio languages are being parsed:"),
        ("Qualifiers - Original Air Date", "no Original Air Date is being parsed:"),
        ("Qualifiers - Sound", "no Sound qualifier is being parsed:"),
    )

    warnings = {}
    for row in df.index:
        for column, label in checks:
            if is_missing(df.at[row, column]):
                warnings.setdefault(label, []).append(row)

    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return warnings
| # ###################################################################################################################### | |
| # ADVISORIES | |
def advisories_check(df):
    """Check the ``Advisories`` cell of every row in *df*.

    A row without advisories is an issue when one of its ``Availabilities``
    zones is ``fi`` or ``no`` (compared case-insensitively), otherwise a
    warning. For rows with advisories, each entry is checked for a
    consistent ``agency`` / ``advisory`` / ``rating`` combination.

    Returns:
        ``(issues, warnings)``; each is a list of
        ``{"type": <label>, "rows": [<index labels>]}`` dicts. A row is
        listed at most once per label.
    """

    def is_blank(value):
        # None, '' and empty containers are falsy; NaN is truthy, so it
        # needs an explicit self-inequality test.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        rows = bucket.setdefault(label, [])
        # Report a row only once per label, however many entries trigger it.
        if not rows or rows[-1] != row:
            rows.append(row)

    mandatory_zones = ("fi", "no")
    issues = {}
    warnings = {}

    for row in df.index:
        advisories = df.at[row, "Advisories"]

        availabilities = df.at[row, "Availabilities"]
        if not isinstance(availabilities, (list, tuple)):
            availabilities = []
        zones = [a.get("zone") for a in availabilities if isinstance(a, dict)]

        # No advisory or parental rating at all.
        if not isinstance(advisories, (list, tuple)) or len(advisories) == 0:
            rating_required = any(
                isinstance(z, str) and z.lower() in mandatory_zones for z in zones
            )
            if rating_required:
                flag(issues, "No advisory and no Parental Rating, for countries where it is mandatory (FI, NO)", row)
            else:
                flag(warnings, "No advisory and no Parental Rating", row)
            continue

        for a in advisories:
            if not isinstance(a, dict):
                # Nothing to cross-check on a malformed entry.
                continue
            has_agency = not is_blank(a.get("agency"))
            has_content = not is_blank(a.get("advisory")) or not is_blank(a.get("rating"))
            # advisory/rating present but no agency
            if has_content and not has_agency:
                flag(warnings, "agency missing when rating or advisory present", row)
            # agency present but no advisory or rating
            if has_agency and not has_content:
                flag(issues, "rating or advisory missing when agency present", row)

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return issues, warnings
| # ###################################################################################################################### | |
| # DEEPLINKS | |
def deeplink_check(df):
    """Check the ``Deeplinks`` cell of every row in *df*.

    A row without deeplinks is a warning for shows and seasons and an issue
    for every other ``Kind``. For rows with deeplinks, every entry is
    checked for ``kind``, ``platform`` and ``zone``, and every platform must
    provide both a ``details`` and a ``player`` deeplink.

    Returns:
        ``(issues, warnings)``; each is a list of
        ``{"type": <label>, "rows": [<index labels>]}`` dicts. A row is
        listed at most once per label.
    """

    def is_blank(value):
        # None, '' and empty containers are falsy; NaN is truthy, so it
        # needs an explicit self-inequality test.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        rows = bucket.setdefault(label, [])
        # Report a row only once per label, however many deeplinks trigger it.
        if not rows or rows[-1] != row:
            rows.append(row)

    required_kinds = {"details", "player"}
    issues = {}
    warnings = {}

    for row in df.index:
        deeplink = df.at[row, "Deeplinks"]
        kind = df.at[row, "Kind"]

        # No deeplinks: tolerated (warning only) on show / season level.
        if not isinstance(deeplink, (list, tuple)) or len(deeplink) == 0:
            bucket = warnings if kind in ("show", "season") else issues
            flag(bucket, "No deeplinks parsed: ", row)
            continue

        # Deeplink kinds seen per platform. Every deeplink must be visited,
        # otherwise the completeness check below reports false positives.
        platform_kinds = {}
        for d in deeplink:
            if not isinstance(d, dict):
                # A malformed entry carries none of the required fields.
                d = {}
            platform = d.get("platform")
            link_kind = d.get("kind")
            if not is_blank(platform):
                platform_kinds.setdefault(platform, set()).add(link_kind)
            if is_blank(link_kind):
                flag(issues, "No kind with deeplink.", row)
            if is_blank(platform):
                flag(issues, "No platform with deeplink.", row)
            if is_blank(d.get("zone")):
                flag(warnings, "No zone with deeplink: ", row)

        for platform, kinds in platform_kinds.items():
            if not required_kinds.issubset(kinds):
                flag(issues, f'For platform {platform} we are missing either "details" or "player" deeplink', row)

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return issues, warnings
| # ###################################################################################################################### | |
| # AVAILABILITY | |
def availability_check(df):
    """Check the ``Availabilities`` cell of every row in *df*.

    Reads ``Availabilities``, ``Kind`` and ``External ID``; for episodes also
    ``External Season ID`` and ``External Parent ID``, which are looked up
    against the ``External ID`` of season and show rows.

    A movie without availability is an issue. An episode without
    availability is an issue only when neither its season nor its show row
    has one. For rows with availability, every entry must have
    ``period_start``, ``period_end`` and ``zone``.

    Returns:
        ``(issues, warnings)``; each is a list of
        ``{"type": <label>, "rows": [<index labels>]}`` dicts. ``warnings``
        is always empty here; it is returned so the result has the same
        shape as the other checks.
    """

    def has_entries(value):
        return isinstance(value, (list, tuple)) and len(value) > 0

    def is_blank(value):
        # None, '' and empty containers are falsy; NaN is truthy, so it
        # needs an explicit self-inequality test.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        rows = bucket.setdefault(label, [])
        # Report a row only once per label, however many entries trigger it.
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}

    # Look-up tables from External ID to row label, used by episodes to
    # find their parent season / show.
    season_by_id = {df.at[r, "External ID"]: r for r in df.index if df.at[r, "Kind"] == "season"}
    show_by_id = {df.at[r, "External ID"]: r for r in df.index if df.at[r, "Kind"] == "show"}

    for row in df.index:
        availability = df.at[row, "Availabilities"]
        kind = df.at[row, "Kind"]

        # 1. No availability at this level: decide by kind / hierarchy.
        if not has_entries(availability):
            if kind == "movie":
                flag(issues, "No availability in movie", row)
            elif kind == "episode":
                parents = (
                    (df.at[row, "External Season ID"], season_by_id),
                    (df.at[row, "External Parent ID"], show_by_id),
                )
                inherited = any(
                    parent_id in lookup and has_entries(df.at[lookup[parent_id], "Availabilities"])
                    for parent_id, lookup in parents
                )
                if not inherited:
                    flag(issues, "Episode and parent Season/Show have no availability", row)
            continue  # nothing to inspect field by field

        # 2. Field-level checks when availability exists.
        for a in availability:
            if not isinstance(a, dict):
                # A malformed entry carries none of the required fields.
                a = {}
            if is_blank(a.get("period_start")) or is_blank(a.get("period_end")):
                flag(issues, "Availability missing start or end date", row)
            if is_blank(a.get("zone")):
                flag(issues, "Availability missing zone", row)

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    warnings = [{"type": label, "rows": rows} for label, rows in warnings.items()]
    return issues, warnings
| # ###################################################################################################################### | |
| # SPORT | |
def sport_check(df):
    """Check rows whose ``Kind`` is ``"sport"``.

    For each such row:
      * ``Sport - League`` must equal ``Title``;
      * every entry in ``Availabilities`` must have ``live_start`` and
        ``live_end`` (reported once per row);
      * in ``IDs``, the first entry whose ``service`` does not contain
        "simply" and whose ``kind`` is not ``"episode"`` is reported.

    Rows of any other kind are ignored. A missing (non-list)
    ``Availabilities`` or ``IDs`` cell is skipped, like an empty list.

    Returns:
        A list of ``{"type": <label>, "rows": [<index labels>]}`` dicts.
    """
    issues = {}

    for row in df.index:
        if df.at[row, "Kind"] != "sport":
            continue

        if df.at[row, "Sport - League"] != df.at[row, "Title"]:
            issues.setdefault("Sport-League does not match Title: ", []).append(row)

        # A NaN cell is not iterable, so only walk real lists/tuples.
        availability = df.at[row, "Availabilities"]
        if isinstance(availability, (list, tuple)):
            for a in availability:
                live = a if isinstance(a, dict) else {}
                if not live.get("live_start") or not live.get("live_end"):
                    issues.setdefault(
                        "No availabilities.live_start or no availabilities.live_end for sport: ", []
                    ).append(row)
                    break  # once per row is enough

        # IDs from services other than "simply" must be of kind "episode".
        ids = df.at[row, "IDs"]
        if isinstance(ids, (list, tuple)):
            for id_dict in ids:
                if not isinstance(id_dict, dict):
                    continue
                service = id_dict.get("service", "")
                id_kind = id_dict.get("kind", "")
                # Only string services can be matched against "simply".
                if not isinstance(service, str) or not service:
                    continue
                if "simply" not in service.lower() and id_kind != "episode":
                    issues.setdefault(
                        f"Sport program type is not equal to episode. It is instead: {id_kind}", []
                    ).append(row)
                    break  # once per row is enough

    issues = [{"type": label, "rows": rows} for label, rows in issues.items()]
    return issues