# catalog-review / functions_ui_version.py
# mbecchis's picture
# Upload 2 files
# 398c5d8 verified
import pandas as pd
# MISSING VALUES CHECK
# we need to check two levels of missing values:
# level 1 - csv field is empty.
# since we are converting it to a pandas df a simple pd.isna() should do the trick.
# level 2 - csv field is there but json field inside is empty or missing values or null.
# we need to get more creative.
# for example if titles field is there but the json inside is:
# 2.a []
# 2.b [{"kind": "", "language": "", "text": ""}]
# 2.c [{"kind": "local", "language": null, "text": "some title"}]
# 2.d [{"kind": "local", "text": "some title"}] (no language key at all)
# if not isinstance(titles, (list, tuple)) or len(titles) == 0:
# handles 2.a
# if not t.get("kind"):
# catches None, '', and NaN
# handles 2.b, 2.c and 2.d
# ######################################################################################################################
# TITLES
def title_check(df, catalog_language: str, other_languages: list):
    """Validate the "Titles" cell of every row in ``df``.

    Each cell is expected to hold a list (or tuple) of dicts carrying the
    keys ``kind``, ``language`` and ``text``. Entries that are not dicts are
    treated as having every field empty.

    Args:
        df: DataFrame with a "Titles" column.
        catalog_language: Main language of the catalogue.
        other_languages: Additional languages accepted next to the main one.

    Returns:
        A tuple ``(issues, warnings)``. Both are lists of
        ``{"type": <message>, "rows": [<index label>, ...]}`` dicts. A row is
        listed at most once per message.
    """

    def is_blank(value):
        # Truthiness covers None and ''; NaN is truthy, so test it explicitly.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        # Rows are handled one after the other, so a repeated report for the
        # current row can only be the last element of the list.
        rows = bucket.setdefault(label, [])
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}
    allowed_languages = set(other_languages or ()) | {catalog_language}

    for row in df.index:
        titles = df.at[row, "Titles"]
        if not isinstance(titles, (list, tuple)) or not titles:
            flag(issues, "Missing title list", row)
            continue

        # (text, language) pairs kept for the duplicate checks below.
        local_titles = []
        original_titles = []

        for raw in titles:
            entry = raw if isinstance(raw, dict) else {}
            kind = entry.get("kind")
            language = entry.get("language")
            text = entry.get("text")

            # 1. Empty fields
            if is_blank(kind):
                flag(issues, "Empty title kind", row)
            if is_blank(language):
                flag(issues, "Empty title language", row)
            elif language not in allowed_languages:
                # 2. Unexpected languages (not in catalogue). A blank
                # language is already reported above, so it is not repeated
                # here as "unwanted_language: None".
                flag(warnings, f"unwanted_language: {language}", row)
            if is_blank(text):
                flag(issues, "Empty title text", row)

            # 3. Save for duplicate check
            if kind == "local":
                local_titles.append((text, language))
            elif kind == "original":
                original_titles.append((text, language))

        # 4. Compare locals vs originals
        if any(pair in original_titles for pair in local_titles):
            flag(issues, "Original title same as local", row)

        # 5. More than one original language --> issue
        original_langs = {lang for _, lang in original_titles if not is_blank(lang)}
        if len(original_langs) > 1:
            flag(issues, "Multiple original languages for titles", row)

        # 6. Same local title more than once with the same language code
        seen = set()
        for pair in local_titles:
            if pair in seen:
                flag(warnings, "Duplicate local title in same language", row)
            else:
                seen.add(pair)

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return issues, warnings
# ######################################################################################################################
# DESCRIPTION
def description_check(df, catalog_language: str, other_languages: list):
    """Validate the "Descriptions" cell of every row in ``df``.

    Each cell is expected to hold a list (or tuple) of dicts carrying the
    keys ``kind``, ``language`` and ``text``. Entries that are not dicts are
    treated as having every field empty.

    Args:
        df: DataFrame with a "Descriptions" column.
        catalog_language: Main language of the catalogue.
        other_languages: Additional languages accepted next to the main one.

    Returns:
        A tuple ``(issues, warnings)``. Both are lists of
        ``{"type": <message>, "rows": [<index label>, ...]}`` dicts. A row is
        listed at most once per message.
    """

    def is_blank(value):
        # Truthiness covers None and ''; NaN is truthy, so test it explicitly.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        # Rows are handled one after the other, so a repeated report for the
        # current row can only be the last element of the list.
        rows = bucket.setdefault(label, [])
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}
    allowed_languages = set(other_languages or ()) | {catalog_language}

    for row in df.index:
        descriptions = df.at[row, "Descriptions"]
        if not isinstance(descriptions, (list, tuple)) or not descriptions:
            flag(issues, "Missing description list", row)
            continue

        # (text, language) pairs kept for the duplicate checks below.
        local_descriptions = []
        original_descriptions = []

        for raw in descriptions:
            entry = raw if isinstance(raw, dict) else {}
            kind = entry.get("kind")
            language = entry.get("language")
            text = entry.get("text")

            # 1. Empty fields
            if is_blank(kind):
                flag(issues, "Empty description kind", row)
            if is_blank(language):
                flag(issues, "Empty description language", row)
            elif language not in allowed_languages:
                # 2. Unexpected languages (not in catalogue). A blank
                # language is already reported above and is not repeated.
                flag(warnings, f"Unexpected languages: {language}", row)
            if is_blank(text):
                flag(issues, "Empty description text", row)

            # 3. Save for duplicate check
            if kind == "local":
                local_descriptions.append((text, language))
            elif kind == "original":
                original_descriptions.append((text, language))

            # 4. Too long descriptions (only meaningful for real strings)
            if isinstance(text, str) and len(text.split()) > 1000:
                flag(issues, "description too long (>1000 words)", row)

        # 5. Compare locals vs originals
        if any(pair in original_descriptions for pair in local_descriptions):
            flag(issues, "Original description same as local", row)

        # 6. More than one original language --> issue
        original_langs = {
            lang for _, lang in original_descriptions if not is_blank(lang)
        }
        if len(original_langs) > 1:
            flag(issues, "Multiple original languages for descriptions", row)

        # 7. Same local description more than once with the same language code
        seen = set()
        for pair in local_descriptions:
            if pair in seen:
                flag(warnings, "Duplicate local descriptions in same language", row)
            else:
                seen.add(pair)

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return issues, warnings
# ######################################################################################################################
# CROSS-FIELD CHECKS
def language_alignment_check(df):
    """Warn when a row's titles and descriptions cover different languages.

    Cells of the "Titles" and "Descriptions" columns that are not a list or
    tuple (for example NaN for an empty CSV field) count as having no
    languages at all instead of being iterated.

    Args:
        df: DataFrame with "Titles" and "Descriptions" columns.

    Returns:
        A list of ``{"type": <message>, "rows": [<index label>, ...]}``
        dicts. The message ends with the languages present on only one of
        the two sides, written in sorted order so that equal mismatches
        always produce the same message.
    """

    def languages_of(cell):
        # Collect the non-empty language codes of one cell.
        if not isinstance(cell, (list, tuple)):
            return set()
        found = set()
        for entry in cell:
            if not isinstance(entry, dict):
                continue
            lang = entry.get("language")
            # NaN is truthy, so it has to be excluded explicitly.
            if lang and not (isinstance(lang, float) and lang != lang):
                found.add(lang)
        return found

    warnings = {}
    for row in df.index:
        title_langs = languages_of(df.at[row, "Titles"])
        desc_langs = languages_of(df.at[row, "Descriptions"])
        if title_langs == desc_langs:
            continue
        # Symmetric difference: languages present on one side only.
        only_one_side = sorted(title_langs ^ desc_langs, key=str)
        rendered = "{" + ", ".join(repr(lang) for lang in only_one_side) + "}"
        warnings.setdefault(
            f"Mismatch between title and description languages --> {rendered}", []
        ).append(row)

    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return warnings
# ######################################################################################################################
# META / ATTRIBUTES
def meta_check(df):
    """Check the "Meta - ..." attribute columns of every row in ``df``.

    Reads the columns "Kind", "Meta - Duration", "Meta - Release Date",
    "Meta - Release Year", "Meta - Genres", "Meta - Countries",
    "Meta - Season" and "Meta - Episode".

    Args:
        df: DataFrame holding the columns listed above.

    Returns:
        A tuple ``(issues, warnings)``. Both are lists of
        ``{"type": <message>, "rows": [<index label>, ...]}`` dicts.
    """

    def is_missing(value):
        # pd.isna() returns an array for list-like input, which cannot be
        # used in an ``if``; only scalar cells can be "missing" here.
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    issues = {}
    warnings = {}

    for row in df.index:
        kind = df.at[row, "Kind"]

        # DURATION
        # movies and episodes are expected to carry a duration
        if kind in ("movie", "episode") and is_missing(df.at[row, "Meta - Duration"]):
            warnings.setdefault("No duration", []).append(row)

        # RELEASE DATE AND RELEASE YEAR
        release_date = df.at[row, "Meta - Release Date"]
        date_missing = is_missing(release_date)
        year_missing = is_missing(df.at[row, "Meta - Release Year"])
        if date_missing:
            warnings.setdefault("No release date", []).append(row)
        # NaN is truthy, so "date is present" must be tested explicitly.
        if year_missing and not date_missing and release_date:  # easy fix
            warnings.setdefault(
                "No release year, but release date is present", []
            ).append(row)
        if year_missing:
            warnings.setdefault("No release year", []).append(row)

        # GENRE
        genres = df.at[row, "Meta - Genres"]
        if is_missing(genres) or (isinstance(genres, (list, tuple)) and not genres):
            warnings.setdefault("No genres", []).append(row)
        # a single genre given as a string is handled like a one-item list
        if isinstance(genres, str):
            genres = [genres]
        genres_lower = set()
        if isinstance(genres, (list, tuple)):
            genres_lower = {g.lower() for g in genres if isinstance(g, str)}
        # sport or sports parsed as genre
        if genres_lower & {"sport", "sports"}:
            issues.setdefault("Sport is being parsed as genre", []).append(row)

        # COUNTRIES
        if is_missing(df.at[row, "Meta - Countries"]):
            warnings.setdefault("No country", []).append(row)

        # SEASON AND EPISODE NUMBER
        season_missing = is_missing(df.at[row, "Meta - Season"])
        if kind == "season" and season_missing:
            issues.setdefault("No season number", []).append(row)
        if kind == "episode" and is_missing(df.at[row, "Meta - Episode"]):
            issues.setdefault("No episode number", []).append(row)
        if kind == "episode" and season_missing:
            issues.setdefault("No season number on episode level", []).append(row)

        # TODO: "part no" is not checked yet; its meaning is still unclear.

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return issues, warnings
# ######################################################################################################################
# CREDITS
def credits_check(df):
    """Check the "Credits" cell of every row in ``df``.

    A row without credits (cell is not a non-empty list/tuple) is reported
    for movies; for episodes it is only reported when neither the season
    referenced by "External Season ID" nor the show referenced by
    "External Parent ID" carries credits. Rows without credits are never
    inspected further. Rows with credits get each entry checked for
    ``role``, ``name``, ``character_name`` and an unexpected ``language``.

    Args:
        df: DataFrame with the columns "Credits", "Kind", "External ID",
            "External Season ID" and "External Parent ID".

    Returns:
        A tuple ``(issues, warnings)``. Both are lists of
        ``{"type": <message>, "rows": [<index label>, ...]}`` dicts. A row is
        listed at most once per message.
    """

    def is_blank(value):
        # Truthiness covers None and ''; NaN is truthy, so test it explicitly.
        return not value or (isinstance(value, float) and value != value)

    def has_entries(cell):
        return isinstance(cell, (list, tuple)) and len(cell) > 0

    def flag(bucket, label, row):
        # Report a row only once per message.
        rows = bucket.setdefault(label, [])
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}

    # Look-up tables used by lower-level assets to find their parents.
    season_by_id = {}
    show_by_id = {}
    for row in df.index:
        row_kind = df.at[row, "Kind"]
        if row_kind == "season":
            season_by_id[df.at[row, "External ID"]] = row
        elif row_kind == "show":
            show_by_id[df.at[row, "External ID"]] = row

    for row in df.index:
        credit_list = df.at[row, "Credits"]
        kind = df.at[row, "Kind"]

        # 1. Credits missing at this level: look at the hierarchy instead.
        if not has_entries(credit_list):
            if kind == "movie":
                flag(issues, "Credits are not present in movies", row)
            elif kind == "episode":
                season_id = df.at[row, "External Season ID"]
                show_id = df.at[row, "External Parent ID"]
                season_has = season_id in season_by_id and has_entries(
                    df.at[season_by_id[season_id], "Credits"]
                )
                show_has = show_id in show_by_id and has_entries(
                    df.at[show_by_id[show_id], "Credits"]
                )
                # Credits on either parent are good enough for the episode.
                if not (season_has or show_has):
                    flag(
                        warnings,
                        "Episode and parent Season/Show have no credits",
                        row,
                    )
            # Nothing to inspect below; this also keeps NaN/None cells away
            # from the entry loop.
            continue

        # 2. role, name, character name not there
        for raw in credit_list:
            entry = raw if isinstance(raw, dict) else {}
            if is_blank(entry.get("role")):
                flag(issues, "No role in credits", row)
            if is_blank(entry.get("character_name")):
                flag(warnings, "No character name in credits", row)
            if is_blank(entry.get("name")):
                flag(issues, "No name in credits", row)
            # 3. A language on a credit is an issue. The loop keeps going so
            # that the remaining credits are still checked.
            if not is_blank(entry.get("language")):
                flag(issues, "Language field should not be present in credits", row)

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return issues, warnings
# ######################################################################################################################
# QUALIFIERS
def qualifiers_check(df):
    """Warn about qualifier columns that are empty for a row.

    A cell counts as empty only when it is a scalar missing value (NaN,
    None, ...). List-like cells are treated as present; calling
    ``pd.isna`` on them yields an array, which cannot be used in an ``if``.

    Args:
        df: DataFrame with the "Qualifiers - ..." columns listed in
            ``checks`` below.

    Returns:
        A list of ``{"type": <message>, "rows": [<index label>, ...]}``
        dicts.
    """
    # (column, warning message) pairs, checked in this order for every row.
    checks = (
        ("Qualifiers - Aspect Ratio", "no_aspect_ratio is being parsed:"),
        ("Qualifiers - Definition", "no defintion is being parsed:"),
        ("Qualifiers - Subtitles", "no subtitles are being parsed:"),
        ("Qualifiers - Audio Languages", "no audio languages are being parsed:"),
        ("Qualifiers - Original Air Date", "no Original Air Date is being parsed:"),
        ("Qualifiers - Sound", "no Sound qualifier is being parsed:"),
    )

    warnings = {}
    for row in df.index:
        for column, message in checks:
            value = df.at[row, column]
            if pd.api.types.is_scalar(value) and pd.isna(value):
                warnings.setdefault(message, []).append(row)

    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return warnings
# ######################################################################################################################
# ADVISORIES
def advisories_check(df):
    """Check the "Advisories" cell of every row in ``df``.

    A row without advisories is an issue when one of its availability zones
    is "fi" or "no" (compared case-insensitively) and a warning otherwise.
    Rows with advisories get every dict entry checked for a consistent
    ``agency`` / ``advisory`` / ``rating`` combination; entries that are not
    dicts are skipped.

    Args:
        df: DataFrame with "Advisories" and "Availabilities" columns.

    Returns:
        A tuple ``(issues, warnings)``. Both are lists of
        ``{"type": <message>, "rows": [<index label>, ...]}`` dicts. A row is
        listed at most once per message.
    """

    def is_blank(value):
        # Truthiness covers None and ''; NaN is truthy, so test it explicitly.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        # Report a row only once per message.
        rows = bucket.setdefault(label, [])
        if not rows or rows[-1] != row:
            rows.append(row)

    mandatory_zones = ("fi", "no")
    issues = {}
    warnings = {}

    for row in df.index:
        advisories = df.at[row, "Advisories"]

        # Zones decide whether a parental rating is mandatory (FI, NO).
        availabilities = df.at[row, "Availabilities"]
        if not isinstance(availabilities, (list, tuple)):
            availabilities = []
        zones = [a.get("zone") for a in availabilities if isinstance(a, dict)]

        # no advisory or parental rating: not a list/tuple, or empty
        if not isinstance(advisories, (list, tuple)) or not advisories:
            rating_required = any(
                isinstance(zone, str) and zone.lower() in mandatory_zones
                for zone in zones
            )
            if rating_required:
                flag(
                    issues,
                    "No advisory and no Parental Rating, for countries where it is mandatory (FI, NO)",
                    row,
                )
            else:
                flag(warnings, "No advisory and no Parental Rating", row)
            continue

        for entry in advisories:
            if not isinstance(entry, dict):
                continue
            has_agency = not is_blank(entry.get("agency"))
            has_content = not is_blank(entry.get("advisory")) or not is_blank(
                entry.get("rating")
            )
            # advisory/rating present but no agency
            if has_content and not has_agency:
                flag(warnings, "agency missing when rating or advisory present", row)
            # agency present but no advisory or rating
            if has_agency and not has_content:
                flag(issues, "rating or advisory missing when agency present", row)

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return issues, warnings
# ######################################################################################################################
# DEEPLINKS
def deeplink_check(df):
    """Check the "Deeplinks" cell of every row in ``df``.

    A row without deeplinks is a warning for shows and seasons and an issue
    for every other kind. Rows with deeplinks get each entry checked for
    ``kind``, ``platform`` and ``zone``, and every platform seen on the row
    must offer both a "details" and a "player" deeplink. Entries that are
    not dicts are treated as having every field empty.

    Args:
        df: DataFrame with "Deeplinks" and "Kind" columns.

    Returns:
        A tuple ``(issues, warnings)``. Both are lists of
        ``{"type": <message>, "rows": [<index label>, ...]}`` dicts. A row is
        listed at most once per message.
    """

    def is_blank(value):
        # Truthiness covers None and ''; NaN is truthy, so test it explicitly.
        return not value or (isinstance(value, float) and value != value)

    def flag(bucket, label, row):
        # Report a row only once per message.
        rows = bucket.setdefault(label, [])
        if not rows or rows[-1] != row:
            rows.append(row)

    required_kinds = {"details", "player"}
    issues = {}
    warnings = {}

    for row in df.index:
        deeplinks = df.at[row, "Deeplinks"]
        kind = df.at[row, "Kind"]

        # No deeplinks
        if not isinstance(deeplinks, (list, tuple)) or not deeplinks:
            if kind in ("show", "season"):
                flag(warnings, "No deeplinks parsed: ", row)
            else:
                flag(issues, "No deeplinks parsed: ", row)
            continue

        # kinds offered per platform on this row
        platform_kinds = {}

        for raw in deeplinks:
            entry = raw if isinstance(raw, dict) else {}
            platform = entry.get("platform")
            link_kind = entry.get("kind")

            if not is_blank(platform):
                platform_kinds.setdefault(platform, set()).add(link_kind)

            # No kind
            if is_blank(link_kind):
                flag(issues, "No kind with deeplink.", row)
            # No platform
            if is_blank(platform):
                flag(issues, "No platform with deeplink.", row)
            # No zone. The loop must keep going: stopping here would leave
            # platform_kinds incomplete and trigger false reports below.
            if is_blank(entry.get("zone")):
                flag(warnings, "No zone with deeplink: ", row)

        # TODO: pricing model is not checked yet.

        # every platform needs both a details and a player deeplink
        for platform, kinds in platform_kinds.items():
            if not required_kinds.issubset(kinds):
                flag(
                    issues,
                    f'For platform {platform} we are missing either "details" or "player" deeplink',
                    row,
                )

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return issues, warnings
# ######################################################################################################################
# AVAILABILITY
def availability_check(df):
    """Check the "Availabilities" cell of every row in ``df``.

    A row without availabilities (cell is not a non-empty list/tuple) is an
    issue for movies; for episodes it is an issue only when neither the
    season referenced by "External Season ID" nor the show referenced by
    "External Parent ID" carries availabilities. Rows with availabilities
    get each entry checked for ``period_start``, ``period_end`` and
    ``zone``. Entries that are not dicts are treated as having every field
    empty.

    Args:
        df: DataFrame with the columns "Availabilities", "Kind",
            "External ID", "External Season ID" and "External Parent ID".

    Returns:
        A tuple ``(issues, warnings)``. Both are lists of
        ``{"type": <message>, "rows": [<index label>, ...]}`` dicts; this
        check never produces warnings, so the second list is always empty.
    """

    def is_blank(value):
        # Truthiness covers None and ''; NaN is truthy, so test it explicitly.
        return not value or (isinstance(value, float) and value != value)

    def has_entries(cell):
        return isinstance(cell, (list, tuple)) and len(cell) > 0

    def flag(bucket, label, row):
        # Report a row only once per message.
        rows = bucket.setdefault(label, [])
        if not rows or rows[-1] != row:
            rows.append(row)

    issues = {}
    warnings = {}

    # Look-up tables used by episodes to find their season and show.
    season_by_id = {}
    show_by_id = {}
    for row in df.index:
        row_kind = df.at[row, "Kind"]
        if row_kind == "season":
            season_by_id[df.at[row, "External ID"]] = row
        elif row_kind == "show":
            show_by_id[df.at[row, "External ID"]] = row

    for row in df.index:
        availability = df.at[row, "Availabilities"]
        kind = df.at[row, "Kind"]

        # 1. If no availability at this level, check hierarchy
        if not has_entries(availability):
            if kind == "movie":
                flag(issues, "No availability in movie", row)
            elif kind == "episode":
                season_id = df.at[row, "External Season ID"]
                show_id = df.at[row, "External Parent ID"]
                season_has = season_id in season_by_id and has_entries(
                    df.at[season_by_id[season_id], "Availabilities"]
                )
                show_has = show_id in show_by_id and has_entries(
                    df.at[show_by_id[show_id], "Availabilities"]
                )
                # Availability on either parent covers the episode.
                if not (season_has or show_has):
                    flag(
                        issues,
                        "Episode and parent Season/Show have no availability",
                        row,
                    )
            continue  # skip details since no availability here

        # 2. Field-level checks when availability exists
        for raw in availability:
            entry = raw if isinstance(raw, dict) else {}
            # Require start + end
            if is_blank(entry.get("period_start")) or is_blank(entry.get("period_end")):
                flag(issues, "Availability missing start or end date", row)
            # Require zone
            if is_blank(entry.get("zone")):
                flag(issues, "Availability missing zone", row)

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    warnings = [{"type": warning, "rows": rows} for warning, rows in warnings.items()]
    return issues, warnings
# ######################################################################################################################
# SPORT
def sport_check(df):
    """Run the sport-specific checks on every row whose "Kind" is "sport".

    Checks performed per sport row:
      * "Sport - League" must equal "Title".
      * Every availability must carry ``live_start`` and ``live_end``;
        only the first offending availability is reported.
      * In "IDs", every entry whose ``service`` does not contain "simply"
        must have ``kind == "episode"``; only the first offending entry is
        reported.

    Cells of "Availabilities" / "IDs" that are not a list or tuple, and
    entries inside them that are not dicts, are skipped.

    Args:
        df: DataFrame with the columns "Kind", "Sport - League", "Title",
            "Availabilities" and "IDs".

    Returns:
        A list of ``{"type": <message>, "rows": [<index label>, ...]}``
        dicts.
    """
    issues = {}

    for row in df.index:
        if df.at[row, "Kind"] != "sport":
            continue

        if df.at[row, "Sport - League"] != df.at[row, "Title"]:
            issues.setdefault("Sport-League does not match Title: ", []).append(row)

        # A NaN/None cell must not be iterated, so treat it as "no entries".
        availability = df.at[row, "Availabilities"]
        if not isinstance(availability, (list, tuple)):
            availability = []
        for entry in availability:
            if not isinstance(entry, dict):
                continue
            if not entry.get("live_start") or not entry.get("live_end"):
                issues.setdefault(
                    "No availabilities.live_start or no availabilities.live_end for sport: ",
                    [],
                ).append(row)
                break  # once per program is enough

        # For every ID whose service is not a "simply" one, kind should be
        # "episode".
        ids = df.at[row, "IDs"]
        if not isinstance(ids, (list, tuple)):
            continue
        for id_dict in ids:
            if not isinstance(id_dict, dict):
                continue
            service = id_dict.get("service", "")
            id_kind = id_dict.get("kind", "")
            # Only non-empty string services can be inspected.
            if not isinstance(service, str) or not service:
                continue
            if "simply" not in service.lower() and id_kind != "episode":
                issues.setdefault(
                    f"Sport program type is not equal to episode. It is instead: {id_kind}",
                    [],
                ).append(row)
                break  # once per program is enough

    issues = [{"type": issue, "rows": rows} for issue, rows in issues.items()]
    return issues