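"""Catalogue QA report generator.

Loads a catalogue programs CSV export, runs the content checks from
functions_ui_version (titles, descriptions, meta, credits, qualifiers,
advisories, deeplinks, availabilities, sport), and writes an Excel report
with a per-program overview and an issues/warnings summary sheet.
"""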
import pandas as pd
import json
import os
import regex as re
import numpy as np
from functions_ui_version import (title_check, description_check, meta_check,
                                  credits_check, qualifiers_check, advisories_check,
                                  deeplink_check, availability_check, sport_check,
                                  language_alignment_check)  # , check_images


def prep_csv(file_path, columns_to_json: list):
    """Load a catalogue CSV, parse the JSON-like columns into lists and set 'ID' as the index."""
    df = pd.read_csv(file_path)
    for col in columns_to_json:
        df[col] = df[col].apply(
            # Parse JSON strings; pass tuples/arrays through as lists; everything else becomes [].
            lambda x: list(json.loads(x)) if isinstance(x, str) and pd.notna(x)
            else (list(x) if isinstance(x, (tuple, np.ndarray)) else [])
        )
    df.set_index('ID', inplace=True)
    return df


def prep_csv_st(file_input, columns_to_json: list):
    """Same as prep_csv, but also accepts a Streamlit UploadedFile."""
    import io

    # Handle both UploadedFile objects and plain file paths
    if hasattr(file_input, "getvalue"):  # Streamlit UploadedFile
        file_input.seek(0)  # reset pointer
        df = pd.read_csv(io.BytesIO(file_input.getvalue()))
    else:
        df = pd.read_csv(file_input)

    # === Convert JSON-like columns ===
    for col in columns_to_json:
        df[col] = df[col].apply(
            lambda x: list(json.loads(x)) if isinstance(x, str) and pd.notna(x)
            else (list(x) if isinstance(x, (tuple, np.ndarray)) else [])
        )
    df.set_index('ID', inplace=True)
    return df
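# Minimal usage sketch for prep_csv_st (illustrative only; assumes a Streamlit app
# where `st.file_uploader` returns an UploadedFile or None -- not part of this script):
#
#   uploaded = st.file_uploader("Catalogue CSV", type="csv")
#   if uploaded is not None:
#       df = prep_csv_st(uploaded, ["Titles", "Descriptions", "Credits", "IDs", "Extras",
#                                   "Images", "Advisories", "Deeplinks", "Availabilities"])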


def main():
    # === Inputs ===
    csv_path = input("Enter path to catalogue CSV: ").strip()
    catalog_type = input("Is this catalog in production? (y/n): ").strip()
    catalog_language = input("Type the language code for this catalog: ").strip()
    if catalog_type.lower() == 'y':
        catalog_type = 'production'
    else:
        catalog_type = 'staging'
    other_languages = input("""
List all the language codes we are interested in other than English (en),
separated by commas. Leave blank if none:
""")
    other_languages = [lang.strip() for lang in other_languages.split(",") if lang.strip()]

    # Extract catalogue ID from the file name
    match = re.search(r"catalog_(\d+)_programs", csv_path)
    if match:
        catalog_id = int(match.group(1))
    else:
        print("No catalogue ID found in the file name. Please ensure the file name contains 'catalog_<ID>_programs.csv'.")
        return  # catalog_id is required for the output folder and program URLs
    # === Load Data ===
    columns_to_json = ["Titles", "Descriptions", "Credits", "IDs", "Extras",
                       "Images", "Advisories", "Deeplinks", "Availabilities"]
    df = prep_csv(csv_path, columns_to_json)

    # Filter out deleted programs if the catalog is staging
    if catalog_type == "staging":
        # Items with a non-empty "Deleted At" are dropped. NOTE: not sure yet if we want to keep this filter.
        df = df[(df["Deleted At"].isna()) | (df["Deleted At"] == "")]
    # === Run Checks ===
    print("\n--- Running Checks ---")
    title_issues, title_warnings = title_check(df, catalog_language, other_languages)
    language_alignment_warnings = language_alignment_check(df)
    description_issues, description_warnings = description_check(df, catalog_language, other_languages)
    meta_issues, meta_warnings = meta_check(df)
    credits_issues, credits_warnings = credits_check(df)
    # images_issues = image_check()
    qualifiers_warnings = qualifiers_check(df)
    advisories_issues, advisories_warning = advisories_check(df)
    deeplinks_issues, deeplinks_warning = deeplink_check(df)
    availabilites_issues, availabilites_warning = availability_check(df)
    sport_issues = sport_check(df)

    # === Collect Results ===
    results = {
        "titles": {"issues": title_issues, "warnings": title_warnings + language_alignment_warnings},
        "descriptions": {"issues": description_issues, "warnings": description_warnings},
        "meta": {"issues": meta_issues, "warnings": meta_warnings},
        "credits": {"issues": credits_issues, "warnings": credits_warnings},
        # "images": images_issues,
        "qualifiers": {"issues": [], "warnings": qualifiers_warnings},  # qualifiers_check returns warnings only
        "advisories": {"issues": advisories_issues, "warnings": advisories_warning},
        "deeplinks": {"issues": deeplinks_issues, "warnings": deeplinks_warning},
        "availabilities": {"issues": availabilites_issues, "warnings": availabilites_warning},
        "sport": {"issues": sport_issues, "warnings": []}  # sport_check returns issues only
    }
    # === Print Report ===
    for section, issues in results.items():
        print(f"\n{section.upper()} RESULTS:")
        num_issues = len(issues.get("issues", [])) if isinstance(issues, dict) else 0
        num_warnings = len(issues.get("warnings", [])) if isinstance(issues, dict) else 0
        if num_issues == 0 and num_warnings == 0:
            print("No issues and no warnings found!")
        else:
            print(f"Number of issues: {num_issues}")
            print(f"Number of warnings: {num_warnings}")

    # === Save to JSON file ===  # Kept (commented out) in case we still want a JSON for debugging
    # output_path = os.path.join("outputs", "catalogue_report1.json")
    # os.makedirs("outputs", exist_ok=True)
    # with open(output_path, "w", encoding="utf-8") as f:
    #     json.dump(results, f, indent=4, ensure_ascii=False)
    # === Build per-program issues and warnings for Excel ===
    program_issues = {row: [] for row in df.index}
    program_warnings = {row: [] for row in df.index}

    def collect(results, target):
        # results example: [{"type": "Missing title list", "rows": [1, 5]}, {"type": "No release year", "rows": [2]}]
        for entry in results:
            for row_id in entry["rows"]:  # affected program IDs, e.g. [1, 5] or [2]
                target[row_id].append(entry["type"])  # append the issue/warning type to each affected program
    # Bundle all results into one iterable
    all_checks = [  # NOTE: update this when image issues are added or the structure of the results changes
        (title_issues, program_issues),
        (title_warnings + language_alignment_warnings, program_warnings),
        (description_issues, program_issues),
        (description_warnings, program_warnings),
        (meta_issues, program_issues),
        (meta_warnings, program_warnings),
        (credits_issues, program_issues),
        (credits_warnings, program_warnings),
        (qualifiers_warnings, program_warnings),
        (advisories_issues, program_issues),
        (advisories_warning, program_warnings),
        (deeplinks_issues, program_issues),
        (deeplinks_warning, program_warnings),
        (availabilites_issues, program_issues),
        (availabilites_warning, program_warnings),
        (sport_issues, program_issues)
    ]
    # Iterate through all_checks and assign each issue/warning type to the corresponding program
    issue_summary = []
    for results_group, target in all_checks:
        collect(results_group, target)
        problem_type = "Issue" if target is program_issues else "Warning"
        for entry in results_group:  # second sheet: issues/warnings and the affected programs
            issue_summary.append({
                "Issue/Warning": problem_type,
                "type": entry["type"],
                "count": len(entry["rows"]),
                "programs": entry["rows"]
            })
    issue_summary_df = pd.DataFrame(issue_summary)
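    # Illustrative shape of one issue_summary row (hypothetical values, following the
    # `collect` example above):
    #   {"Issue/Warning": "Issue", "type": "Missing title list", "count": 2, "programs": [1, 5]}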
    # === Build final dataframe for Excel ===
    df_export = df[["Kind", "Title"]].copy()
    df_export.reset_index(inplace=True)  # brings ID back as a column

    # Add issues and warnings columns
    df_export["Issues"] = df_export["ID"].map(lambda i: "; ".join(program_issues[i]) if program_issues[i] else "")
    df_export["Warnings"] = df_export["ID"].map(lambda i: "; ".join(program_warnings[i]) if program_warnings[i] else "")

    # Program URL (production vs. staging host)
    if catalog_type == 'production':
        df_export["URL"] = df_export["ID"].map(lambda program_id: f"https://streaming.simply.tv/catalogs/{catalog_id}/programs/{program_id}")
    else:
        df_export["URL"] = df_export["ID"].map(lambda program_id: f"https://streaming.simply.wtf/catalogs/{catalog_id}/programs/{program_id}")
    # === Save to Excel ===
    date = pd.Timestamp.now().strftime("%Y%m%d_%H%M")  # append date and time to the file name

    # Create output folder for this catalog
    output_folder = os.path.join("outputs", f"catalogue_{catalog_id}")
    os.makedirs(output_folder, exist_ok=True)
    output_excel = os.path.join(output_folder, f"catalogue_{catalog_id}_report_{date}.xlsx")

    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
        df_export.to_excel(writer, index=False, sheet_name="Program Overview")
        issue_summary_df.to_excel(writer, index=False, sheet_name="Issues Summary")
    print(f"\nExcel report saved to {output_excel}")


if __name__ == "__main__":
    main()