Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from datetime import datetime | |
| def process_actors_data(keep_alive: bool = True): | |
| current_year = datetime.now().year | |
| # Read actors data | |
| df = pd.read_csv("data/name.basics.tsv", sep="\t") | |
| df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce") | |
| df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce") | |
| # Prepare and cleanup actors data | |
| if keep_alive: | |
| df = df[df["deathYear"].isna()] | |
| df = df[df.knownForTitles.apply(lambda x: len(x)) > 0] | |
| df = df.dropna(subset=["primaryProfession"]) | |
| df = df[df.primaryProfession.apply(lambda x: any([p in {"actor", "actress"} for p in x.split(",")]))] | |
| df = df[df.knownForTitles != "\\N"] | |
| df = df.dropna(subset=["birthYear"]) | |
| #df["knownForTitles"] = df["knownForTitles"].apply(lambda x: x.split(",")) | |
| #dfat = df[["nconst", "knownForTitles"]].explode("knownForTitles") | |
| #dfat.columns = ["nconst", "tconst"] | |
| dfat = pd.read_csv("data/title.principals.tsv.gz", sep="\t") | |
| dfat = dfat[dfat.category.isin(["actor", "actress", "self"])][["tconst", "nconst"]] | |
| # Get data for the movies/shows the actors were known for | |
| dftr = pd.read_csv("data/title.ratings.tsv", sep="\t") | |
| dftb = pd.read_csv("data/title.basics.tsv", sep="\t") | |
| dftb["startYear"] = pd.to_numeric(dftb["startYear"], errors="coerce") | |
| dftb["endYear"] = pd.to_numeric(dftb["endYear"], errors="coerce") | |
| # Estimate last year the show/movie was released (TV shows span several years and might still be active) | |
| dftb.loc[(dftb.titleType.isin(["tvSeries", "tvMiniSeries"]) & (dftb.endYear.isna())), "lastYear"] = current_year | |
| dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"]) | |
| dftb = dftb.dropna(subset=["lastYear"]) | |
| dftb = dftb[dftb.isAdult == 0] | |
| # Aggregate stats for all movies the actor was known for | |
| dft = pd.merge(dftb, dftr, how="inner", on="tconst") | |
| del dftb, dftr | |
| dfat = pd.merge(dfat, dft, how="inner", on="tconst") | |
| del dft | |
| dfat["totalRating"] = dfat.averageRating*dfat.numVotes | |
| dfat = dfat.groupby("nconst").agg({"averageRating": "mean", "totalRating": "sum", "numVotes": "sum", "tconst": "count", "startYear": "min", "lastYear": "max"}) | |
| # Merge everything with actor data and cleanup | |
| df = df.drop(["deathYear", "knownForTitles", "primaryProfession"], axis=1) | |
| df = pd.merge(df, dfat, how="inner", on="nconst").sort_values("totalRating", ascending=False) | |
| df = df.dropna(subset=["birthYear", "startYear", "lastYear"]) | |
| df[["birthYear", "startYear", "lastYear"]] = df[["birthYear", "startYear", "lastYear"]].astype(int) | |
| df = df.round(2) | |
| return df | |
| if __name__ == "__main__": | |
| df = process_actors_data() | |
| df.to_csv("data/imdb_actors.csv", index=False) |