File size: 5,078 Bytes
fd31c97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3e37b9
fd31c97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3e37b9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
import os
import ast

# All dataset files live in one folder; each path below is that folder's
# prefix followed by a file name.
DATA_FOLDER_LOCATION = "course_project/data/"
# Raw inputs read by the load_* functions.
MOVIES_METADATA_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}movies_metadata.csv"
CREDITS_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}credits.csv"
RATINGS_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}title.ratings.tsv"
# Merged dataset written/read back by read_or_generate_movies_data.
MOVIES_DATA_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}javiers_movies_data.csv"

def turn_numeric_or_warn(df, column_name):
    """Convert ``df[column_name]`` to a numeric dtype in place, or report why not.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to convert with ``pd.to_numeric``.

    Returns:
        None. On success the column of ``df`` is replaced by its numeric
        version. If any value cannot be parsed, the offending rows are
        printed and ``df`` is left unchanged.
    """
    try:
        converted = pd.to_numeric(df[column_name])
    except ValueError:
        # Catch the error and show the problematic records. Values that were
        # already missing are not conversion failures, so they are excluded.
        coerced = pd.to_numeric(df[column_name], errors="coerce")
        problematic_datapoints = df[coerced.isna() & df[column_name].notna()]
        print("Problematic Datapoints:")
        print(problematic_datapoints)
        return

    # Plain column assignment swaps the whole column, so the new numeric
    # dtype is kept. ``df.loc[:, column_name] = ...`` writes into the
    # existing column instead and can leave it with its old (object) dtype.
    df[column_name] = converted

def turn_boolean_or_warn(df, column_name):
    """Convert a column of ``True``/``False`` labels to booleans in place.

    Accepted values are the booleans ``True``/``False`` and the exact strings
    ``"True"``/``"False"``. ``astype(bool)`` is deliberately not used on text:
    it applies Python truthiness, so every non-empty string (including
    ``"False"``) would become ``True`` and no error would ever be raised.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to convert.

    Returns:
        None. On success the column of ``df`` is replaced by a boolean
        column. If any value is not an accepted label, the offending rows
        are printed and ``df`` is left unchanged.
    """
    column = df[column_name]
    if pd.api.types.is_bool_dtype(column):
        # Already boolean: nothing to convert.
        return

    # Compare on the textual form so that True and "True" are treated alike.
    labels = column.astype(str)
    is_valid = labels.isin(["True", "False"])
    if not is_valid.all():
        # Show the problematic records instead of silently coercing them.
        print("Problematic Datapoints:")
        print(df[~is_valid])
        return

    df[column_name] = labels == "True"

def get_genres_set(df):
    """Return the distinct values found inside the lists of ``df["genres"]``.

    Each distinct cell is rendered as text and parsed back as a Python
    literal, so the column may hold either lists or their string form.

    Args:
        df: DataFrame with a ``"genres"`` column.

    Returns:
        The unique list elements as strings, in order of first appearance
        (the result of ``Series.unique``).
    """
    # Parse each distinct cell only once.
    distinct_cells = df["genres"].astype(str).unique()
    parsed_lists = pd.Series(distinct_cells).apply(ast.literal_eval)

    # One row per list element; empty lists explode to NaN and are dropped.
    flattened = parsed_lists.explode().dropna()
    return flattened.astype("str").unique()

def prepare_movies_metadata(movies_metadata):
    """Select, clean and type the movie metadata columns used downstream.

    Args:
        movies_metadata: Raw metadata DataFrame containing at least the
            columns listed in ``needed_fields``.

    Returns:
        A new DataFrame restricted to the needed columns, without rows that
        miss any of them. ``popularity``, ``budget`` and ``id`` are passed
        through ``turn_numeric_or_warn``, ``adult`` through
        ``turn_boolean_or_warn``, and ``genres`` is replaced by the output
        of ``format_genres``.
    """
    # Dropping registers without required fields
    needed_fields = ["adult", "budget", "genres", "id", "imdb_id", "original_title",
                    "overview", "popularity", "poster_path", "revenue", "runtime", "title"]
    # .copy() gives an independent frame, so the in-place conversions below
    # never act on a slice derived from the caller's DataFrame.
    movies_metadata = movies_metadata[needed_fields].dropna().copy()

    for numeric_column in ("popularity", "budget", "id"):
        turn_numeric_or_warn(movies_metadata, numeric_column)
    turn_boolean_or_warn(movies_metadata, "adult")

    # Plain column assignment replaces the column with the parsed lists;
    # ``.loc[:, "genres"] = ...`` would write into the existing column.
    movies_metadata["genres"] = format_genres(movies_metadata)

    return movies_metadata

def load_movies_metadata():
    """Read the movies metadata CSV and return it after preparation.

    Returns:
        The DataFrame produced by ``prepare_movies_metadata`` for the file at
        ``MOVIES_METADATA_FILE_LOCATION``.
    """
    # low_memory=False makes pandas read the file in one pass.
    raw_metadata = pd.read_csv(MOVIES_METADATA_FILE_LOCATION, low_memory=False)
    prepared = prepare_movies_metadata(raw_metadata)
    return prepared

def prepare_credits(credits):
    """Reduce raw credits to cast names and director names.

    The ``cast`` and ``crew`` cells are expected to be string representations
    of lists of dicts; they are parsed with ``ast.literal_eval``.

    Args:
        credits: Raw credits DataFrame with ``cast`` and ``crew`` columns.
            It is not modified.

    Returns:
        A new DataFrame where ``cast`` is a list of the ``"name"`` of every
        cast entry, ``director`` is a list of the ``"name"`` of every crew
        entry whose ``"job"`` is "director" (case-insensitive), ``crew`` is
        removed, and rows with missing values or no director are dropped.
    """
    # Drop incomplete rows *before* parsing: ast.literal_eval cannot handle a
    # missing (NaN) cell. The copy keeps the caller's DataFrame untouched.
    credits = credits.dropna().copy()

    parsed_cast = credits["cast"].apply(ast.literal_eval)
    parsed_crew = credits["crew"].apply(ast.literal_eval)

    # Leaving only cast names
    credits["cast"] = parsed_cast.apply(lambda cast: [actor["name"] for actor in cast])

    # Retrieving director information
    credits["director"] = parsed_crew.apply(
        lambda crew: [crewmate["name"] for crewmate in crew
                      if crewmate["job"].lower() == "director"]
    )

    # Crew column is not needed anymore
    credits = credits.drop(["crew"], axis=1)

    # Dropping registers without a Director
    return credits[credits["director"].apply(len) > 0]

def load_credits():
    """Read the credits CSV and return it after preparation.

    Returns:
        The DataFrame produced by ``prepare_credits`` for the file at
        ``CREDITS_FILE_LOCATION``.
    """
    raw_credits = pd.read_csv(CREDITS_FILE_LOCATION)
    prepared = prepare_credits(raw_credits)
    return prepared

def prepare_ratings(ratings):
    """Rename the rating columns to the project's names and drop vote counts.

    Args:
        ratings: DataFrame with ``tconst``, ``averageRating`` and
            ``numVotes`` columns. It is not modified.

    Returns:
        A new DataFrame where ``tconst`` is renamed to ``imdb_id``,
        ``averageRating`` to ``rating``, and ``numVotes`` is removed.
    """
    column_names = {"tconst": "imdb_id", "averageRating": "rating"}
    without_votes = ratings.drop(columns=["numVotes"])
    return without_votes.rename(columns=column_names)

def load_ratings():
    """Read the tab-separated ratings file and return it after preparation.

    Returns:
        The DataFrame produced by ``prepare_ratings`` for the file at
        ``RATINGS_FILE_LOCATION``.
    """
    raw_ratings = pd.read_csv(RATINGS_FILE_LOCATION, sep='\t')
    prepared = prepare_ratings(raw_ratings)
    return prepared

def generate_movies_data():
    """Build the merged movies dataset from metadata, credits and ratings.

    Progress messages are printed before each step.

    Returns:
        A DataFrame holding the inner join of metadata and credits on ``id``,
        inner-joined with ratings on ``imdb_id``, keeping only the first row
        for each ``id``.
    """
    print("Loading Movies Metadata")
    metadata_frame = load_movies_metadata()

    print("Loading Credits")
    credits_frame = load_credits()

    print("Merging")
    combined = metadata_frame.merge(credits_frame, on="id", how="inner")

    print("Loading ratings")
    ratings_frame = load_ratings()
    combined = combined.merge(ratings_frame, on="imdb_id", how="inner")

    # The joins can yield several rows for one movie id; keep the first.
    return combined.drop_duplicates(subset=['id'])

def format_genres(df):
    """Turn the string-encoded genre dicts of ``df["genres"]`` into name lists.

    Args:
        df: DataFrame whose ``"genres"`` cells are string representations of
            lists of dicts, each with a ``"name"`` key.

    Returns:
        A Series, aligned with ``df``, where each cell is the list of genre
        names of that row, skipping names equal to the string ``"nan"``.
    """
    def _genre_names(raw_cell):
        # Parse the literal, then keep only the usable names.
        entries = ast.literal_eval(raw_cell)
        return [entry["name"] for entry in entries if entry["name"] != "nan"]

    return df["genres"].apply(_genre_names)

def read_or_generate_movies_data():
    """Return the merged movies dataset, building and saving it if missing.

    If the file at ``MOVIES_DATA_FILE_LOCATION`` exists it is read with
    ``pd.read_csv``. Otherwise the dataset is built with
    ``generate_movies_data`` and written to that location without the index.

    Returns:
        The movies DataFrame (read from disk or freshly generated).
    """
    if not os.path.exists(MOVIES_DATA_FILE_LOCATION):
        # No saved copy yet: build the dataset and store it for next time.
        generated = generate_movies_data()
        generated.to_csv(MOVIES_DATA_FILE_LOCATION, index=False)
        print("Movies Database generated and saved successfully.")
        return generated

    stored = pd.read_csv(MOVIES_DATA_FILE_LOCATION)
    print("Movies Database retrieved successfully.")
    return stored

def format_list_of_strings(register):
    """Render a list of strings as one comma-separated string.

    Accepts either a real list/tuple or its string representation (for
    example ``"['a', 'b']"``). Joining the actual items, rather than slicing
    quotes off the ``repr``, keeps items containing an apostrophe intact:
    Python switches such items to double quotes in the ``repr``.

    Args:
        register: A list/tuple of values, or a string holding the literal
            representation of one.

    Returns:
        The items joined by ``", "`` (an empty string for an empty list).
        Inputs that are neither a list/tuple nor parseable as one fall back
        to the previous textual stripping.
    """
    items = register
    if isinstance(register, str):
        try:
            # literal_eval only evaluates Python literals, never arbitrary code.
            items = ast.literal_eval(register)
        except (ValueError, SyntaxError):
            items = None

    if isinstance(items, (list, tuple)):
        return ", ".join(str(item) for item in items)

    # Legacy behavior for anything that is not a list.
    return (str(register)).replace("', '", ", ")[2:-2]

# Script entry point: load (or build) the dataset and print a quick overview.
if __name__ == "__main__":
    movies_data = read_or_generate_movies_data()
    print("Data info: ")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line.
    movies_data.info()
    print("\n\n----------\n\n")
    print("Data description: ")
    print(movies_data.describe())
    print("\n\n----------\n\n")
    print("Data preview: ")
    print(movies_data)
    # NOTE(review): the disabled debugging snippets below use
    # get_imdb_rating_by_id and movies_metadata, which are not defined in the
    # visible part of this module - confirm before re-enabling.
    # print("\n\n----------\n\n")
    # print("The Show Rating: ")
    # print(get_imdb_rating_by_id("0114435"))
    # print("\n\n----------\n\n")
    # print("Movies metadata. ID 56088: ")
    # print(movies_metadata[movies_metadata["id"] == 56088])
    # print(movies_metadata[movies_metadata["id"] == 56088]["imdb_id"])