import ast
import os

import pandas as pd

DATA_FOLDER_LOCATION = "course_project/data/"
MOVIES_METADATA_FILE_LOCATION = DATA_FOLDER_LOCATION + "movies_metadata.csv"
CREDITS_FILE_LOCATION = DATA_FOLDER_LOCATION + "credits.csv"
RATINGS_FILE_LOCATION = DATA_FOLDER_LOCATION + "title.ratings.tsv"
MOVIES_DATA_FILE_LOCATION = DATA_FOLDER_LOCATION + "javiers_movies_data.csv"


def turn_numeric_or_warn(df: pd.DataFrame, column_name: str) -> None:
    """Convert ``df[column_name]`` to a numeric dtype in place, or warn.

    If every value parses, the column is replaced by its numeric version.
    If ``pd.to_numeric`` raises ``ValueError``, the rows whose value does not
    coerce to a number (already-missing values included) are printed and the
    column is left untouched.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to convert.
    """
    try:
        converted = pd.to_numeric(df[column_name])
    except ValueError:
        # Catch the error and show the offending records.
        coerced = pd.to_numeric(df[column_name], errors="coerce")
        print("Problematic Datapoints:")
        print(df[coerced.isnull()])
        return
    # Replace the column instead of writing through ``.loc[:, col]``: the
    # latter writes into the existing (object) column and may keep its dtype.
    df[column_name] = converted


def turn_boolean_or_warn(df: pd.DataFrame, column_name: str) -> None:
    """Convert ``df[column_name]`` to booleans in place, or warn.

    Accepted values are ``True``/``False`` and the strings ``'True'`` /
    ``'False'``. A plain ``astype(bool)`` is deliberately avoided because any
    non-empty string, including ``'False'``, is truthy.

    If any value is not one of the accepted ones, the offending rows are
    printed and the column is left untouched.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to convert.
    """
    values = df[column_name]
    is_true = values.isin(["True", True])
    is_false = values.isin(["False", False])
    invalid = ~(is_true | is_false)
    if invalid.any():
        # Show the offending records and keep the original column.
        print("Problematic Datapoints:")
        print(df[invalid])
        return
    df[column_name] = is_true


def get_genres_set(df: pd.DataFrame):
    """Return the distinct genre names found in ``df["genres"]``.

    Each cell is stringified and parsed back with ``ast.literal_eval``, so
    the column may hold either lists or their string representation.

    Args:
        df: DataFrame with a ``genres`` column.

    Returns:
        The unique genre names as strings (the result of ``Series.unique``).
    """
    # De-duplicate whole combinations first so each one is parsed only once.
    unique_combinations = pd.Series(df["genres"].astype(str).unique()).apply(ast.literal_eval)
    combinations_exploded = unique_combinations.explode().dropna()
    unique_values_str = combinations_exploded.astype("str").unique()
    return unique_values_str


def prepare_movies_metadata(movies_metadata: pd.DataFrame) -> pd.DataFrame:
    """Select, clean and type the movie metadata columns used downstream.

    Rows missing any needed field are dropped; ``popularity``, ``budget`` and
    ``id`` are converted to numbers, ``adult`` to booleans, and ``genres`` to
    a list of genre names.

    Args:
        movies_metadata: Raw metadata; must contain every needed field.

    Returns:
        A new DataFrame; the input is not modified.
    """
    # Dropping registers without required fields
    needed_fields = [
        "adult", "budget", "genres", "id", "imdb_id", "original_title",
        "overview", "popularity", "poster_path", "revenue", "runtime", "title",
    ]
    # Explicit copy: the helpers below replace columns on this frame.
    movies_metadata = movies_metadata[needed_fields].dropna().copy()

    turn_numeric_or_warn(movies_metadata, "popularity")
    turn_numeric_or_warn(movies_metadata, "budget")
    turn_numeric_or_warn(movies_metadata, "id")
    turn_boolean_or_warn(movies_metadata, "adult")

    movies_metadata["genres"] = format_genres(movies_metadata)
    return movies_metadata


def load_movies_metadata() -> pd.DataFrame:
    """Read the movies metadata CSV and return it prepared."""
    movies_metadata = pd.read_csv(MOVIES_METADATA_FILE_LOCATION, low_memory=False)
    return prepare_movies_metadata(movies_metadata)


def prepare_credits(credits: pd.DataFrame) -> pd.DataFrame:
    """Reduce raw credits to cast names and director names.

    ``cast`` and ``crew`` cells are parsed with ``ast.literal_eval`` and are
    expected to yield lists of dicts with a ``name`` key (crew entries also
    need ``job``).

    Args:
        credits: Raw credits with at least ``cast`` and ``crew`` columns.

    Returns:
        A new DataFrame without ``crew``, with ``cast`` as a list of names and
        a ``director`` list column; rows with missing values or without any
        director are removed. The input DataFrame is not modified.
    """
    parsed_cast = credits["cast"].apply(ast.literal_eval)
    parsed_crew = credits["crew"].apply(ast.literal_eval)

    credits = credits.assign(
        # Leaving only cast names
        cast=parsed_cast.apply(lambda cast: [actor["name"] for actor in cast]),
        # Retrieving director information
        director=parsed_crew.apply(
            lambda crew: [
                crewmate["name"]
                for crewmate in crew
                if crewmate["job"].lower() == "director"
            ]
        ),
    )

    # Crew column is not needed anymore
    credits = credits.drop(["crew"], axis=1)

    # Dropping NaNs and registers without a director
    credits = credits.dropna()
    credits = credits[credits["director"].apply(len) > 0]
    return credits


def load_credits() -> pd.DataFrame:
    """Read the credits CSV and return it prepared."""
    credits = pd.read_csv(CREDITS_FILE_LOCATION)
    return prepare_credits(credits)


def prepare_ratings(ratings: pd.DataFrame) -> pd.DataFrame:
    """Rename ``tconst``/``averageRating`` to ``imdb_id``/``rating`` and drop ``numVotes``."""
    ratings = ratings.rename(
        columns={'tconst': 'imdb_id', 'averageRating': 'rating'}
    ).drop(["numVotes"], axis=1)
    return ratings


def load_ratings() -> pd.DataFrame:
    """Read the tab-separated ratings file and return it prepared."""
    ratings = pd.read_csv(RATINGS_FILE_LOCATION, delimiter='\t')
    return prepare_ratings(ratings)


def generate_movies_data() -> pd.DataFrame:
    """Build the movies dataset from the metadata, credits and ratings files.

    Metadata and credits are inner-joined on ``id``; the result is
    inner-joined with ratings on ``imdb_id`` and de-duplicated by ``id``.
    """
    print("Loading Movies Metadata")
    movies_metadata = load_movies_metadata()
    print("Loading Credits")
    credits = load_credits()
    print("Merging")
    movies_data = pd.merge(movies_metadata, credits, on="id", how="inner")
    print("Loading ratings")
    ratings = load_ratings()
    movies_data = pd.merge(movies_data, ratings, on="imdb_id", how="inner")
    movies_data = movies_data.drop_duplicates(subset=['id'])
    return movies_data


def format_genres(df: pd.DataFrame) -> pd.Series:
    """Return ``df["genres"]`` as lists of genre names.

    Each cell is parsed with ``ast.literal_eval`` and is expected to yield a
    list of dicts with a ``name`` key; entries named ``"nan"`` are skipped.
    """
    genres_dict = df["genres"].apply(ast.literal_eval)
    return genres_dict.apply(
        lambda genre_list: [
            genre["name"] for genre in genre_list if genre["name"] != "nan"
        ]
    )


def read_or_generate_movies_data() -> pd.DataFrame:
    """Load the cached movies dataset, generating and saving it if missing.

    NOTE: when the cache file is read back, list-valued columns (``genres``,
    ``cast``, ``director``) come back as their string representation, whereas
    a freshly generated frame holds real lists.
    """
    if os.path.exists(MOVIES_DATA_FILE_LOCATION):
        movies_data = pd.read_csv(MOVIES_DATA_FILE_LOCATION)
        print("Movies Database retrieved successfully.")
    else:
        movies_data = generate_movies_data()
        movies_data.to_csv(MOVIES_DATA_FILE_LOCATION, index=False)
        print("Movies Database generated and saved successfully.")
    return movies_data


def format_list_of_strings(register) -> str:
    """Render a list of strings (or its string representation) as ``"a, b, c"``.

    Args:
        register: A list/tuple of names, or the ``str()`` of such a list (as
            found in a CSV cell).

    Returns:
        The items joined with ``", "``.
    """
    text = str(register)
    items = register
    if isinstance(register, str):
        try:
            items = ast.literal_eval(register)
        except (ValueError, SyntaxError):
            items = None
    if isinstance(items, (list, tuple)):
        # Joining the real items keeps names containing quotes intact, which
        # slicing the repr does not (repr switches to double quotes for them).
        return ", ".join(str(item) for item in items)
    # Not a list: keep the legacy behaviour of stripping the outer "['" / "']".
    return text.replace("', '", ", ")[2:-2]


if __name__ == "__main__":
    movies_data = read_or_generate_movies_data()
    print("Data info: ")
    # DataFrame.info() prints its report itself and returns None.
    movies_data.info()
    print("\n\n----------\n\n")
    print("Data description: ")
    print(movies_data.describe())
    print("\n\n----------\n\n")
    print("Data preview: ")
    print(movies_data)
    # print("\n\n----------\n\n")
    # print("The Show Rating: ")
    # print(get_imdb_rating_by_id("0114435"))
    # print("\n\n----------\n\n")
    # print("Movies metadata. ID 56088: ")
    # print(movies_metadata[movies_metadata["id"] == 56088])
    # print(movies_metadata[movies_metadata["id"] == 56088]["imdb_id"])