Javier Real
Project Improvement
c3e37b9
import pandas as pd
import os
import ast
# Folder that holds every data file used by this module (trailing slash kept
# so file names can be appended directly).
DATA_FOLDER_LOCATION = "course_project/data/"

# Input files.
MOVIES_METADATA_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}movies_metadata.csv"
CREDITS_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}credits.csv"
RATINGS_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}title.ratings.tsv"

# Merged dataset: read back when it already exists, written otherwise.
MOVIES_DATA_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}javiers_movies_data.csv"
def turn_numeric_or_warn(df, column_name):
    """Convert ``df[column_name]`` to a numeric dtype in place, or report why not.

    Args:
        df: DataFrame that owns the column; it is modified in place.
        column_name: Label of the column to convert.

    Returns:
        None. On success the column is replaced by its numeric version. If any
        value cannot be parsed, the column is left untouched and the offending
        rows are printed.
    """
    try:
        converted = pd.to_numeric(df[column_name])
    except ValueError:
        # Catch the error and show the problematic records. Rows that were
        # already missing are excluded: they are not what made parsing fail.
        coerced = pd.to_numeric(df[column_name], errors="coerce")
        problematic_datapoints = df[coerced.isnull() & df[column_name].notnull()]
        print("Problematic Datapoints:")
        print(problematic_datapoints)
    else:
        # Replace the whole column instead of using ``df.loc[:, column_name] = ...``:
        # recent pandas versions write such an assignment into the existing
        # object-dtype column, which would leave the dtype as ``object``.
        df[column_name] = converted
def turn_boolean_or_warn(df, column_name):
    """Convert ``df[column_name]`` to real booleans in place, or report why not.

    Accepted values are the booleans ``True``/``False`` and the exact strings
    ``'True'``/``'False'``. A plain ``astype(bool)`` is not used because it
    maps every non-empty string, including ``'False'``, to ``True``.

    Args:
        df: DataFrame that owns the column; it is modified in place.
        column_name: Label of the column to convert.

    Returns:
        None. If every value is accepted the column becomes boolean; otherwise
        the column is left untouched and the offending rows are printed.
    """
    # Comparing on the text form treats True and 'True' (and False/'False')
    # alike without relying on the truthiness of strings.
    as_text = df[column_name].astype(str)
    is_boolean_label = as_text.isin(["True", "False"])

    if not is_boolean_label.all():
        # Show the problematic records and leave the column as it was.
        problematic_datapoints = df[~is_boolean_label]
        print("Problematic Datapoints:")
        print(problematic_datapoints)
        return

    df[column_name] = as_text == "True"
def get_genres_set(df):
    """Return the distinct genre names found in ``df["genres"]``.

    Each cell is turned into text and parsed with ``ast.literal_eval``, so the
    column may hold either lists or their string representation. The result is
    the array produced by ``Series.unique()``, in order of first appearance.
    """
    # Parse every distinct cell only once.
    distinct_cells = df["genres"].astype(str).unique()
    parsed_cells = pd.Series(distinct_cells).apply(ast.literal_eval)

    # One row per genre; empty lists explode to NaN and are discarded.
    flattened = parsed_cells.explode().dropna()

    return flattened.astype("str").unique()
def prepare_movies_metadata(movies_metadata):
    """Select the required metadata columns and normalise their contents.

    Rows missing any required field are dropped, ``popularity``, ``budget``
    and ``id`` are passed through ``turn_numeric_or_warn``, ``adult`` through
    ``turn_boolean_or_warn``, and ``genres`` is replaced by the output of
    ``format_genres``.

    Returns:
        The reduced and converted DataFrame.
    """
    required_columns = [
        "adult", "budget", "genres", "id", "imdb_id", "original_title",
        "overview", "popularity", "poster_path", "revenue", "runtime", "title",
    ]

    # Keep only complete records of the required fields.
    complete_rows = movies_metadata[required_columns].dropna()

    for numeric_column in ("popularity", "budget", "id"):
        turn_numeric_or_warn(complete_rows, numeric_column)
    turn_boolean_or_warn(complete_rows, "adult")

    complete_rows.loc[:, "genres"] = format_genres(complete_rows)
    return complete_rows
def load_movies_metadata():
    """Read the movies metadata CSV and return it after preparation."""
    raw_metadata = pd.read_csv(MOVIES_METADATA_FILE_LOCATION, low_memory=False)
    prepared_metadata = prepare_movies_metadata(raw_metadata)
    return prepared_metadata
def prepare_credits(credits):
    """Reduce the credits table to cast names and director names.

    Args:
        credits: DataFrame with ``cast`` and ``crew`` columns whose cells are
            the string representation of a list of dicts. It is not modified.

    Returns:
        A new DataFrame without the ``crew`` column, where ``cast`` is a list
        of actor names and ``director`` is a list of the crew members whose
        job is "director" (case-insensitive). Rows with missing values or
        without any director are removed.
    """
    # Drop incomplete rows before parsing: ast.literal_eval raises on a
    # missing value, so dropping afterwards would come too late. The copy
    # keeps the caller's frame free of the columns added below.
    credits = credits.dropna().copy()

    # Leaving only cast names
    credits["cast"] = credits["cast"].apply(ast.literal_eval).apply(
        lambda cast: [actor["name"] for actor in cast]
    )

    # Retrieving director information
    credits["director"] = credits["crew"].apply(ast.literal_eval).apply(
        lambda crew: [
            crewmate["name"]
            for crewmate in crew
            if crewmate["job"].lower() == "director"
        ]
    )

    # Crew column is not needed anymore
    credits = credits.drop(columns=["crew"])

    # Dropping records without a director
    return credits[credits["director"].apply(len) > 0]
def load_credits():
    """Read the credits CSV and return it after preparation."""
    raw_credits = pd.read_csv(CREDITS_FILE_LOCATION)
    prepared_credits = prepare_credits(raw_credits)
    return prepared_credits
def prepare_ratings(ratings):
    """Rename the ratings columns to this module's names and drop ``numVotes``.

    ``tconst`` becomes ``imdb_id`` and ``averageRating`` becomes ``rating``.
    A new DataFrame is returned; the argument is left as it was.
    """
    column_aliases = {'tconst': 'imdb_id', 'averageRating': 'rating'}
    renamed = ratings.rename(columns=column_aliases)
    return renamed.drop(["numVotes"], axis=1)
def load_ratings():
    """Read the tab-separated ratings file and return it after preparation."""
    raw_ratings = pd.read_csv(RATINGS_FILE_LOCATION, delimiter='\t')
    prepared_ratings = prepare_ratings(raw_ratings)
    return prepared_ratings
def generate_movies_data():
    """Build the combined movies table from metadata, credits and ratings.

    Metadata and credits are inner-joined on ``id``, the result is
    inner-joined with the ratings on ``imdb_id``, and rows repeating an
    ``id`` are removed. Progress messages are printed along the way.
    """
    print("Loading Movies Metadata")
    metadata_frame = load_movies_metadata()

    print("Loading Credits")
    credits_frame = load_credits()

    print("Merging")
    with_credits = metadata_frame.merge(credits_frame, on="id", how="inner")

    print("Loading ratings")
    ratings_frame = load_ratings()
    with_ratings = with_credits.merge(ratings_frame, on="imdb_id", how="inner")

    # Keep a single row per movie id.
    return with_ratings.drop_duplicates(subset=['id'])
def format_genres(df):
    """Return ``df["genres"]`` as a Series of lists of genre names.

    Each cell is parsed with ``ast.literal_eval`` into a list of dicts; the
    ``"name"`` of every dict is kept unless it is the literal text ``"nan"``.
    """
    def extract_names(genre_list):
        names = []
        for genre in genre_list:
            if genre["name"] != "nan":
                names.append(genre["name"])
        return names

    parsed_genres = df["genres"].apply(ast.literal_eval)
    return parsed_genres.apply(extract_names)
def read_or_generate_movies_data():
    """Return the movies dataset, building and saving it when it is not on disk.

    If the file at ``MOVIES_DATA_FILE_LOCATION`` exists it is read with
    ``pd.read_csv``; otherwise the dataset is generated, written to that path
    without the index, and returned.
    """
    if not os.path.exists(MOVIES_DATA_FILE_LOCATION):
        generated = generate_movies_data()
        generated.to_csv(MOVIES_DATA_FILE_LOCATION, index=False)
        print("Movies Database generated and saved successfully.")
        return generated

    cached = pd.read_csv(MOVIES_DATA_FILE_LOCATION)
    print("Movies Database retrieved successfully.")
    return cached
def format_list_of_strings(register):
    """Render a list of strings, or its string representation, as ``"a, b, c"``.

    Args:
        register: A list/tuple of items, or the text produced by ``str()`` on
            such a list (for example a list column read back from CSV).

    Returns:
        The items joined by ``", "``. Joining the real items, rather than
        editing the repr text, keeps elements containing an apostrophe intact;
        their repr uses double quotes, which text replacement does not handle.
        Input that is neither a list/tuple nor parseable as one falls back to
        the previous text-based formatting.
    """
    items = register
    if isinstance(register, str):
        try:
            # literal_eval only accepts Python literals, so it is safe on
            # untrusted text.
            items = ast.literal_eval(register)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            items = None

    if isinstance(items, (list, tuple)):
        return ", ".join(str(item) for item in items)

    # Legacy behaviour for anything that is not a (parseable) list.
    return (str(register)).replace("', '", ", ")[2:-2]
if __name__ == "__main__":
    # Script entry point: load (or build) the dataset and print an overview.
    movies_data = read_or_generate_movies_data()

    print("Data info: ")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    movies_data.info()
    print("\n\n----------\n\n")

    print("Data description: ")
    print(movies_data.describe())
    print("\n\n----------\n\n")

    print("Data preview: ")
    print(movies_data)
# print("\n\n----------\n\n")
# print("The Show Rating: ")
# print(get_imdb_rating_by_id("0114435"))
# print("\n\n----------\n\n")
# print("Movies metadata. ID 56088: ")
# print(movies_metadata[movies_metadata["id"] == 56088])
# print(movies_metadata[movies_metadata["id"] == 56088]["imdb_id"])