Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import os | |
| import ast | |
# Folder that holds every input file and the cached, merged dataset.
# The trailing slash is required: file names are appended to it verbatim.
DATA_FOLDER_LOCATION = "course_project/data/"

# Raw input files.
MOVIES_METADATA_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}movies_metadata.csv"
CREDITS_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}credits.csv"
RATINGS_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}title.ratings.tsv"

# Merged dataset; used as an on-disk cache by read_or_generate_movies_data.
MOVIES_DATA_FILE_LOCATION = f"{DATA_FOLDER_LOCATION}javiers_movies_data.csv"
def turn_numeric_or_warn(df, column_name):
    """Convert ``df[column_name]`` to a numeric dtype in place, or warn.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to convert.

    Returns:
        None. On success the column is replaced by its numeric version.
        If ``pd.to_numeric`` raises ``ValueError`` the column is left
        untouched and the rows that could not be parsed are printed.
    """
    try:
        converted = pd.to_numeric(df[column_name])
    except ValueError:
        # Catch the error and show the offending records.
        coerced = pd.to_numeric(df[column_name], errors='coerce')
        # A value that was already missing is not a parsing failure, so only
        # rows that held a value and still failed to convert are reported.
        unparseable = coerced.isnull() & df[column_name].notnull()
        problematic_datapoints = df[unparseable]
        print("Problematic Datapoints:")
        print(problematic_datapoints)
    else:
        # Assign the whole column rather than using ``df.loc[:, col] = ...``:
        # the latter writes into the existing (object) column and can leave
        # the dtype unchanged, so the column would not actually be numeric.
        df[column_name] = converted
def turn_boolean_or_warn(df, column_name):
    """Convert ``df[column_name]`` to booleans in place, or warn.

    Accepted values are the strings ``'True'`` / ``'False'`` and the
    booleans ``True`` / ``False``.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to convert.

    Returns:
        None. If every value is accepted the column is replaced by a boolean
        column. Otherwise the column is left untouched and the rows holding
        unexpected values are printed.
    """
    values = df[column_name]
    recognised = values.isin(['True', 'False', True, False])

    if not recognised.all():
        # Show the offending records instead of converting.
        problematic_datapoints = df[~recognised]
        print("Problematic Datapoints:")
        print(problematic_datapoints)
        return

    # ``astype(bool)`` must not be used here: it applies Python truthiness,
    # so the non-empty string 'False' would become True. Compare against the
    # truthy spellings explicitly instead.
    df[column_name] = values.isin(['True', True])
def get_genres_set(df):
    """Return the distinct genre names found in ``df["genres"]``.

    Each cell is stringified and parsed back with ``ast.literal_eval``, so
    both real lists and their string representation are accepted.

    Args:
        df: DataFrame with a ``genres`` column.

    Returns:
        The unique genre names as strings, in order of first appearance
        (the result of ``Series.unique()``, not a Python ``set``).
    """
    # De-duplicate the raw cells first so each combination is parsed once.
    distinct_cells = df["genres"].astype(str).unique()
    parsed_lists = pd.Series(distinct_cells).apply(ast.literal_eval)

    # One row per genre; empty lists explode to NaN and are discarded.
    single_genres = parsed_lists.explode().dropna()
    return single_genres.astype("str").unique()
def prepare_movies_metadata(movies_metadata):
    """Select, clean and type the movie metadata columns.

    Args:
        movies_metadata: Raw metadata DataFrame containing at least the
            columns listed in ``required_columns`` below.

    Returns:
        A DataFrame restricted to the required columns, without rows that
        have a missing value in any of them, with ``popularity``, ``budget``
        and ``id`` passed through ``turn_numeric_or_warn``, ``adult`` through
        ``turn_boolean_or_warn`` and ``genres`` replaced by the output of
        ``format_genres``.
    """
    required_columns = [
        "adult", "budget", "genres", "id", "imdb_id", "original_title",
        "overview", "popularity", "poster_path", "revenue", "runtime", "title",
    ]

    # Discard records that lack any required field.
    movies_metadata = movies_metadata[required_columns].dropna()

    for numeric_column in ("popularity", "budget", "id"):
        turn_numeric_or_warn(movies_metadata, numeric_column)
    turn_boolean_or_warn(movies_metadata, "adult")

    movies_metadata.loc[:, "genres"] = format_genres(movies_metadata)
    return movies_metadata
def load_movies_metadata():
    """Read the movies metadata CSV and return it prepared.

    Returns:
        The result of ``prepare_movies_metadata`` applied to the file at
        ``MOVIES_METADATA_FILE_LOCATION``.
    """
    raw_metadata = pd.read_csv(MOVIES_METADATA_FILE_LOCATION, low_memory=False)
    prepared_metadata = prepare_movies_metadata(raw_metadata)
    return prepared_metadata
def prepare_credits(credits):
    """Reduce raw credits to cast names and director names per movie.

    The ``cast`` and ``crew`` columns are expected to hold the string
    representation of a list of dicts; they are parsed with
    ``ast.literal_eval``.

    Args:
        credits: DataFrame with ``cast`` and ``crew`` columns. It is not
            modified; a new DataFrame is returned.

    Returns:
        A new DataFrame where ``cast`` is a list of actor names, ``director``
        is a list of the names of crew members whose job is "director"
        (case-insensitive), ``crew`` is removed, and rows with missing values
        or without any director are dropped.
    """
    # Drop incomplete rows *before* parsing: ast.literal_eval raises on NaN,
    # so dropping afterwards could never be reached for such rows. dropna()
    # also returns a new frame, which keeps the caller's DataFrame untouched.
    credits = credits.dropna()

    cast_entries = credits["cast"].apply(ast.literal_eval)
    crew_entries = credits["crew"].apply(ast.literal_eval)

    credits = credits.assign(
        # Keep only the cast names.
        cast=cast_entries.apply(lambda cast: [actor["name"] for actor in cast]),
        # Keep only the names of the directors.
        director=crew_entries.apply(
            lambda crew: [
                crewmate["name"]
                for crewmate in crew
                if crewmate["job"].lower() == "director"
            ]
        ),
    )

    # The crew column is not needed once the directors are extracted.
    credits = credits.drop(columns=["crew"])

    # Discard records without a director.
    return credits[credits["director"].apply(len) > 0]
def load_credits():
    """Read the credits CSV and return it prepared.

    Returns:
        The result of ``prepare_credits`` applied to the file at
        ``CREDITS_FILE_LOCATION``.
    """
    raw_credits = pd.read_csv(CREDITS_FILE_LOCATION)
    prepared_credits = prepare_credits(raw_credits)
    return prepared_credits
def prepare_ratings(ratings):
    """Rename the rating columns and drop the vote count.

    Args:
        ratings: DataFrame with ``tconst``, ``averageRating`` and
            ``numVotes`` columns.

    Returns:
        A new DataFrame without ``numVotes``, where ``tconst`` is renamed to
        ``imdb_id`` and ``averageRating`` to ``rating``.
    """
    new_names = {'tconst': 'imdb_id', 'averageRating': 'rating'}
    without_votes = ratings.drop(columns=["numVotes"])
    return without_votes.rename(columns=new_names)
def load_ratings():
    """Read the tab-separated ratings file and return it prepared.

    Returns:
        The result of ``prepare_ratings`` applied to the file at
        ``RATINGS_FILE_LOCATION``.
    """
    raw_ratings = pd.read_csv(RATINGS_FILE_LOCATION, sep='\t')
    prepared_ratings = prepare_ratings(raw_ratings)
    return prepared_ratings
def generate_movies_data():
    """Build the merged movies dataset from the three source files.

    Metadata and credits are inner-joined on ``id``; the result is
    inner-joined with the ratings on ``imdb_id``. Progress messages are
    printed along the way.

    Returns:
        The merged DataFrame with at most one row per ``id``.
    """
    print("Loading Movies Metadata")
    metadata = load_movies_metadata()

    print("Loading Credits")
    credits = load_credits()

    print("Merging")
    merged = metadata.merge(credits, on="id", how="inner")

    print("Loading ratings")
    ratings = load_ratings()
    merged = merged.merge(ratings, on="imdb_id", how="inner")

    # Keep a single row per movie id.
    return merged.drop_duplicates(subset=['id'])
def format_genres(df):
    """Turn the raw ``genres`` column into lists of genre names.

    Each cell is expected to be the string representation of a list of
    dicts with a ``name`` key.

    Args:
        df: DataFrame with a ``genres`` column.

    Returns:
        A Series holding, per row, the list of genre names, skipping any
        genre whose name is the literal string "nan".
    """
    def names_only(genre_list):
        return [genre["name"] for genre in genre_list if genre["name"] != "nan"]

    parsed_genres = df["genres"].apply(ast.literal_eval)
    return parsed_genres.apply(names_only)
def read_or_generate_movies_data():
    """Return the merged movies dataset, using the CSV cache when present.

    If the file at ``MOVIES_DATA_FILE_LOCATION`` exists it is read and
    returned. Otherwise the dataset is generated with
    ``generate_movies_data``, written to that location and returned.

    Returns:
        The movies DataFrame.
    """
    if not os.path.exists(MOVIES_DATA_FILE_LOCATION):
        # No cache yet: build the dataset and store it for the next run.
        generated = generate_movies_data()
        generated.to_csv(MOVIES_DATA_FILE_LOCATION, index=False)
        print("Movies Database generated and saved successfully.")
        return generated

    cached = pd.read_csv(MOVIES_DATA_FILE_LOCATION)
    print("Movies Database retrieved successfully.")
    return cached
def format_list_of_strings(register):
    """Render a list of strings as a comma-separated string.

    Args:
        register: A list/tuple of strings, or the string representation of
            one (e.g. ``"['a', 'b']"``).

    Returns:
        The items joined with ``", "``; an empty list yields ``""``.
    """
    items = register
    if isinstance(register, str):
        try:
            items = ast.literal_eval(register)
        except (ValueError, SyntaxError):
            items = None

    if isinstance(items, (list, tuple)):
        # Join the real items instead of slicing the repr: repr() switches to
        # double quotes for items containing an apostrophe (e.g. "O'Brien"),
        # which the quote-based replace/slice approach does not handle.
        return ", ".join(str(item) for item in items)

    # Anything that is not (the representation of) a list keeps the previous
    # repr-based formatting.
    return (str(register)).replace("', '", ", ")[2:-2]
if __name__ == "__main__":
    # Manual smoke run: load (or build) the dataset and print an overview.
    movies_data = read_or_generate_movies_data()

    print("Data info: ")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    movies_data.info()
    print("\n\n----------\n\n")

    print("Data description: ")
    print(movies_data.describe())
    print("\n\n----------\n\n")

    print("Data preview: ")
    print(movies_data)

    # Disabled debugging snippets. NOTE(review): they reference
    # get_imdb_rating_by_id and movies_metadata, which are not defined in the
    # visible part of this file - confirm before re-enabling.
    # print("\n\n----------\n\n")
    # print("The Show Rating: ")
    # print(get_imdb_rating_by_id("0114435"))
    # print("\n\n----------\n\n")
    # print("Movies metadata. ID 56088: ")
    # print(movies_metadata[movies_metadata["id"] == 56088])
    # print(movies_metadata[movies_metadata["id"] == 56088]["imdb_id"])