| import pandas as pd |
| import ast |
| import numpy as np |
|
|
def clean_data(x):
    """
    Convert a stringified list of dicts into a space-separated string of names.

    For example, "[{'name': 'Action'}, {'name': 'Comedy'}]" becomes
    "Action Comedy".

    Args:
        x: Raw cell value. Only ``str`` values are parsed; anything else
            (``None``, ``NaN``, numbers, ...) yields an empty string.

    Returns:
        The ``'name'`` values of the dict entries in the parsed list, joined
        with single spaces. Returns ``""`` when ``x`` is not a string, cannot
        be parsed as a Python literal, or does not evaluate to a list.
        List entries that are not dicts, or whose ``'name'`` is missing or
        not a string, are skipped.
    """
    if not isinstance(x, str):
        return ""

    try:
        # literal_eval only evaluates Python literals; it never executes
        # arbitrary expressions found in the data.
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Any malformed literal is treated as "no names" so that a single
        # bad cell cannot abort a column-wide apply().
        return ""

    if not isinstance(parsed, list):
        return ""

    # Guard both the element type and the value type: the data may contain
    # lists of scalars or entries whose name is null.
    names = [
        item['name']
        for item in parsed
        if isinstance(item, dict) and isinstance(item.get('name'), str)
    ]
    return ' '.join(names)
|
|
def parse_features(data_path):
    """
    Load a movie metadata CSV and build one text "soup" per movie.

    The CSV must provide the columns ``id``, ``title``, ``overview``,
    ``tagline`` and ``genres``. Rows whose ``id`` cannot be converted to a
    number are dropped. Missing text fields are replaced by empty strings and
    ``genres`` is flattened to plain genre names via ``clean_data``.

    Args:
        data_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``id`` (int),
        ``title`` and ``soup``. ``soup`` is
        ``"<title> <title> <tagline> <overview> <genre names>"``.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    print(f"Loading data from {data_path}...")

    # low_memory=False avoids chunk-wise type inference, which can produce
    # mixed dtypes within a single column.
    df = pd.read_csv(data_path, low_memory=False)

    # Non-numeric ids become NaN under errors='coerce'; those rows are
    # removed before the cast to int, which cannot represent NaN.
    numeric_id = pd.to_numeric(df['id'], errors='coerce')
    has_valid_id = numeric_id.notna()
    # Explicit copy so the column assignments below never write to a view.
    df = df.loc[has_valid_id].copy()
    df['id'] = numeric_id.loc[has_valid_id].astype(int)

    for column in ('title', 'overview', 'tagline'):
        df[column] = df[column].fillna('')
    # '[]' keeps missing genres parseable as an empty list literal.
    df['genres'] = df['genres'].fillna('[]')

    print("Parsing genres (this might take a moment)...")
    df['genre_names'] = df['genres'].apply(clean_data)

    # Vectorised concatenation instead of a row-wise apply(axis=1): same
    # text, no per-row Python call, and well-defined on an empty frame.
    # The title appears twice on purpose, so it contributes twice to the
    # resulting text.
    title_text = df['title'].astype(str)
    df['soup'] = (
        title_text + ' ' + title_text + ' '
        + df['tagline'].astype(str) + ' '
        + df['overview'].astype(str) + ' '
        + df['genre_names'].astype(str)
    )

    final_df = df[['id', 'title', 'soup']].reset_index(drop=True)

    print(f"Cleaned data: {len(final_df)} movies ready for embedding.")
    return final_df