"""Load and process anime dataset""" import pandas as pd from pathlib import Path from typing import Generator import sys sys.path.insert(0, str(Path(__file__).parent.parent)) from config import DATASET_PATH from data.anime_schema import Anime, parse_list_field def load_anime_dataset(limit: int = None) -> pd.DataFrame: """Load anime dataset from CSV""" print(f"Loading dataset from {DATASET_PATH}...") df = pd.read_csv(DATASET_PATH, nrows=limit) # Rename columns to match our schema column_mapping = { "id": "mal_id", "mean": "score", "num_scoring_users": "scored_by", "num_favorites": "favorites", "main_picture_medium": "image_url", "alternative_titles_en": "title_english", "alternative_titles_ja": "title_japanese", } df = df.rename(columns=column_mapping) print(f"Loaded {len(df)} anime entries") return df def parse_anime_row(row: pd.Series) -> Anime: """Convert DataFrame row to Anime model""" return Anime( mal_id=int(row["mal_id"]), title=str(row.get("title", "Unknown")), title_english=row.get("title_english") if pd.notna(row.get("title_english")) else None, title_japanese=row.get("title_japanese") if pd.notna(row.get("title_japanese")) else None, media_type=str(row.get("media_type", "unknown")), episodes=int(row["num_episodes"]) if pd.notna(row.get("num_episodes")) and row.get("num_episodes") != 0 else None, status=str(row.get("status", "unknown")), score=float(row["score"]) if pd.notna(row.get("score")) else None, scored_by=int(row["scored_by"]) if pd.notna(row.get("scored_by")) else None, rank=int(row["rank"]) if pd.notna(row.get("rank")) else None, popularity=int(row["popularity"]) if pd.notna(row.get("popularity")) else None, favorites=int(row["favorites"]) if pd.notna(row.get("favorites")) else None, synopsis=str(row.get("synopsis", "")) if pd.notna(row.get("synopsis")) else None, genres=parse_list_field(row.get("genres", "[]")), studios=parse_list_field(row.get("studios", "[]")), source=str(row.get("source")) if pd.notna(row.get("source")) else None, rating=str(row.get("rating")) if pd.notna(row.get("rating")) else None, image_url=str(row.get("image_url")) if pd.notna(row.get("image_url")) else None, start_date=str(row.get("start_date")) if pd.notna(row.get("start_date")) else None, end_date=str(row.get("end_date")) if pd.notna(row.get("end_date")) else None, ) def iter_anime(df: pd.DataFrame) -> Generator[Anime, None, None]: """Iterate over anime entries as Pydantic models""" for _, row in df.iterrows(): try: yield parse_anime_row(row) except Exception as e: print(f"Error parsing row {row.get('mal_id', 'unknown')}: {e}") continue def create_embedding_text(anime: Anime) -> str: """Create text for embedding generation""" parts = [anime.title] if anime.title_english and anime.title_english != anime.title: parts.append(anime.title_english) if anime.genres: parts.append(f"Genres: {', '.join(anime.genres)}") if anime.synopsis: # Truncate synopsis to prevent overly long embeddings synopsis = anime.synopsis[:1000] parts.append(synopsis) # Extract scene keywords for better scene-based search scene_keywords = extract_scene_keywords(synopsis, anime.genres or []) if scene_keywords: parts.append(f"Scenes and tropes: {', '.join(scene_keywords)}") return " | ".join(parts) # Scene/trope detection patterns SCENE_PATTERNS = { # Romantic scenes "confession": ["confess", "confession", "i love you", "feelings for", "admit feelings"], "rooftop scene": ["rooftop", "on the roof", "school rooftop"], "beach episode": ["beach", "swimsuit", "ocean", "summer vacation"], "festival date": ["festival", "fireworks", "yukata", "summer festival"], "accidental kiss": ["accidental", "lips touched", "fell on"], # Action scenes "training arc": ["training", "train harder", "become stronger", "special training"], "tournament arc": ["tournament", "competition", "championship", "finals"], "final battle": ["final battle", "last fight", "ultimate showdown", "final boss"], "power awakening": ["awakens", "hidden power", "true power", "unleash"], "sacrifice": ["sacrifice", "gave their life", "protect everyone", "died saving"], # Emotional scenes "tearful goodbye": ["goodbye", "farewell", "parting", "separation"], "death scene": ["death", "died", "killed", "passed away", "funeral"], "reunion": ["reunite", "reunion", "meet again", "found each other"], "flashback": ["flashback", "memories", "past", "childhood"], "redemption arc": ["redemption", "atone", "make amends", "change their ways"], # Character tropes "overpowered protagonist": ["overpowered", "strongest", "unbeatable", "one punch", "no match"], "hidden identity": ["secret identity", "hiding", "disguise", "true self"], "underdog story": ["underdog", "weakest", "looked down upon", "prove them wrong"], "transfer student": ["transfer student", "new student", "just arrived"], "chosen one": ["chosen", "prophecy", "destined", "fate"], # Setting/atmosphere "post-apocalyptic": ["apocalypse", "post-apocalyptic", "destroyed world", "ruins"], "isekai": ["another world", "transported", "reincarnated", "summoned to"], "time loop": ["time loop", "repeating", "stuck in time", "groundhog"], "school setting": ["high school", "academy", "school", "classroom"], "dystopian": ["dystopia", "oppressive", "government control", "rebellion"], } def extract_scene_keywords(synopsis: str, genres: list[str]) -> list[str]: """Extract scene/trope keywords from synopsis for better search""" if not synopsis: return [] synopsis_lower = synopsis.lower() detected = [] for scene_name, patterns in SCENE_PATTERNS.items(): for pattern in patterns: if pattern in synopsis_lower: detected.append(scene_name) break # Add genre-based common tropes genre_tropes = { "Romance": ["love triangle", "slow burn romance"], "Action": ["battle scenes", "fight choreography"], "Comedy": ["comedic moments", "slapstick"], "Drama": ["emotional moments", "character development"], "Horror": ["scary scenes", "tension building"], "Sports": ["match scenes", "team dynamics"], "Music": ["performance scenes", "concert"], } for genre in genres: if genre in genre_tropes: detected.extend(genre_tropes[genre]) return list(set(detected))[:10] # Limit to 10 keywords if __name__ == "__main__": # Test loading df = load_anime_dataset(limit=10) for anime in iter_anime(df): print(f"{anime.mal_id}: {anime.title} ({anime.score}) - {anime.genres}") print(f" Embedding text: {create_embedding_text(anime)[:150]}...") print()