Spaces:
Sleeping
Sleeping
| """Load and process anime dataset""" | |
| import pandas as pd | |
| from pathlib import Path | |
| from typing import Generator | |
| import sys | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from config import DATASET_PATH | |
| from data.anime_schema import Anime, parse_list_field | |
def load_anime_dataset(
    limit: "int | None" = None,
    path: "str | Path | None" = None,
) -> pd.DataFrame:
    """Load the anime dataset from CSV and rename columns to our schema names.

    Args:
        limit: Maximum number of data rows to read. ``None`` (the default)
            reads the whole file.
        path: CSV file to read. Defaults to ``DATASET_PATH`` from ``config``
            when omitted, which preserves the original behaviour.

    Returns:
        The loaded DataFrame with the source columns listed in the mapping
        below renamed; all other columns are left untouched.

    Raises:
        Whatever ``pd.read_csv`` raises (for example ``FileNotFoundError``
        when the file does not exist) is propagated unchanged.
    """
    source = DATASET_PATH if path is None else path
    print(f"Loading dataset from {source}...")
    df = pd.read_csv(source, nrows=limit)

    # Source CSV column name -> name used by the rest of this module.
    # ``rename`` silently ignores keys that are not present in the frame.
    column_mapping = {
        "id": "mal_id",
        "mean": "score",
        "num_scoring_users": "scored_by",
        "num_favorites": "favorites",
        "main_picture_medium": "image_url",
        "alternative_titles_en": "title_english",
        "alternative_titles_ja": "title_japanese",
    }
    df = df.rename(columns=column_mapping)

    print(f"Loaded {len(df)} anime entries")
    return df
def _value_or_none(row: pd.Series, key: str, cast=None):
    """Return ``row[key]`` (passed through *cast* if given), or None when the cell is absent or NaN."""
    value = row.get(key)
    if pd.isna(value):
        return None
    return value if cast is None else cast(value)


def _text_or_default(row: pd.Series, key: str, default: str) -> str:
    """Return ``row[key]`` as ``str``, or *default* when the cell is absent or NaN."""
    value = row.get(key)
    return str(value) if pd.notna(value) else default


def parse_anime_row(row: pd.Series) -> Anime:
    """Convert one DataFrame row into an ``Anime`` model.

    Optional cells that are missing or NaN become ``None``. The required
    text fields ``title``, ``media_type`` and ``status`` fall back to
    ``"Unknown"`` / ``"unknown"`` in that case. Previously a NaN cell in one
    of those columns produced the literal string ``"nan"``, because
    ``Series.get`` only applies its default when the key is absent.

    Args:
        row: A row of the frame returned by ``load_anime_dataset`` (columns
            already renamed to the schema names).

    Returns:
        The populated ``Anime`` instance.

    Raises:
        KeyError: If the row has no ``mal_id`` entry.
        ValueError, TypeError: If a numeric cell cannot be converted, e.g. a
            NaN ``mal_id``. Any error raised by the ``Anime`` constructor
            itself also propagates.
    """
    # A zero episode count is treated the same as "unknown".
    raw_episodes = row.get("num_episodes")
    has_episodes = pd.notna(raw_episodes) and raw_episodes != 0

    return Anime(
        mal_id=int(row["mal_id"]),
        title=_text_or_default(row, "title", "Unknown"),
        # The alternative titles are passed through without a str() cast.
        title_english=_value_or_none(row, "title_english"),
        title_japanese=_value_or_none(row, "title_japanese"),
        media_type=_text_or_default(row, "media_type", "unknown"),
        episodes=int(raw_episodes) if has_episodes else None,
        status=_text_or_default(row, "status", "unknown"),
        score=_value_or_none(row, "score", float),
        scored_by=_value_or_none(row, "scored_by", int),
        rank=_value_or_none(row, "rank", int),
        popularity=_value_or_none(row, "popularity", int),
        favorites=_value_or_none(row, "favorites", int),
        synopsis=_value_or_none(row, "synopsis", str),
        # NOTE(review): a NaN cell is handed to parse_list_field as-is; this
        # presumes the helper tolerates NaN - confirm in data/anime_schema.
        genres=parse_list_field(row.get("genres", "[]")),
        studios=parse_list_field(row.get("studios", "[]")),
        source=_value_or_none(row, "source", str),
        rating=_value_or_none(row, "rating", str),
        image_url=_value_or_none(row, "image_url", str),
        start_date=_value_or_none(row, "start_date", str),
        end_date=_value_or_none(row, "end_date", str),
    )
def iter_anime(df: pd.DataFrame) -> Generator[Anime, None, None]:
    """Lazily yield one ``Anime`` model per row of *df*.

    Rows that fail to parse are reported on stdout and skipped, so a single
    bad record does not abort the whole iteration.
    """
    for _index, record in df.iterrows():
        try:
            yield parse_anime_row(record)
        except Exception as exc:
            # Best-effort: report the offending row id and move on.
            print(f"Error parsing row {record.get('mal_id', 'unknown')}: {exc}")
def create_embedding_text(anime: Anime) -> str:
    """Build the " | "-separated text that is fed to the embedding model.

    Segments, in order: the title, the English title (only when it differs
    from the title), a genre list, the synopsis cut to 1000 characters, and
    scene/trope keywords derived from that truncated synopsis.
    """
    segments = [anime.title]

    english_title = anime.title_english
    if english_title and english_title != anime.title:
        segments.append(english_title)

    if anime.genres:
        segments.append("Genres: " + ", ".join(anime.genres))

    if anime.synopsis:
        # Cap the synopsis so the embedding input stays reasonably short.
        excerpt = anime.synopsis[:1000]
        segments.append(excerpt)

        # Scene keywords make scene-based queries match better.
        tropes = extract_scene_keywords(excerpt, anime.genres or [])
        if tropes:
            segments.append("Scenes and tropes: " + ", ".join(tropes))

    return " | ".join(segments)
# Scene/trope detection table.
# Maps a scene or trope label to the lowercase substrings whose presence in a
# lowercased synopsis signals that label.
SCENE_PATTERNS = {
    # --- Romantic scenes ---
    "confession": [
        "confess", "confession", "i love you", "feelings for", "admit feelings",
    ],
    "rooftop scene": ["rooftop", "on the roof", "school rooftop"],
    "beach episode": ["beach", "swimsuit", "ocean", "summer vacation"],
    "festival date": ["festival", "fireworks", "yukata", "summer festival"],
    "accidental kiss": ["accidental", "lips touched", "fell on"],

    # --- Action scenes ---
    "training arc": [
        "training", "train harder", "become stronger", "special training",
    ],
    "tournament arc": ["tournament", "competition", "championship", "finals"],
    "final battle": [
        "final battle", "last fight", "ultimate showdown", "final boss",
    ],
    "power awakening": ["awakens", "hidden power", "true power", "unleash"],
    "sacrifice": [
        "sacrifice", "gave their life", "protect everyone", "died saving",
    ],

    # --- Emotional scenes ---
    "tearful goodbye": ["goodbye", "farewell", "parting", "separation"],
    "death scene": ["death", "died", "killed", "passed away", "funeral"],
    "reunion": ["reunite", "reunion", "meet again", "found each other"],
    "flashback": ["flashback", "memories", "past", "childhood"],
    "redemption arc": [
        "redemption", "atone", "make amends", "change their ways",
    ],

    # --- Character tropes ---
    "overpowered protagonist": [
        "overpowered", "strongest", "unbeatable", "one punch", "no match",
    ],
    "hidden identity": ["secret identity", "hiding", "disguise", "true self"],
    "underdog story": [
        "underdog", "weakest", "looked down upon", "prove them wrong",
    ],
    "transfer student": ["transfer student", "new student", "just arrived"],
    "chosen one": ["chosen", "prophecy", "destined", "fate"],

    # --- Setting / atmosphere ---
    "post-apocalyptic": [
        "apocalypse", "post-apocalyptic", "destroyed world", "ruins",
    ],
    "isekai": ["another world", "transported", "reincarnated", "summoned to"],
    "time loop": ["time loop", "repeating", "stuck in time", "groundhog"],
    "school setting": ["high school", "academy", "school", "classroom"],
    "dystopian": ["dystopia", "oppressive", "government control", "rebellion"],
}
# Generic tropes appended for every matching genre, independent of the synopsis.
_GENRE_TROPES = {
    "Romance": ["love triangle", "slow burn romance"],
    "Action": ["battle scenes", "fight choreography"],
    "Comedy": ["comedic moments", "slapstick"],
    "Drama": ["emotional moments", "character development"],
    "Horror": ["scary scenes", "tension building"],
    "Sports": ["match scenes", "team dynamics"],
    "Music": ["performance scenes", "concert"],
}


def extract_scene_keywords(
    synopsis: str,
    genres: list[str],
    patterns: "dict[str, list[str]] | None" = None,
) -> list[str]:
    """Extract scene/trope keywords from a synopsis for better search.

    A label is detected when any of its substrings occurs in the lowercased
    synopsis. Genre-based tropes are then appended for each known genre.

    The result is de-duplicated in first-seen order. The previous
    ``list(set(...))[:10]`` had hash-randomised ordering, so the ten keywords
    kept (and therefore the embedding text) could differ between runs.

    Args:
        synopsis: Synopsis text; an empty or ``None`` value yields ``[]``
            regardless of *genres*.
        genres: Genre names; names without an entry in the genre table are
            ignored. ``None`` is treated as no genres.
        patterns: Optional label -> substrings table. Defaults to the
            module-level ``SCENE_PATTERNS``.

    Returns:
        At most 10 unique keywords: pattern matches in table order, followed
        by genre tropes in the order the genres were given.
    """
    if not synopsis:
        return []

    table = SCENE_PATTERNS if patterns is None else patterns
    text = synopsis.lower()

    detected = [
        label
        for label, needles in table.items()
        if any(needle in text for needle in needles)
    ]

    for genre in genres or ():
        detected.extend(_GENRE_TROPES.get(genre, ()))

    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(detected))[:10]  # Limit to 10 keywords
if __name__ == "__main__":
    # Manual smoke test: load a handful of rows and show what would be embedded.
    sample = load_anime_dataset(limit=10)
    for entry in iter_anime(sample):
        preview = create_embedding_text(entry)[:150]
        print(f"{entry.mal_id}: {entry.title} ({entry.score}) - {entry.genres}")
        print(f" Embedding text: {preview}...")
        print()