File size: 7,402 Bytes
59eb043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""Load and process anime dataset"""
import pandas as pd
from pathlib import Path
from typing import Generator
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from config import DATASET_PATH
from data.anime_schema import Anime, parse_list_field


def load_anime_dataset(limit: int = None) -> pd.DataFrame:
    """Read the anime CSV at DATASET_PATH into a DataFrame.

    Args:
        limit: Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads
            every row.

    Returns:
        The loaded frame with the CSV column names listed below renamed
        to the names used by the rest of this module.
    """
    print(f"Loading dataset from {DATASET_PATH}...")

    # CSV header name -> schema field name.
    schema_names = {
        "id": "mal_id",
        "mean": "score",
        "num_scoring_users": "scored_by",
        "num_favorites": "favorites",
        "main_picture_medium": "image_url",
        "alternative_titles_en": "title_english",
        "alternative_titles_ja": "title_japanese",
    }
    frame = pd.read_csv(DATASET_PATH, nrows=limit).rename(columns=schema_names)

    print(f"Loaded {len(frame)} anime entries")
    return frame


def parse_anime_row(row: pd.Series) -> Anime:
    """Convert one DataFrame row into an ``Anime`` model.

    A cell counts as missing when its column is absent from the row or
    when ``pd.notna`` is false for it (NaN / None). Missing optional
    fields become ``None``; missing ``title``, ``media_type`` and
    ``status`` fall back to their defaults instead of being stringified
    to the literal ``"nan"``.

    Args:
        row: A row of the loaded dataset (columns already renamed).

    Returns:
        The populated ``Anime`` instance.

    Raises:
        KeyError: If the row has no ``mal_id`` column.
        Exception: Whatever ``int``/``float`` conversion or the ``Anime``
            constructor raises for malformed values.
    """

    def present(key):
        # Raw cell value, or None when the column is absent or the cell is NaN.
        value = row.get(key)
        return value if pd.notna(value) else None

    def text(key, default=None):
        # Cell as str; `default` when the cell is missing.
        value = present(key)
        return default if value is None else str(value)

    def whole(key):
        # Cell as int; None when the cell is missing.
        value = present(key)
        return None if value is None else int(value)

    score = present("score")
    episodes = whole("num_episodes")

    return Anime(
        mal_id=int(row["mal_id"]),
        title=text("title", "Unknown"),
        # Alternative titles are passed through without str() coercion.
        title_english=present("title_english"),
        title_japanese=present("title_japanese"),
        media_type=text("media_type", "unknown"),
        # An episode count of 0 is treated the same as a missing count.
        episodes=episodes if episodes != 0 else None,
        status=text("status", "unknown"),
        score=None if score is None else float(score),
        scored_by=whole("scored_by"),
        rank=whole("rank"),
        popularity=whole("popularity"),
        favorites=whole("favorites"),
        synopsis=text("synopsis"),
        genres=parse_list_field(row.get("genres", "[]")),
        studios=parse_list_field(row.get("studios", "[]")),
        source=text("source"),
        rating=text("rating"),
        image_url=text("image_url"),
        start_date=text("start_date"),
        end_date=text("end_date"),
    )


def iter_anime(df: pd.DataFrame) -> Generator[Anime, None, None]:
    """Yield an ``Anime`` model for each row of *df*.

    Rows for which parsing raises are reported on stdout and skipped, so
    one bad record does not stop the iteration.
    """
    for _index, record in df.iterrows():
        try:
            yield parse_anime_row(record)
        except Exception as err:
            # Best effort: log the offending id and move on to the next row.
            print(f"Error parsing row {record.get('mal_id', 'unknown')}: {err}")


def create_embedding_text(anime: Anime) -> str:
    """Build the text that is fed to the embedding model for *anime*.

    The result joins, with ``" | "``: the title, the English title (when
    set and different from the title), a genre line, the synopsis cut to
    1000 characters, and a line of scene/trope keywords derived from that
    cut synopsis. Sections whose source data is empty are left out.
    """
    segments = [anime.title]

    english = anime.title_english
    if english and english != anime.title:
        segments.append(english)

    if anime.genres:
        segments.append("Genres: " + ", ".join(anime.genres))

    if anime.synopsis:
        # Cap the synopsis so a very long one does not dominate the text.
        clipped = anime.synopsis[:1000]
        segments.append(clipped)

        # Keywords are extracted from the capped synopsis, not the full one.
        tropes = extract_scene_keywords(clipped, anime.genres or [])
        if tropes:
            segments.append("Scenes and tropes: " + ", ".join(tropes))

    return " | ".join(segments)


# Scene/trope detection patterns.
# Maps a scene/trope label to lowercase substrings; a label is detected when
# any of its substrings occurs in the lowercased synopsis.
SCENE_PATTERNS = {
    # Romantic scenes
    "confession": ["confess", "confession", "i love you", "feelings for", "admit feelings"],
    "rooftop scene": ["rooftop", "on the roof", "school rooftop"],
    "beach episode": ["beach", "swimsuit", "ocean", "summer vacation"],
    "festival date": ["festival", "fireworks", "yukata", "summer festival"],
    "accidental kiss": ["accidental", "lips touched", "fell on"],
    
    # Action scenes
    "training arc": ["training", "train harder", "become stronger", "special training"],
    "tournament arc": ["tournament", "competition", "championship", "finals"],
    "final battle": ["final battle", "last fight", "ultimate showdown", "final boss"],
    "power awakening": ["awakens", "hidden power", "true power", "unleash"],
    "sacrifice": ["sacrifice", "gave their life", "protect everyone", "died saving"],
    
    # Emotional scenes
    "tearful goodbye": ["goodbye", "farewell", "parting", "separation"],
    "death scene": ["death", "died", "killed", "passed away", "funeral"],
    "reunion": ["reunite", "reunion", "meet again", "found each other"],
    "flashback": ["flashback", "memories", "past", "childhood"],
    "redemption arc": ["redemption", "atone", "make amends", "change their ways"],
    
    # Character tropes
    "overpowered protagonist": ["overpowered", "strongest", "unbeatable", "one punch", "no match"],
    "hidden identity": ["secret identity", "hiding", "disguise", "true self"],
    "underdog story": ["underdog", "weakest", "looked down upon", "prove them wrong"],
    "transfer student": ["transfer student", "new student", "just arrived"],
    "chosen one": ["chosen", "prophecy", "destined", "fate"],
    
    # Setting/atmosphere
    "post-apocalyptic": ["apocalypse", "post-apocalyptic", "destroyed world", "ruins"],
    "isekai": ["another world", "transported", "reincarnated", "summoned to"],
    "time loop": ["time loop", "repeating", "stuck in time", "groundhog"],
    "school setting": ["high school", "academy", "school", "classroom"],
    "dystopian": ["dystopia", "oppressive", "government control", "rebellion"],
}

# Generic tropes added for every matching genre, regardless of synopsis content.
_GENRE_TROPES = {
    "Romance": ["love triangle", "slow burn romance"],
    "Action": ["battle scenes", "fight choreography"],
    "Comedy": ["comedic moments", "slapstick"],
    "Drama": ["emotional moments", "character development"],
    "Horror": ["scary scenes", "tension building"],
    "Sports": ["match scenes", "team dynamics"],
    "Music": ["performance scenes", "concert"],
}


def extract_scene_keywords(synopsis: str, genres: list[str]) -> list[str]:
    """Extract scene/trope keywords from a synopsis for better search.

    Args:
        synopsis: Free-text synopsis; matched case-insensitively by plain
            substring search against ``SCENE_PATTERNS``.
        genres: Genre names; each one found in the genre table contributes
            its generic tropes. ``None`` is treated as no genres.

    Returns:
        At most 10 unique keywords: scene labels first, in
        ``SCENE_PATTERNS`` order, followed by genre tropes in the order
        the genres were given. An empty synopsis yields ``[]`` even when
        genres are supplied.
    """
    if not synopsis:
        return []

    synopsis_lower = synopsis.lower()
    detected = [
        scene_name
        for scene_name, patterns in SCENE_PATTERNS.items()
        if any(pattern in synopsis_lower for pattern in patterns)
    ]

    # Add genre-based common tropes
    for genre in genres or ():
        detected.extend(_GENRE_TROPES.get(genre, ()))

    # De-duplicate while keeping first-seen order. Going through a set here
    # would make both the order and the survivors of the 10-item cut depend
    # on per-process string hash randomisation.
    return list(dict.fromkeys(detected))[:10]  # Limit to 10 keywords


if __name__ == "__main__":
    # Manual smoke test: load the first 10 rows and print each parsed entry
    # together with the first 150 characters of its embedding text.
    sample = load_anime_dataset(limit=10)
    for entry in iter_anime(sample):
        print(f"{entry.mal_id}: {entry.title} ({entry.score}) - {entry.genres}")
        preview = create_embedding_text(entry)[:150]
        print(f"  Embedding text: {preview}...")
        print()