Spaces:

Penguindrum920
/

aniverse

Sleeping

App Files Files Community

aniverse / data /data_loader.py

Penguindrum920

Upload 57 files

59eb043 verified 14 days ago

raw

history blame contribute delete

7.4 kB

	"""Load and process anime dataset"""
	import pandas as pd
	from pathlib import Path
	from typing import Generator
	import sys
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from config import DATASET_PATH
	from data.anime_schema import Anime, parse_list_field


	def load_anime_dataset(limit: int = None) -> pd.DataFrame:
	    """Read the anime CSV at ``DATASET_PATH`` and rename its columns to schema names.

	    Args:
	        limit: Forwarded to ``pandas.read_csv`` as ``nrows``; ``None`` reads
	            every row.

	    Returns:
	        The loaded DataFrame with the source column names replaced by the
	        names the rest of this module reads (e.g. ``id`` -> ``mal_id``).
	    """
	    print(f"Loading dataset from {DATASET_PATH}...")

	    # Source CSV header -> field name used when building Anime models.
	    schema_names = {
	        "id": "mal_id",
	        "mean": "score",
	        "num_scoring_users": "scored_by",
	        "num_favorites": "favorites",
	        "main_picture_medium": "image_url",
	        "alternative_titles_en": "title_english",
	        "alternative_titles_ja": "title_japanese",
	    }
	    frame = pd.read_csv(DATASET_PATH, nrows=limit).rename(columns=schema_names)

	    print(f"Loaded {len(frame)} anime entries")
	    return frame


	def parse_anime_row(row: pd.Series) -> Anime:
	    """Convert one DataFrame row into an ``Anime`` model.

	    A value counts as missing when its column is absent from the row or the
	    cell is NaN/None. Missing optional fields become ``None``; a missing
	    title, media type or status falls back to its placeholder string
	    instead of being stringified to ``"nan"``.

	    Args:
	        row: A row using the renamed column names produced by
	            ``load_anime_dataset``.

	    Returns:
	        The ``Anime`` instance built from the row.

	    Raises:
	        KeyError: If the row has no ``mal_id`` entry.
	        ValueError, TypeError: If a numeric cell (including a NaN
	            ``mal_id``) cannot be converted.
	    """

	    def optional(key, cast=None):
	        """Return the cell under *key*, converted by *cast*, or None if missing."""
	        value = row.get(key)
	        if pd.isna(value):
	            return None
	        return value if cast is None else cast(value)

	    def text_or(key, placeholder):
	        """Return the cell under *key* as ``str``, or *placeholder* if missing."""
	        value = row.get(key)
	        # Series.get only applies its default when the key is absent, so a
	        # NaN cell has to be checked explicitly.
	        return placeholder if pd.isna(value) else str(value)

	    # An episode count of 0 is treated the same as "not known".
	    raw_episodes = row.get("num_episodes")
	    if pd.notna(raw_episodes) and raw_episodes != 0:
	        episodes = int(raw_episodes)
	    else:
	        episodes = None

	    return Anime(
	        mal_id=int(row["mal_id"]),
	        title=text_or("title", "Unknown"),
	        title_english=optional("title_english"),
	        title_japanese=optional("title_japanese"),
	        media_type=text_or("media_type", "unknown"),
	        episodes=episodes,
	        status=text_or("status", "unknown"),
	        score=optional("score", float),
	        scored_by=optional("scored_by", int),
	        rank=optional("rank", int),
	        popularity=optional("popularity", int),
	        favorites=optional("favorites", int),
	        synopsis=optional("synopsis", str),
	        # NOTE(review): NaN cells are passed through unchanged here; how
	        # parse_list_field treats them is not visible from this module.
	        genres=parse_list_field(row.get("genres", "[]")),
	        studios=parse_list_field(row.get("studios", "[]")),
	        source=optional("source", str),
	        rating=optional("rating", str),
	        image_url=optional("image_url", str),
	        start_date=optional("start_date", str),
	        end_date=optional("end_date", str),
	    )


	def iter_anime(df: pd.DataFrame) -> Generator[Anime, None, None]:
	    """Yield an ``Anime`` model for every row of *df* that parses cleanly.

	    Rows whose conversion raises are reported on stdout and skipped, so one
	    bad record does not stop the iteration.
	    """
	    for _label, record in df.iterrows():
	        try:
	            yield parse_anime_row(record)
	        except Exception as exc:
	            # Best-effort load: report the failing row and move on to the next.
	            print(f"Error parsing row {record.get('mal_id', 'unknown')}: {exc}")


	def create_embedding_text(anime: Anime) -> str:
	    """Build the text that represents *anime* for embedding generation.

	    The result joins these sections with ``" | "``, in this order:
	    the title; the English title when set and different from the title;
	    ``"Genres: ..."`` when genres are present; the synopsis truncated to
	    1000 characters; and ``"Scenes and tropes: ..."`` when
	    ``extract_scene_keywords`` finds any in the truncated synopsis.

	    Args:
	        anime: The entry to describe.

	    Returns:
	        The joined text.
	    """
	    parts = [anime.title]

	    if anime.title_english and anime.title_english != anime.title:
	        parts.append(anime.title_english)

	    if anime.genres:
	        parts.append(f"Genres: {', '.join(anime.genres)}")

	    if anime.synopsis:
	        # Truncate synopsis to prevent overly long embeddings
	        synopsis = anime.synopsis[:1000]
	        parts.append(synopsis)

	        # Extract scene keywords for better scene-based search
	        scene_keywords = extract_scene_keywords(synopsis, anime.genres or [])
	        if scene_keywords:
	            parts.append(f"Scenes and tropes: {', '.join(scene_keywords)}")

	    # A plain pipe separator: "\|" is not a valid string escape and would
	    # put a literal backslash into every embedding text.
	    return " | ".join(parts)


	# Scene/trope detection patterns: each label maps to the lowercase substrings
	# whose presence in a lowercased synopsis signals that scene or trope.
	SCENE_PATTERNS = {
	    # Romantic scenes
	    "confession": ["confess", "confession", "i love you", "feelings for", "admit feelings"],
	    "rooftop scene": ["rooftop", "on the roof", "school rooftop"],
	    "beach episode": ["beach", "swimsuit", "ocean", "summer vacation"],
	    "festival date": ["festival", "fireworks", "yukata", "summer festival"],
	    "accidental kiss": ["accidental", "lips touched", "fell on"],

	    # Action scenes
	    "training arc": ["training", "train harder", "become stronger", "special training"],
	    "tournament arc": ["tournament", "competition", "championship", "finals"],
	    "final battle": ["final battle", "last fight", "ultimate showdown", "final boss"],
	    "power awakening": ["awakens", "hidden power", "true power", "unleash"],
	    "sacrifice": ["sacrifice", "gave their life", "protect everyone", "died saving"],

	    # Emotional scenes
	    "tearful goodbye": ["goodbye", "farewell", "parting", "separation"],
	    "death scene": ["death", "died", "killed", "passed away", "funeral"],
	    "reunion": ["reunite", "reunion", "meet again", "found each other"],
	    "flashback": ["flashback", "memories", "past", "childhood"],
	    "redemption arc": ["redemption", "atone", "make amends", "change their ways"],

	    # Character tropes
	    "overpowered protagonist": ["overpowered", "strongest", "unbeatable", "one punch", "no match"],
	    "hidden identity": ["secret identity", "hiding", "disguise", "true self"],
	    "underdog story": ["underdog", "weakest", "looked down upon", "prove them wrong"],
	    "transfer student": ["transfer student", "new student", "just arrived"],
	    "chosen one": ["chosen", "prophecy", "destined", "fate"],

	    # Setting/atmosphere
	    "post-apocalyptic": ["apocalypse", "post-apocalyptic", "destroyed world", "ruins"],
	    "isekai": ["another world", "transported", "reincarnated", "summoned to"],
	    "time loop": ["time loop", "repeating", "stuck in time", "groundhog"],
	    "school setting": ["high school", "academy", "school", "classroom"],
	    "dystopian": ["dystopia", "oppressive", "government control", "rebellion"],
	}

	# Generic tropes added for each listed genre, independent of synopsis content.
	_GENRE_TROPES = {
	    "Romance": ["love triangle", "slow burn romance"],
	    "Action": ["battle scenes", "fight choreography"],
	    "Comedy": ["comedic moments", "slapstick"],
	    "Drama": ["emotional moments", "character development"],
	    "Horror": ["scary scenes", "tension building"],
	    "Sports": ["match scenes", "team dynamics"],
	    "Music": ["performance scenes", "concert"],
	}


	def extract_scene_keywords(synopsis: str, genres: list[str]) -> list[str]:
	    """Extract scene/trope keywords from a synopsis for better search.

	    A scene label from ``SCENE_PATTERNS`` is detected when any of its
	    patterns occurs as a substring of the lowercased synopsis. Genre-based
	    tropes are then appended for every genre that has an entry.

	    Args:
	        synopsis: Synopsis text; an empty or ``None`` value yields ``[]``
	            regardless of *genres*.
	        genres: Genre names, matched case-sensitively; ``None`` is treated
	            as no genres.

	    Returns:
	        At most 10 unique keywords in a stable order: detected scenes in
	        ``SCENE_PATTERNS`` order first, then genre tropes in the order the
	        genres were given.
	    """
	    if not synopsis:
	        return []

	    synopsis_lower = synopsis.lower()
	    detected = [
	        scene_name
	        for scene_name, patterns in SCENE_PATTERNS.items()
	        if any(pattern in synopsis_lower for pattern in patterns)
	    ]

	    # Add genre-based common tropes
	    for genre in genres or ():
	        detected.extend(_GENRE_TROPES.get(genre, ()))

	    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
	    # make both the order and the 10 surviving keywords vary between runs.
	    return list(dict.fromkeys(detected))[:10]  # Limit to 10 keywords


	if __name__ == "__main__":
	    # Smoke test: load a small sample and show each entry with the start of
	    # its embedding text.
	    sample = load_anime_dataset(limit=10)
	    for entry in iter_anime(sample):
	        print(f"{entry.mal_id}: {entry.title} ({entry.score}) - {entry.genres}")
	        preview = create_embedding_text(entry)[:150]
	        print(f" Embedding text: {preview}...")
	        print()