Spaces:
Sleeping
Sleeping
Upload 72 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +2 -0
- backend/.dockerignore +21 -0
- backend/.env.example +29 -0
- backend/.python-version +1 -0
- backend/Dockerfile +28 -0
- backend/README.md +8 -0
- backend/__pycache__/config.cpython-313.pyc +0 -0
- backend/__pycache__/main.cpython-313.pyc +0 -0
- backend/__pycache__/security.cpython-313.pyc +0 -0
- backend/aniverse.db +0 -0
- backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/data_level0.bin +3 -0
- backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/header.bin +3 -0
- backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/index_metadata.pickle +3 -0
- backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/length.bin +3 -0
- backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/link_lists.bin +3 -0
- backend/chroma_db/chroma.sqlite3 +3 -0
- backend/config.py +33 -0
- backend/data/__init__.py +1 -0
- backend/data/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/data/__pycache__/anime_schema.cpython-313.pyc +0 -0
- backend/data/__pycache__/data_loader.cpython-313.pyc +0 -0
- backend/data/__pycache__/database.cpython-313.pyc +0 -0
- backend/data/__pycache__/manga_loader.cpython-313.pyc +0 -0
- backend/data/__pycache__/manga_schema.cpython-313.pyc +0 -0
- backend/data/anime_schema.py +68 -0
- backend/data/data_loader.py +170 -0
- backend/data/database.py +95 -0
- backend/data/manga_loader.py +129 -0
- backend/data/manga_schema.py +52 -0
- backend/embeddings/__init__.py +1 -0
- backend/embeddings/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/embeddings/__pycache__/chroma_store.cpython-313.pyc +0 -0
- backend/embeddings/__pycache__/manga_chroma_store.cpython-313.pyc +0 -0
- backend/embeddings/__pycache__/search_utils.cpython-313.pyc +0 -0
- backend/embeddings/build_embeddings.py +66 -0
- backend/embeddings/build_manga_embeddings.py +61 -0
- backend/embeddings/chroma_store.py +162 -0
- backend/embeddings/manga_chroma_store.py +156 -0
- backend/embeddings/search_utils.py +126 -0
- backend/llm/__init__.py +1 -0
- backend/llm/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/llm/__pycache__/groq_client.cpython-313.pyc +0 -0
- backend/llm/groq_client.py +162 -0
- backend/main.py +88 -0
- backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/data_level0.bin +3 -0
- backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/header.bin +3 -0
- backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/index_metadata.pickle +3 -0
- backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/length.bin +3 -0
- backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/link_lists.bin +3 -0
- backend/manga_chroma_db/chroma.sqlite3 +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
backend/chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
backend/manga_chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
backend/.dockerignore
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
*$py.class
|
| 4 |
+
*.so
|
| 5 |
+
.Python
|
| 6 |
+
.env
|
| 7 |
+
.venv/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
*.egg-info/
|
| 11 |
+
.eggs/
|
| 12 |
+
dist/
|
| 13 |
+
build/
|
| 14 |
+
*.egg
|
| 15 |
+
.git
|
| 16 |
+
.gitignore
|
| 17 |
+
.pytest_cache/
|
| 18 |
+
.mypy_cache/
|
| 19 |
+
*.log
|
| 20 |
+
.DS_Store
|
| 21 |
+
Thumbs.db
|
backend/.env.example
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment Configuration Template
|
| 2 |
+
# Copy this to .env and fill in your values
|
| 3 |
+
|
| 4 |
+
# Required: Groq API Key for AI chat
|
| 5 |
+
GROQ_API_KEY=your_groq_api_key_here
|
| 6 |
+
|
| 7 |
+
# Optional: MyAnimeList API (for MAL import feature)
|
| 8 |
+
MAL_CLIENT_ID=your_mal_client_id
|
| 9 |
+
MAL_CLIENT_SECRET=your_mal_client_secret
|
| 10 |
+
|
| 11 |
+
# Server Configuration (defaults work for development)
|
| 12 |
+
HOST=0.0.0.0
|
| 13 |
+
PORT=8000
|
| 14 |
+
|
| 15 |
+
# LLM Model (default: llama-3.1-8b-instant)
|
| 16 |
+
LLM_MODEL=llama-3.1-8b-instant
|
| 17 |
+
|
| 18 |
+
# Database Path (default: ./aniverse.db)
|
| 19 |
+
DATABASE_PATH=./aniverse.db
|
| 20 |
+
|
| 21 |
+
# ChromaDB Paths (defaults work for Docker)
|
| 22 |
+
CHROMA_DB_PATH=./data/chroma_db
|
| 23 |
+
MANGA_CHROMA_DB_PATH=./data/manga_chroma_db
|
| 24 |
+
|
| 25 |
+
# CORS Origins (comma-separated for production)
|
| 26 |
+
CORS_ORIGINS=http://localhost:5500,http://localhost:3000
|
| 27 |
+
|
| 28 |
+
# Production Mode (set to 'true' for production)
|
| 29 |
+
PRODUCTION=false
|
backend/.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.11.0
|
backend/Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AniVerse Backend - Hugging Face Spaces Dockerfile
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
# Set working directory
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install system dependencies
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
gcc \
|
| 10 |
+
g++ \
|
| 11 |
+
curl \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Copy requirements first for caching
|
| 15 |
+
COPY requirements.txt .
|
| 16 |
+
|
| 17 |
+
# Install Python dependencies
|
| 18 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Copy application code
|
| 21 |
+
COPY . .
|
| 22 |
+
|
| 23 |
+
# Set environment variables
|
| 24 |
+
ENV PYTHONUNBUFFERED=1
|
| 25 |
+
ENV PORT=7860
|
| 26 |
+
|
| 27 |
+
# Download data on startup and run server
|
| 28 |
+
CMD python setup_data.py && uvicorn main:app --host 0.0.0.0 --port 7860
|
backend/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: AniVerse API
|
| 3 |
+
emoji: 🎌
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
---
|
backend/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (939 Bytes). View file
|
|
|
backend/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (3.14 kB). View file
|
|
|
backend/__pycache__/security.cpython-313.pyc
ADDED
|
Binary file (5.01 kB). View file
|
|
|
backend/aniverse.db
ADDED
|
Binary file (36.9 kB). View file
|
|
|
backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e0ec6d7f7c3e18b3416b0c55da30d6357a455820b85b3935dd1e630a719e19e
|
| 3 |
+
size 34616104
|
backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3dd664db442013f61804af49e0c50db29dd271dfaec5d2d737b62219d2e8ada
|
| 3 |
+
size 100
|
backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8488b075b81b5e4a24c0d5a07c346f431b467db1d4bc40faf5b7b38bd8955a07
|
| 3 |
+
size 605982
|
backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c37ac8716cd37bc387370b60d0552618ac361de76f5c44e32aa5aa17cb2b5dd
|
| 3 |
+
size 82616
|
backend/chroma_db/24e7682a-e5b5-4ac0-89db-2d9ca2508335/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f3973cb0499e959fdc45c3c1900beca3c75e0ed7e67fa93818a84c64a250268
|
| 3 |
+
size 180876
|
backend/chroma_db/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc52dc177e5a53724f27d6656234d10c623ddc1f014125b2738e0bceb777cc67
|
| 3 |
+
size 134754304
|
backend/config.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""AniVerse Configuration"""
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
# Paths - support both local development and container deployment
|
| 9 |
+
BASE_DIR = Path(__file__).parent.parent
|
| 10 |
+
BACKEND_DIR = Path(__file__).parent
|
| 11 |
+
|
| 12 |
+
# Check if running in container (dataset will be in /app/dataset)
|
| 13 |
+
if (BACKEND_DIR / "dataset").exists():
|
| 14 |
+
DATASET_PATH = BACKEND_DIR / "dataset" / "anime.csv"
|
| 15 |
+
MANGA_DATASET_PATH = BACKEND_DIR / "dataset" / "manga_data" / "MAL-manga.csv"
|
| 16 |
+
else:
|
| 17 |
+
DATASET_PATH = BASE_DIR / "dataset" / "anime.csv"
|
| 18 |
+
MANGA_DATASET_PATH = BASE_DIR / "dataset" / "manga data" / "MAL-manga.csv"
|
| 19 |
+
|
| 20 |
+
# ChromaDB paths - use environment variables with fallbacks
|
| 21 |
+
CHROMA_DB_PATH = Path(os.getenv("CHROMA_DB_PATH", str(BACKEND_DIR / "chroma_db")))
|
| 22 |
+
MANGA_CHROMA_DB_PATH = Path(os.getenv("MANGA_CHROMA_DB_PATH", str(BACKEND_DIR / "manga_chroma_db")))
|
| 23 |
+
|
| 24 |
+
# API Keys
|
| 25 |
+
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
|
| 26 |
+
|
| 27 |
+
# Model Settings
|
| 28 |
+
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
| 29 |
+
LLM_MODEL = "llama-3.1-8b-instant" # Fast, free on Groq
|
| 30 |
+
|
| 31 |
+
# API Settings
|
| 32 |
+
JIKAN_BASE_URL = "https://api.jikan.moe/v4"
|
| 33 |
+
JIKAN_RATE_LIMIT = 3 # requests per second
|
backend/data/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Data module
|
backend/data/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (130 Bytes). View file
|
|
|
backend/data/__pycache__/anime_schema.cpython-313.pyc
ADDED
|
Binary file (3.96 kB). View file
|
|
|
backend/data/__pycache__/data_loader.cpython-313.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
backend/data/__pycache__/database.cpython-313.pyc
ADDED
|
Binary file (4.38 kB). View file
|
|
|
backend/data/__pycache__/manga_loader.cpython-313.pyc
ADDED
|
Binary file (7.02 kB). View file
|
|
|
backend/data/__pycache__/manga_schema.cpython-313.pyc
ADDED
|
Binary file (2.65 kB). View file
|
|
|
backend/data/anime_schema.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Anime Data Models"""
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import ast
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Anime(BaseModel):
|
| 8 |
+
"""Core anime data model"""
|
| 9 |
+
mal_id: int = Field(..., description="MyAnimeList ID")
|
| 10 |
+
title: str
|
| 11 |
+
title_english: Optional[str] = None
|
| 12 |
+
title_japanese: Optional[str] = None
|
| 13 |
+
media_type: str = "tv"
|
| 14 |
+
episodes: Optional[int] = None
|
| 15 |
+
status: str = "unknown"
|
| 16 |
+
score: Optional[float] = None
|
| 17 |
+
scored_by: Optional[int] = None
|
| 18 |
+
rank: Optional[int] = None
|
| 19 |
+
popularity: Optional[int] = None
|
| 20 |
+
favorites: Optional[int] = None
|
| 21 |
+
synopsis: Optional[str] = None
|
| 22 |
+
genres: list[str] = []
|
| 23 |
+
studios: list[str] = []
|
| 24 |
+
source: Optional[str] = None
|
| 25 |
+
rating: Optional[str] = None
|
| 26 |
+
image_url: Optional[str] = None
|
| 27 |
+
start_date: Optional[str] = None
|
| 28 |
+
end_date: Optional[str] = None
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class AnimeSearchResult(BaseModel):
|
| 32 |
+
"""Search result with similarity score"""
|
| 33 |
+
anime: Anime
|
| 34 |
+
similarity: float = Field(..., ge=0, le=1)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class ChatMessage(BaseModel):
|
| 38 |
+
"""Chat message for AI recommendations"""
|
| 39 |
+
role: str = Field(..., pattern="^(user|assistant)$")
|
| 40 |
+
content: str
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class RecommendationRequest(BaseModel):
|
| 44 |
+
"""Request for AI recommendations"""
|
| 45 |
+
query: str
|
| 46 |
+
history: list[ChatMessage] = []
|
| 47 |
+
limit: int = Field(default=10, ge=1, le=50)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class ReviewSummary(BaseModel):
|
| 51 |
+
"""Summarized review data"""
|
| 52 |
+
overall_sentiment: str # positive, negative, mixed
|
| 53 |
+
pros: list[str]
|
| 54 |
+
cons: list[str]
|
| 55 |
+
summary: str
|
| 56 |
+
aspect_scores: dict[str, float] = {} # story, animation, characters, etc.
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def parse_list_field(value: str) -> list[str]:
|
| 60 |
+
"""Parse stringified list from CSV"""
|
| 61 |
+
if not value or value == "[]" or isinstance(value, float):
|
| 62 |
+
return []
|
| 63 |
+
try:
|
| 64 |
+
# Handle Python list string format: "['Action', 'Adventure']"
|
| 65 |
+
return ast.literal_eval(value)
|
| 66 |
+
except (ValueError, SyntaxError):
|
| 67 |
+
# Handle comma-separated format
|
| 68 |
+
return [g.strip() for g in str(value).split(",") if g.strip()]
|
backend/data/data_loader.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load and process anime dataset"""
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Generator
|
| 5 |
+
import sys
|
| 6 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 7 |
+
|
| 8 |
+
from config import DATASET_PATH
|
| 9 |
+
from data.anime_schema import Anime, parse_list_field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def load_anime_dataset(limit: int = None) -> pd.DataFrame:
|
| 13 |
+
"""Load anime dataset from CSV"""
|
| 14 |
+
print(f"Loading dataset from {DATASET_PATH}...")
|
| 15 |
+
|
| 16 |
+
df = pd.read_csv(DATASET_PATH, nrows=limit)
|
| 17 |
+
|
| 18 |
+
# Rename columns to match our schema
|
| 19 |
+
column_mapping = {
|
| 20 |
+
"id": "mal_id",
|
| 21 |
+
"mean": "score",
|
| 22 |
+
"num_scoring_users": "scored_by",
|
| 23 |
+
"num_favorites": "favorites",
|
| 24 |
+
"main_picture_medium": "image_url",
|
| 25 |
+
"alternative_titles_en": "title_english",
|
| 26 |
+
"alternative_titles_ja": "title_japanese",
|
| 27 |
+
}
|
| 28 |
+
df = df.rename(columns=column_mapping)
|
| 29 |
+
|
| 30 |
+
print(f"Loaded {len(df)} anime entries")
|
| 31 |
+
return df
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def parse_anime_row(row: pd.Series) -> Anime:
|
| 35 |
+
"""Convert DataFrame row to Anime model"""
|
| 36 |
+
return Anime(
|
| 37 |
+
mal_id=int(row["mal_id"]),
|
| 38 |
+
title=str(row.get("title", "Unknown")),
|
| 39 |
+
title_english=row.get("title_english") if pd.notna(row.get("title_english")) else None,
|
| 40 |
+
title_japanese=row.get("title_japanese") if pd.notna(row.get("title_japanese")) else None,
|
| 41 |
+
media_type=str(row.get("media_type", "unknown")),
|
| 42 |
+
episodes=int(row["num_episodes"]) if pd.notna(row.get("num_episodes")) and row.get("num_episodes") != 0 else None,
|
| 43 |
+
status=str(row.get("status", "unknown")),
|
| 44 |
+
score=float(row["score"]) if pd.notna(row.get("score")) else None,
|
| 45 |
+
scored_by=int(row["scored_by"]) if pd.notna(row.get("scored_by")) else None,
|
| 46 |
+
rank=int(row["rank"]) if pd.notna(row.get("rank")) else None,
|
| 47 |
+
popularity=int(row["popularity"]) if pd.notna(row.get("popularity")) else None,
|
| 48 |
+
favorites=int(row["favorites"]) if pd.notna(row.get("favorites")) else None,
|
| 49 |
+
synopsis=str(row.get("synopsis", "")) if pd.notna(row.get("synopsis")) else None,
|
| 50 |
+
genres=parse_list_field(row.get("genres", "[]")),
|
| 51 |
+
studios=parse_list_field(row.get("studios", "[]")),
|
| 52 |
+
source=str(row.get("source")) if pd.notna(row.get("source")) else None,
|
| 53 |
+
rating=str(row.get("rating")) if pd.notna(row.get("rating")) else None,
|
| 54 |
+
image_url=str(row.get("image_url")) if pd.notna(row.get("image_url")) else None,
|
| 55 |
+
start_date=str(row.get("start_date")) if pd.notna(row.get("start_date")) else None,
|
| 56 |
+
end_date=str(row.get("end_date")) if pd.notna(row.get("end_date")) else None,
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def iter_anime(df: pd.DataFrame) -> Generator[Anime, None, None]:
|
| 61 |
+
"""Iterate over anime entries as Pydantic models"""
|
| 62 |
+
for _, row in df.iterrows():
|
| 63 |
+
try:
|
| 64 |
+
yield parse_anime_row(row)
|
| 65 |
+
except Exception as e:
|
| 66 |
+
print(f"Error parsing row {row.get('mal_id', 'unknown')}: {e}")
|
| 67 |
+
continue
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def create_embedding_text(anime: Anime) -> str:
|
| 71 |
+
"""Create text for embedding generation"""
|
| 72 |
+
parts = [anime.title]
|
| 73 |
+
|
| 74 |
+
if anime.title_english and anime.title_english != anime.title:
|
| 75 |
+
parts.append(anime.title_english)
|
| 76 |
+
|
| 77 |
+
if anime.genres:
|
| 78 |
+
parts.append(f"Genres: {', '.join(anime.genres)}")
|
| 79 |
+
|
| 80 |
+
if anime.synopsis:
|
| 81 |
+
# Truncate synopsis to prevent overly long embeddings
|
| 82 |
+
synopsis = anime.synopsis[:1000]
|
| 83 |
+
parts.append(synopsis)
|
| 84 |
+
|
| 85 |
+
# Extract scene keywords for better scene-based search
|
| 86 |
+
scene_keywords = extract_scene_keywords(synopsis, anime.genres or [])
|
| 87 |
+
if scene_keywords:
|
| 88 |
+
parts.append(f"Scenes and tropes: {', '.join(scene_keywords)}")
|
| 89 |
+
|
| 90 |
+
return " | ".join(parts)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# Scene/trope detection patterns
|
| 94 |
+
SCENE_PATTERNS = {
|
| 95 |
+
# Romantic scenes
|
| 96 |
+
"confession": ["confess", "confession", "i love you", "feelings for", "admit feelings"],
|
| 97 |
+
"rooftop scene": ["rooftop", "on the roof", "school rooftop"],
|
| 98 |
+
"beach episode": ["beach", "swimsuit", "ocean", "summer vacation"],
|
| 99 |
+
"festival date": ["festival", "fireworks", "yukata", "summer festival"],
|
| 100 |
+
"accidental kiss": ["accidental", "lips touched", "fell on"],
|
| 101 |
+
|
| 102 |
+
# Action scenes
|
| 103 |
+
"training arc": ["training", "train harder", "become stronger", "special training"],
|
| 104 |
+
"tournament arc": ["tournament", "competition", "championship", "finals"],
|
| 105 |
+
"final battle": ["final battle", "last fight", "ultimate showdown", "final boss"],
|
| 106 |
+
"power awakening": ["awakens", "hidden power", "true power", "unleash"],
|
| 107 |
+
"sacrifice": ["sacrifice", "gave their life", "protect everyone", "died saving"],
|
| 108 |
+
|
| 109 |
+
# Emotional scenes
|
| 110 |
+
"tearful goodbye": ["goodbye", "farewell", "parting", "separation"],
|
| 111 |
+
"death scene": ["death", "died", "killed", "passed away", "funeral"],
|
| 112 |
+
"reunion": ["reunite", "reunion", "meet again", "found each other"],
|
| 113 |
+
"flashback": ["flashback", "memories", "past", "childhood"],
|
| 114 |
+
"redemption arc": ["redemption", "atone", "make amends", "change their ways"],
|
| 115 |
+
|
| 116 |
+
# Character tropes
|
| 117 |
+
"overpowered protagonist": ["overpowered", "strongest", "unbeatable", "one punch", "no match"],
|
| 118 |
+
"hidden identity": ["secret identity", "hiding", "disguise", "true self"],
|
| 119 |
+
"underdog story": ["underdog", "weakest", "looked down upon", "prove them wrong"],
|
| 120 |
+
"transfer student": ["transfer student", "new student", "just arrived"],
|
| 121 |
+
"chosen one": ["chosen", "prophecy", "destined", "fate"],
|
| 122 |
+
|
| 123 |
+
# Setting/atmosphere
|
| 124 |
+
"post-apocalyptic": ["apocalypse", "post-apocalyptic", "destroyed world", "ruins"],
|
| 125 |
+
"isekai": ["another world", "transported", "reincarnated", "summoned to"],
|
| 126 |
+
"time loop": ["time loop", "repeating", "stuck in time", "groundhog"],
|
| 127 |
+
"school setting": ["high school", "academy", "school", "classroom"],
|
| 128 |
+
"dystopian": ["dystopia", "oppressive", "government control", "rebellion"],
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def extract_scene_keywords(synopsis: str, genres: list[str]) -> list[str]:
|
| 133 |
+
"""Extract scene/trope keywords from synopsis for better search"""
|
| 134 |
+
if not synopsis:
|
| 135 |
+
return []
|
| 136 |
+
|
| 137 |
+
synopsis_lower = synopsis.lower()
|
| 138 |
+
detected = []
|
| 139 |
+
|
| 140 |
+
for scene_name, patterns in SCENE_PATTERNS.items():
|
| 141 |
+
for pattern in patterns:
|
| 142 |
+
if pattern in synopsis_lower:
|
| 143 |
+
detected.append(scene_name)
|
| 144 |
+
break
|
| 145 |
+
|
| 146 |
+
# Add genre-based common tropes
|
| 147 |
+
genre_tropes = {
|
| 148 |
+
"Romance": ["love triangle", "slow burn romance"],
|
| 149 |
+
"Action": ["battle scenes", "fight choreography"],
|
| 150 |
+
"Comedy": ["comedic moments", "slapstick"],
|
| 151 |
+
"Drama": ["emotional moments", "character development"],
|
| 152 |
+
"Horror": ["scary scenes", "tension building"],
|
| 153 |
+
"Sports": ["match scenes", "team dynamics"],
|
| 154 |
+
"Music": ["performance scenes", "concert"],
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
for genre in genres:
|
| 158 |
+
if genre in genre_tropes:
|
| 159 |
+
detected.extend(genre_tropes[genre])
|
| 160 |
+
|
| 161 |
+
return list(set(detected))[:10] # Limit to 10 keywords
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
if __name__ == "__main__":
|
| 165 |
+
# Test loading
|
| 166 |
+
df = load_anime_dataset(limit=10)
|
| 167 |
+
for anime in iter_anime(df):
|
| 168 |
+
print(f"{anime.mal_id}: {anime.title} ({anime.score}) - {anime.genres}")
|
| 169 |
+
print(f" Embedding text: {create_embedding_text(anime)[:150]}...")
|
| 170 |
+
print()
|
backend/data/database.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Database setup and models"""
|
| 2 |
+
import os
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, ForeignKey, Enum as SQLEnum
|
| 5 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 6 |
+
from sqlalchemy.orm import sessionmaker, relationship
|
| 7 |
+
import enum
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Database path
|
| 11 |
+
DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "aniverse.db")
|
| 12 |
+
DATABASE_URL = f"sqlite:///{DB_PATH}"
|
| 13 |
+
|
| 14 |
+
# Create engine
|
| 15 |
+
engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
|
| 16 |
+
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 17 |
+
Base = declarative_base()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AnimeStatus(enum.Enum):
|
| 21 |
+
"""User's status for an anime"""
|
| 22 |
+
watching = "watching"
|
| 23 |
+
completed = "completed"
|
| 24 |
+
planned = "planned"
|
| 25 |
+
dropped = "dropped"
|
| 26 |
+
on_hold = "on_hold"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class User(Base):
|
| 30 |
+
"""User account model"""
|
| 31 |
+
__tablename__ = "users"
|
| 32 |
+
|
| 33 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 34 |
+
email = Column(String, unique=True, index=True, nullable=False)
|
| 35 |
+
username = Column(String, unique=True, index=True, nullable=False)
|
| 36 |
+
password_hash = Column(String, nullable=False)
|
| 37 |
+
created_at = Column(DateTime, default=datetime.utcnow)
|
| 38 |
+
|
| 39 |
+
# Relationships
|
| 40 |
+
anime_list = relationship("UserAnime", back_populates="user")
|
| 41 |
+
manga_list = relationship("UserManga", back_populates="user")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class UserAnime(Base):
|
| 45 |
+
"""User's anime list entry"""
|
| 46 |
+
__tablename__ = "user_anime"
|
| 47 |
+
|
| 48 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 49 |
+
user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
|
| 50 |
+
anime_id = Column(Integer, nullable=False) # MAL ID
|
| 51 |
+
status = Column(SQLEnum(AnimeStatus), default=AnimeStatus.planned)
|
| 52 |
+
rating = Column(Float, nullable=True) # 1-10 scale
|
| 53 |
+
is_favorite = Column(Integer, default=0) # SQLite boolean
|
| 54 |
+
added_at = Column(DateTime, default=datetime.utcnow)
|
| 55 |
+
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
| 56 |
+
|
| 57 |
+
# Relationships
|
| 58 |
+
user = relationship("User", back_populates="anime_list")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class UserManga(Base):
|
| 62 |
+
"""User's manga list entry"""
|
| 63 |
+
__tablename__ = "user_manga"
|
| 64 |
+
|
| 65 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 66 |
+
user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
|
| 67 |
+
manga_id = Column(Integer, nullable=False) # MAL ID
|
| 68 |
+
status = Column(SQLEnum(AnimeStatus), default=AnimeStatus.planned) # Reuse status enum
|
| 69 |
+
rating = Column(Float, nullable=True) # 1-10 scale
|
| 70 |
+
is_favorite = Column(Integer, default=0) # SQLite boolean
|
| 71 |
+
added_at = Column(DateTime, default=datetime.utcnow)
|
| 72 |
+
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
| 73 |
+
|
| 74 |
+
# Relationships
|
| 75 |
+
user = relationship("User", back_populates="manga_list")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def init_db():
|
| 79 |
+
"""Initialize database tables"""
|
| 80 |
+
Base.metadata.create_all(bind=engine)
|
| 81 |
+
print(f"Database initialized at {DB_PATH}")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_db():
|
| 85 |
+
"""Get database session"""
|
| 86 |
+
db = SessionLocal()
|
| 87 |
+
try:
|
| 88 |
+
yield db
|
| 89 |
+
finally:
|
| 90 |
+
db.close()
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
|
| 94 |
+
init_db()
|
| 95 |
+
print("Database tables created successfully!")
|
backend/data/manga_loader.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load and process manga dataset"""
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Generator
|
| 5 |
+
import sys
|
| 6 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 7 |
+
|
| 8 |
+
from config import MANGA_DATASET_PATH
|
| 9 |
+
from data.manga_schema import Manga, parse_list_field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def load_manga_dataset(limit: int = None) -> pd.DataFrame:
|
| 13 |
+
"""Load manga dataset from CSV"""
|
| 14 |
+
print(f"Loading manga dataset from {MANGA_DATASET_PATH}...")
|
| 15 |
+
|
| 16 |
+
df = pd.read_csv(MANGA_DATASET_PATH, nrows=limit)
|
| 17 |
+
|
| 18 |
+
# Clean up column names
|
| 19 |
+
df.columns = df.columns.str.strip()
|
| 20 |
+
|
| 21 |
+
print(f"Loaded {len(df)} manga entries")
|
| 22 |
+
print(f"Columns: {df.columns.tolist()}")
|
| 23 |
+
return df
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def parse_manga_row(row: pd.Series) -> Manga:
|
| 27 |
+
"""Convert DataFrame row to Manga model"""
|
| 28 |
+
# Extract mal_id from URL if available
|
| 29 |
+
mal_id = None
|
| 30 |
+
if pd.notna(row.get("page_url")):
|
| 31 |
+
try:
|
| 32 |
+
# URL format: https://myanimelist.net/manga/ID/title
|
| 33 |
+
url = str(row["page_url"])
|
| 34 |
+
parts = url.split("/manga/")
|
| 35 |
+
if len(parts) > 1:
|
| 36 |
+
mal_id = int(parts[1].split("/")[0])
|
| 37 |
+
except (ValueError, IndexError):
|
| 38 |
+
pass
|
| 39 |
+
|
| 40 |
+
if mal_id is None:
|
| 41 |
+
# Use index or unnamed column
|
| 42 |
+
mal_id = int(row.get("Unnamed: 0", row.name)) if pd.notna(row.get("Unnamed: 0")) else row.name
|
| 43 |
+
|
| 44 |
+
# Parse volumes
|
| 45 |
+
volumes = None
|
| 46 |
+
if pd.notna(row.get("Volumes")):
|
| 47 |
+
try:
|
| 48 |
+
vol_str = str(row["Volumes"]).strip()
|
| 49 |
+
if vol_str.isdigit():
|
| 50 |
+
volumes = int(vol_str)
|
| 51 |
+
except ValueError:
|
| 52 |
+
pass
|
| 53 |
+
|
| 54 |
+
# Parse score
|
| 55 |
+
score = None
|
| 56 |
+
if pd.notna(row.get("Score")):
|
| 57 |
+
try:
|
| 58 |
+
score = float(row["Score"])
|
| 59 |
+
except (ValueError, TypeError):
|
| 60 |
+
pass
|
| 61 |
+
|
| 62 |
+
# Parse members
|
| 63 |
+
members = None
|
| 64 |
+
if pd.notna(row.get("Members")):
|
| 65 |
+
try:
|
| 66 |
+
members = int(str(row["Members"]).replace(",", ""))
|
| 67 |
+
except (ValueError, TypeError):
|
| 68 |
+
pass
|
| 69 |
+
|
| 70 |
+
# Parse rank
|
| 71 |
+
rank = None
|
| 72 |
+
if pd.notna(row.get("Rank")):
|
| 73 |
+
try:
|
| 74 |
+
rank = int(row["Rank"])
|
| 75 |
+
except (ValueError, TypeError):
|
| 76 |
+
pass
|
| 77 |
+
|
| 78 |
+
return Manga(
|
| 79 |
+
mal_id=mal_id,
|
| 80 |
+
title=str(row.get("Title", "Unknown")).strip(),
|
| 81 |
+
media_type=str(row.get("Type", "Manga")).strip().lower() if pd.notna(row.get("Type")) else "manga",
|
| 82 |
+
volumes=volumes,
|
| 83 |
+
score=score,
|
| 84 |
+
rank=rank,
|
| 85 |
+
members=members,
|
| 86 |
+
published=str(row.get("Published")) if pd.notna(row.get("Published")) else None,
|
| 87 |
+
genres=parse_list_field(row.get("Genres", "[]")),
|
| 88 |
+
authors=parse_list_field(row.get("Authors", "[]")),
|
| 89 |
+
image_url=str(row.get("image_url")) if pd.notna(row.get("image_url")) else None,
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def iter_manga(df: pd.DataFrame) -> Generator[Manga, None, None]:
    """Yield validated Manga models for each dataframe row, skipping bad rows."""
    for _, record in df.iterrows():
        try:
            yield parse_manga_row(record)
        except Exception as e:
            # One malformed row must not abort the whole load; report and move on.
            print(f"Error parsing manga row {record.get('Title', 'unknown')}: {e}")
            continue
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def create_manga_embedding_text(manga: Manga) -> str:
    """Build the pipe-delimited text that gets embedded for a manga entry."""
    segments = [manga.title]

    if manga.genres:
        segments.append(f"Genres: {', '.join(manga.genres)}")

    if manga.media_type:
        segments.append(f"Type: {manga.media_type}")

    if manga.authors:
        # Cap at three authors to keep the embedding text compact.
        segments.append(f"Authors: {', '.join(manga.authors[:3])}")

    if manga.synopsis:
        # Truncate long synopses; the first 1000 chars carry the semantic signal.
        segments.append(manga.synopsis[:1000])

    return " | ".join(segments)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
if __name__ == "__main__":
    # Smoke test: load a small sample and print parsed fields for manual inspection.
    df = load_manga_dataset(limit=10)
    for manga in iter_manga(df):
        print(f"{manga.mal_id}: {manga.title} (Score: {manga.score}) - {manga.genres}")
        print(f" Type: {manga.media_type}, Volumes: {manga.volumes}")
        print()
|
backend/data/manga_schema.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manga data schema"""
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import ast
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Manga(BaseModel):
    """Manga entry with MAL-style fields"""
    mal_id: int  # MyAnimeList ID (loader may fall back to the CSV row index)
    title: str
    title_english: Optional[str] = None
    media_type: str = "manga"  # manga, manhwa, manhua, novel, light_novel
    volumes: Optional[int] = None
    chapters: Optional[int] = None
    status: Optional[str] = None  # publishing, finished
    score: Optional[float] = None  # community score, 0-10
    scored_by: Optional[int] = None  # number of users who scored
    rank: Optional[int] = None  # ranking position (lower is better)
    popularity: Optional[int] = None
    members: Optional[int] = None  # users with this entry on their list
    favorites: Optional[int] = None
    synopsis: Optional[str] = None
    genres: list[str] = []  # pydantic copies mutable defaults per-instance
    authors: list[str] = []
    image_url: Optional[str] = None
    published: Optional[str] = None  # raw publication date range string from the CSV
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def parse_list_field(value) -> list[str]:
    """Parse a list-valued CSV cell into a list of strings.

    Accepts values that are already lists, Python-literal list strings
    (e.g. "['Action', 'Drama']"), comma-separated strings, or a plain
    single-value string.  NaN / None / empty inputs yield an empty list.

    Args:
        value: Raw cell value from the CSV (any type).

    Returns:
        List of non-empty strings; [] when nothing usable is present.
    """
    # NaN never equals itself: this catches float('nan') and numpy.nan
    # robustly instead of comparing against the string 'nan'.
    if value is None or (isinstance(value, float) and value != value) or not value:
        return []

    if isinstance(value, list):
        return value

    if isinstance(value, str):
        value = value.strip()
        if value.startswith('['):
            try:
                parsed = ast.literal_eval(value)
                return [str(item) for item in parsed if item]
            except (ValueError, SyntaxError):
                pass  # not a valid literal; fall through to plain-string handling

        # Try comma-separated
        if ',' in value:
            return [v.strip() for v in value.split(',') if v.strip()]

        return [value] if value else []

    # Unsupported scalar type (int, dict, ...): treat as no data.
    return []
|
backend/embeddings/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Embeddings module
|
backend/embeddings/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (136 Bytes). View file
|
|
|
backend/embeddings/__pycache__/chroma_store.cpython-313.pyc
ADDED
|
Binary file (5.84 kB). View file
|
|
|
backend/embeddings/__pycache__/manga_chroma_store.cpython-313.pyc
ADDED
|
Binary file (5.88 kB). View file
|
|
|
backend/embeddings/__pycache__/search_utils.cpython-313.pyc
ADDED
|
Binary file (4.81 kB). View file
|
|
|
backend/embeddings/build_embeddings.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build anime embeddings and populate vector store"""
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 6 |
+
|
| 7 |
+
from data.data_loader import load_anime_dataset, iter_anime, create_embedding_text
|
| 8 |
+
from embeddings.chroma_store import get_vector_store
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_embeddings(limit: int = None, batch_size: int = 100):
    """Embed every anime entry with a usable synopsis and persist to ChromaDB."""
    banner = "=" * 50
    print(banner)
    print("AniVerse Embedding Builder")
    print(banner)

    # Load the dataset (optionally capped for quick runs).
    frame = load_anime_dataset(limit=limit)

    # Open the persistent vector store and report what is already there.
    store = get_vector_store()
    print(f"Existing entries in vector store: {store.get_count()}")

    # Accumulate parallel lists for one batched upsert.
    anime_ids = []
    documents = []
    metas = []

    print("Processing anime entries...")
    for anime in iter_anime(frame):
        # Entries lacking a meaningful synopsis embed poorly; drop them.
        if not anime.synopsis or len(anime.synopsis) < 20:
            continue

        anime_ids.append(anime.mal_id)
        documents.append(create_embedding_text(anime))
        metas.append({
            "title": anime.title,
            "score": anime.score or 0,
            "genres": ", ".join(anime.genres) if anime.genres else "",
            "media_type": anime.media_type,
            "status": anime.status,
            "image_url": anime.image_url or "",
        })

    print(f"Prepared {len(anime_ids)} anime entries for embedding")

    print("Generating embeddings and storing in ChromaDB...")
    store.add_batch(anime_ids, documents, metas, batch_size=batch_size)

    print(banner)
    print(f"Complete! Vector store now has {store.get_count()} entries")
    print(banner)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
    # CLI entry point: --limit caps how many rows are embedded (useful while
    # testing); --batch-size controls the upsert chunk size.
    import argparse

    parser = argparse.ArgumentParser(description="Build anime embeddings")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of entries to process")
    parser.add_argument("--batch-size", type=int, default=100, help="Batch size for embedding generation")

    args = parser.parse_args()
    build_embeddings(limit=args.limit, batch_size=args.batch_size)
|
backend/embeddings/build_manga_embeddings.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build manga embeddings and store in ChromaDB"""
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 5 |
+
|
| 6 |
+
from data.manga_loader import load_manga_dataset, iter_manga, create_manga_embedding_text
|
| 7 |
+
from embeddings.manga_chroma_store import MangaVectorStore
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def build_manga_embeddings(limit: int = None):
    """Embed the manga dataset and persist it to the manga ChromaDB collection."""
    divider = "=" * 50
    print(divider)
    print("Building Manga Embeddings")
    print(divider)

    # Load the dataset (optionally capped for quick runs).
    frame = load_manga_dataset(limit=limit)

    # Open the persistent manga vector store.
    store = MangaVectorStore()

    # Accumulate parallel lists for one batched upsert.
    entry_ids = []
    entry_texts = []
    entry_metas = []

    print("\nProcessing manga entries...")
    for manga in iter_manga(frame):
        entry_ids.append(manga.mal_id)
        entry_texts.append(create_manga_embedding_text(manga))
        entry_metas.append({
            "title": manga.title,
            "media_type": manga.media_type or "manga",
            "score": manga.score or 0,
            "rank": manga.rank or 0,
            "members": manga.members or 0,
            "volumes": manga.volumes or 0,
            "genres": ", ".join(manga.genres) if manga.genres else "",
            "authors": ", ".join(manga.authors[:3]) if manga.authors else "",
            "image_url": manga.image_url or "",
            "published": manga.published or "",
        })

    print(f"\nAdding {len(entry_ids)} manga to vector store...")
    store.add_batch(entry_ids, entry_texts, entry_metas, batch_size=100)

    print(f"\n✓ Successfully indexed {store.get_count()} manga entries!")
    print(divider)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
if __name__ == "__main__":
    # CLI entry point: --limit caps how many rows are embedded (useful while testing).
    import argparse
    parser = argparse.ArgumentParser(description="Build manga embeddings")
    parser.add_argument("--limit", type=int, help="Limit number of entries")
    args = parser.parse_args()

    build_manga_embeddings(limit=args.limit)
|
backend/embeddings/chroma_store.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ChromaDB Vector Store for Anime Similarity Search"""
|
| 2 |
+
import chromadb
|
| 3 |
+
from chromadb.config import Settings
|
| 4 |
+
from typing import Optional
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 9 |
+
from config import CHROMA_DB_PATH, EMBEDDING_MODEL
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AnimeVectorStore:
    """Vector database for anime semantic search.

    Wraps a persistent ChromaDB collection named "anime" (cosine distance);
    embeddings are generated automatically by a SentenceTransformer model.
    """

    def __init__(self, persist_directory: str = None):
        # Fall back to the configured path when no explicit directory is given.
        self.persist_dir = persist_directory or str(CHROMA_DB_PATH)

        try:
            # Initialize ChromaDB client with telemetry disabled
            self.client = chromadb.PersistentClient(
                path=self.persist_dir,
                settings=Settings(
                    anonymized_telemetry=False,
                    allow_reset=True
                )
            )

            # Use sentence-transformers for embeddings (more compatible than onnxruntime)
            from chromadb.utils import embedding_functions
            self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name=EMBEDDING_MODEL
            )

            # Get or create anime collection with embedding function
            self.collection = self.client.get_or_create_collection(
                name="anime",
                metadata={"hnsw:space": "cosine"},
                embedding_function=self.embedding_fn
            )

            print(f"Vector store initialized at {self.persist_dir}")
            print(f"Collection count: {self.collection.count()}")
        except Exception as e:
            # Print the full traceback before re-raising so startup failures are debuggable.
            print(f"ERROR initializing vector store: {e}")
            import traceback
            traceback.print_exc()
            raise

    def add_anime(
        self,
        mal_id: int,
        embedding_text: str,
        metadata: dict
    ) -> None:
        """Add or update anime entry in vector store.

        Args:
            mal_id: MAL ID; stringified to serve as the ChromaDB document id.
            embedding_text: Text the collection embeds for similarity search.
            metadata: Per-entry metadata stored alongside the embedding.
        """
        # Upsert to collection (embeddings auto-generated)
        self.collection.upsert(
            ids=[str(mal_id)],
            documents=[embedding_text],
            metadatas=[metadata]
        )

    def add_batch(
        self,
        ids: list[int],
        texts: list[str],
        metadatas: list[dict],
        batch_size: int = 100
    ) -> None:
        """Add multiple anime entries in batches.

        `ids`, `texts` and `metadatas` must be parallel lists of equal length.
        """
        total = len(ids)
        for i in range(0, total, batch_size):
            batch_ids = [str(id_) for id_ in ids[i:i+batch_size]]
            batch_texts = texts[i:i+batch_size]
            batch_meta = metadatas[i:i+batch_size]

            self.collection.upsert(
                ids=batch_ids,
                documents=batch_texts,
                metadatas=batch_meta
            )

            # Progress indicator; min() keeps the final (partial) batch count accurate.
            print(f" Added {min(i+batch_size, total)}/{total} entries...")

    def search(
        self,
        query: str,
        n_results: int = 10,
        where: Optional[dict] = None
    ) -> list[dict]:
        """Search for similar anime by text query.

        Returns:
            List of dicts with mal_id, metadata, document and similarity,
            where similarity = 1 - cosine distance.
        """
        # Query ChromaDB (embedding auto-generated from query)
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
            where=where,
            include=["metadatas", "documents", "distances"]
        )

        # Format results
        formatted = []
        for i, mal_id in enumerate(results["ids"][0]):
            formatted.append({
                "mal_id": int(mal_id),
                "metadata": results["metadatas"][0][i],
                "document": results["documents"][0][i],
                "similarity": 1 - results["distances"][0][i]  # Convert distance to similarity
            })

        return formatted

    def search_similar(
        self,
        mal_id: int,
        n_results: int = 10
    ) -> list[dict]:
        """Find anime similar to a given anime by MAL ID.

        Returns an empty list when the id is not present in the collection.
        """
        # Get the anime's document
        result = self.collection.get(
            ids=[str(mal_id)],
            include=["documents"]
        )

        if not result["documents"]:
            return []

        # Query with that document
        results = self.collection.query(
            query_texts=result["documents"],
            n_results=n_results + 1,  # +1 to exclude self
            include=["metadatas", "documents", "distances"]
        )

        # Format and exclude self
        formatted = []
        for i, id_ in enumerate(results["ids"][0]):
            if int(id_) == mal_id:
                continue
            formatted.append({
                "mal_id": int(id_),
                "metadata": results["metadatas"][0][i],
                "document": results["documents"][0][i],
                "similarity": 1 - results["distances"][0][i]
            })

        return formatted[:n_results]

    def get_count(self) -> int:
        """Get total number of entries in the collection"""
        return self.collection.count()
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Singleton instance (lazily created so importing the module stays cheap)
_store: Optional[AnimeVectorStore] = None


def get_vector_store() -> AnimeVectorStore:
    """Get or create vector store instance.

    Lazy singleton: the ChromaDB client is opened on first call only.
    """
    global _store
    if _store is None:
        _store = AnimeVectorStore()
    return _store
|
backend/embeddings/manga_chroma_store.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ChromaDB Vector Store for Manga Similarity Search"""
|
| 2 |
+
import chromadb
|
| 3 |
+
from chromadb.config import Settings
|
| 4 |
+
from chromadb.utils import embedding_functions
|
| 5 |
+
from typing import Optional
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 10 |
+
from config import MANGA_CHROMA_DB_PATH, EMBEDDING_MODEL
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class MangaVectorStore:
    """Vector database for manga semantic search.

    Wraps a persistent ChromaDB collection named "manga" (cosine distance);
    embeddings are generated automatically by a SentenceTransformer model.
    """

    def __init__(self, persist_directory: str = None):
        # Fall back to the configured path when no explicit directory is given.
        self.persist_dir = persist_directory or str(MANGA_CHROMA_DB_PATH)

        try:
            # Initialize ChromaDB client
            self.client = chromadb.PersistentClient(
                path=self.persist_dir,
                settings=Settings(
                    anonymized_telemetry=False,
                    allow_reset=True
                )
            )

            # Use sentence-transformers for embeddings (more compatible)
            self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name=EMBEDDING_MODEL
            )

            # Get or create manga collection with embedding function
            self.collection = self.client.get_or_create_collection(
                name="manga",
                metadata={"hnsw:space": "cosine"},
                embedding_function=self.embedding_fn
            )

            print(f"Manga vector store initialized at {self.persist_dir}")
            print(f"Manga collection count: {self.collection.count()}")
        except Exception as e:
            # Print the full traceback before re-raising so startup failures are debuggable.
            print(f"ERROR initializing manga vector store: {e}")
            import traceback
            traceback.print_exc()
            raise

    def add_manga(
        self,
        mal_id: int,
        embedding_text: str,
        metadata: dict
    ) -> None:
        """Add or update manga entry in vector store.

        Args:
            mal_id: MAL ID; stringified to serve as the ChromaDB document id.
            embedding_text: Text the collection embeds for similarity search.
            metadata: Per-entry metadata stored alongside the embedding.
        """
        self.collection.upsert(
            ids=[str(mal_id)],
            documents=[embedding_text],
            metadatas=[metadata]
        )

    def add_batch(
        self,
        ids: list[int],
        texts: list[str],
        metadatas: list[dict],
        batch_size: int = 100
    ) -> None:
        """Add multiple manga entries in batches.

        `ids`, `texts` and `metadatas` must be parallel lists of equal length.
        """
        total = len(ids)
        for i in range(0, total, batch_size):
            batch_ids = [str(id_) for id_ in ids[i:i+batch_size]]
            batch_texts = texts[i:i+batch_size]
            batch_meta = metadatas[i:i+batch_size]

            self.collection.upsert(
                ids=batch_ids,
                documents=batch_texts,
                metadatas=batch_meta
            )

            # Progress indicator; min() keeps the final (partial) batch count accurate.
            print(f" Added {min(i+batch_size, total)}/{total} manga entries...")

    def search(
        self,
        query: str,
        n_results: int = 10,
        where: Optional[dict] = None
    ) -> list[dict]:
        """Search for similar manga by text query.

        Returns:
            List of dicts with mal_id, metadata, document and similarity,
            where similarity = 1 - cosine distance.
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
            where=where,
            include=["metadatas", "documents", "distances"]
        )

        formatted = []
        for i, mal_id in enumerate(results["ids"][0]):
            formatted.append({
                "mal_id": int(mal_id),
                "metadata": results["metadatas"][0][i],
                "document": results["documents"][0][i],
                "similarity": 1 - results["distances"][0][i]
            })

        return formatted

    def search_similar(
        self,
        mal_id: int,
        n_results: int = 10
    ) -> list[dict]:
        """Find manga similar to a given manga by MAL ID.

        Returns an empty list when the id is not present in the collection.
        """
        result = self.collection.get(
            ids=[str(mal_id)],
            include=["documents"]
        )

        if not result["documents"]:
            return []

        # Ask for one extra result so the entry itself can be excluded below.
        results = self.collection.query(
            query_texts=result["documents"],
            n_results=n_results + 1,
            include=["metadatas", "documents", "distances"]
        )

        formatted = []
        for i, id_ in enumerate(results["ids"][0]):
            if int(id_) == mal_id:
                continue
            formatted.append({
                "mal_id": int(id_),
                "metadata": results["metadatas"][0][i],
                "document": results["documents"][0][i],
                "similarity": 1 - results["distances"][0][i]
            })

        return formatted[:n_results]

    def get_count(self) -> int:
        """Get total number of entries in the collection"""
        return self.collection.count()
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# Singleton instance (lazily created so importing the module stays cheap)
_manga_store: Optional[MangaVectorStore] = None


def get_manga_vector_store() -> MangaVectorStore:
    """Get or create manga vector store instance.

    Lazy singleton: the ChromaDB client is opened on first call only.
    """
    global _manga_store
    if _manga_store is None:
        _manga_store = MangaVectorStore()
    return _manga_store
|
backend/embeddings/search_utils.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search utilities for improved ranking and filtering"""
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def calculate_combined_score(
    similarity: float,
    anime_score: Optional[float],
    popularity: Optional[int] = None,
    weight_similarity: float = 0.6,
    weight_anime_score: float = 0.3,
    weight_popularity: float = 0.1
) -> float:
    """
    Calculate a combined ranking score.

    Args:
        similarity: Vector similarity (0-1)
        anime_score: MAL score (0-10); None is treated as 0
        popularity: Popularity rank (lower is better); None uses a neutral 0.5
        weight_similarity: Weight of the similarity term
        weight_anime_score: Weight of the normalized MAL-score term
        weight_popularity: Weight of the normalized popularity term

    Returns:
        Combined score (0-1), rounded to 4 decimals
    """
    # Normalize anime score to 0-1
    normalized_score = (anime_score or 0) / 10

    # Normalize popularity (inverse, since lower rank = more popular)
    normalized_pop = 0.5  # Default if not available
    if popularity and popularity > 0:
        # Map rank 1-1000 to 1-0.5, rank > 1000 to 0.5-0.1
        if popularity <= 1000:
            normalized_pop = 1 - (popularity / 2000)
        else:
            normalized_pop = max(0.1, 0.5 - (popularity - 1000) / 20000)

    combined = (
        weight_similarity * similarity +
        weight_anime_score * normalized_score +
        weight_popularity * normalized_pop
    )

    return round(combined, 4)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def rerank_results(results: list[dict], limit: int = 15) -> list[dict]:
    """
    Rerank search results using combined scoring.

    Each result dict gains a "combined_score" key (added in place); the
    returned list is ordered best-first and truncated to `limit`.

    Args:
        results: List of search results with metadata
        limit: Max results to return

    Returns:
        Reranked and limited results
    """
    for entry in results:
        meta = entry.get("metadata", {})
        entry["combined_score"] = calculate_combined_score(
            similarity=entry.get("similarity", 0),
            anime_score=meta.get("score", 0),
            popularity=meta.get("popularity"),
        )

    # Highest combined score first, then cut to the requested size.
    ordered = sorted(results, key=lambda item: item["combined_score"], reverse=True)
    return ordered[:limit]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def build_genre_filter(genres: list[str]) -> dict:
    """Build ChromaDB where filter for genres.

    Returns None when no genres are given, so callers can pass the result
    straight through as `where=` to a collection query.

    NOTE(review): ChromaDB documents `$contains` as a *where_document*
    operator, not a metadata `where` operator — confirm the installed
    version accepts it here, otherwise these filters may be rejected.
    """
    if not genres:
        return None

    # ChromaDB uses $contains for partial string match
    if len(genres) == 1:
        return {"genres": {"$contains": genres[0]}}

    # Multiple genres: any match
    return {"$or": [{"genres": {"$contains": g}} for g in genres]}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def extract_keywords(query: str) -> list[str]:
    """Extract important keywords from a search query.

    Lowercases the query, strips surrounding punctuation from each word,
    and drops stop words and empty tokens (the previous version both
    stripped each word twice and kept empty strings from pure-punctuation
    tokens like "!!!").

    Args:
        query: Free-text user query.

    Returns:
        Lowercased keywords in original order.
    """
    # Common filler words that carry no search signal.
    stop_words = {
        "anime", "like", "similar", "to", "with", "the", "a", "an", "and", "or",
        "that", "has", "have", "good", "best", "top", "show", "series", "want",
        "looking", "for", "something", "recommend", "me", "please", "i", "my"
    }

    keywords = []
    for word in query.lower().split():
        cleaned = word.strip(",.!?")
        if cleaned and cleaned not in stop_words:
            keywords.append(cleaned)

    return keywords
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# Genre keyword mappings for better matching
|
| 99 |
+
# Genre keyword mappings for better matching
GENRE_KEYWORDS = {
    "action": ["action", "fight", "battle", "combat", "war"],
    "romance": ["romance", "love", "romantic", "relationship", "dating"],
    "comedy": ["comedy", "funny", "humor", "hilarious", "laugh"],
    "drama": ["drama", "emotional", "feels", "sad", "tear"],
    "horror": ["horror", "scary", "terrifying", "creepy", "dark"],
    "psychological": ["psychological", "mind", "mental", "thriller", "mindbending"],
    "slice of life": ["slice of life", "daily", "everyday", "relaxing", "wholesome"],
    "fantasy": ["fantasy", "magic", "wizard", "isekai", "magical"],
    "sci-fi": ["sci-fi", "scifi", "science fiction", "future", "space", "mecha"],
    "sports": ["sports", "basketball", "soccer", "volleyball", "baseball"],
    "mystery": ["mystery", "detective", "investigation", "whodunit"],
    "supernatural": ["supernatural", "ghost", "spirit", "demon", "paranormal"],
}


def detect_genres_from_query(query: str) -> list[str]:
    """Detect genre preferences mentioned in a natural-language query.

    A genre is detected when any of its trigger keywords occurs as a
    substring of the lowercased query; results are title-cased and follow
    GENRE_KEYWORDS insertion order.
    """
    lowered = query.lower()
    detected = []

    for genre, triggers in GENRE_KEYWORDS.items():
        if any(trigger in lowered for trigger in triggers):
            detected.append(genre.title())

    return detected
|
backend/llm/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# LLM module
|
backend/llm/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (129 Bytes). View file
|
|
|
backend/llm/__pycache__/groq_client.cpython-313.pyc
ADDED
|
Binary file (6.37 kB). View file
|
|
|
backend/llm/groq_client.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Groq LLM Client for AI Recommendations"""
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 7 |
+
from config import GROQ_API_KEY, LLM_MODEL
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from groq import Groq
|
| 11 |
+
except ImportError:
|
| 12 |
+
Groq = None
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
SYSTEM_PROMPT = """You are AniVerse AI, an expert anime and manga recommendation assistant.
|
| 16 |
+
|
| 17 |
+
## YOUR CORE MISSION
|
| 18 |
+
Provide HIGHLY RELEVANT, PRECISE recommendations. Quality over quantity. Every suggestion must directly address what the user is looking for.
|
| 19 |
+
|
| 20 |
+
## RECOMMENDATION RULES
|
| 21 |
+
1. **Match the Query Exactly**: If user asks for "dark fantasy", recommend dark fantasy - not action comedy.
|
| 22 |
+
2. **Use Context Wisely**: Reference the "Relevant Anime/Manga" data provided. These are semantically matched to the query.
|
| 23 |
+
3. **Explain Your Picks**: For EACH recommendation, give 1-2 sentences on WHY it fits the request.
|
| 24 |
+
4. **Limit Recommendations**: Suggest 2-4 titles max per response. Be selective.
|
| 25 |
+
5. **Format Clearly**: Use bold for titles, include scores and genres inline.
|
| 26 |
+
|
| 27 |
+
## PERSONALIZATION (When User Profile Available)
|
| 28 |
+
- Reference their high-rated titles: "Since you gave Attack on Titan a 9..."
|
| 29 |
+
- Avoid genres from low-rated shows
|
| 30 |
+
- Connect new suggestions to their favorites
|
| 31 |
+
|
| 32 |
+
## RESPONSE FORMAT
|
| 33 |
+
When recommending, use this structure:
|
| 34 |
+
**[Title]** (★ score/10) - [Brief reason why this matches their request]
|
| 35 |
+
|
| 36 |
+
## GUIDELINES
|
| 37 |
+
- Be enthusiastic but concise
|
| 38 |
+
- No spoilers
|
| 39 |
+
- If the context doesn't have good matches, say so honestly
|
| 40 |
+
- You can discuss plots, characters, and themes
|
| 41 |
+
- Support both anime AND manga recommendations
|
| 42 |
+
|
| 43 |
+
Context about relevant titles will be provided below."""
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class GroqClient:
    """Groq LLM client for AI-powered recommendations.

    Wraps the Groq chat-completions API with the AniVerse system prompt,
    optional retrieval context, and conversation history.
    """

    def __init__(self):
        """Create the underlying Groq client.

        Raises:
            ImportError: if the ``groq`` package is not installed.
            ValueError: if GROQ_API_KEY is not configured.
        """
        # Fail fast with actionable messages for the two setup mistakes
        # users actually hit: missing package and missing API key.
        if not Groq:
            raise ImportError("groq package not installed. Run: pip install groq")

        if not GROQ_API_KEY:
            raise ValueError("GROQ_API_KEY not set. Add it to your .env file")

        self.client = Groq(api_key=GROQ_API_KEY)
        self.model = LLM_MODEL

    def chat(
        self,
        user_message: str,
        context: str = "",
        history: Optional[list[dict]] = None,  # fix: annotation now matches the None default
        max_tokens: int = 1024
    ) -> str:
        """Send a chat message and get a response.

        Args:
            user_message: The user's current message.
            context: Optional retrieval context, injected as a system message.
            history: Prior turns as ``{"role": ..., "content": ...}`` dicts.
            max_tokens: Upper bound on the generated completion length.

        Returns:
            The assistant's reply text.
        """
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]

        # Add context if provided
        if context:
            messages.append({
                "role": "system",
                "content": f"Here is relevant anime data from our database:\n\n{context}"
            })

        # Add conversation history
        if history:
            messages.extend(history)

        # Add current user message
        messages.append({"role": "user", "content": user_message})

        # Call Groq API
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=0.7,
        )

        return response.choices[0].message.content

    def summarize_reviews(
        self,
        reviews: list[str],
        anime_title: str
    ) -> dict:
        """Summarize multiple reviews into pros/cons.

        Args:
            reviews: Raw review texts; only the first 10 are sent to the LLM.
            anime_title: Title the reviews refer to.

        Returns:
            Dict with ``sentiment``, ``pros``, ``cons``, ``summary`` and
            ``aspects`` keys. Falls back to a neutral structure (raw response
            in ``summary``) when the LLM output is not valid JSON.
        """
        reviews_text = "\n---\n".join(reviews[:10])  # Limit to 10 reviews

        prompt = f"""Analyze these reviews for "{anime_title}" and provide:
1. Overall sentiment (positive/negative/mixed)
2. Top 3 pros (things reviewers loved)
3. Top 3 cons (things reviewers criticized)
4. A 2-3 sentence summary
5. Aspect scores (1-10) for: story, animation, characters, music, enjoyment

Reviews:
{reviews_text}

Respond in JSON format:
{{
    "sentiment": "positive|negative|mixed",
    "pros": ["pro1", "pro2", "pro3"],
    "cons": ["con1", "con2", "con3"],
    "summary": "...",
    "aspects": {{"story": 8, "animation": 9, ...}}
}}"""

        response = self.chat(prompt, max_tokens=512)
        return self._parse_summary_json(response)

    @staticmethod
    def _parse_summary_json(response: str) -> dict:
        """Parse the LLM's JSON summary, tolerating markdown code fences.

        LLMs frequently wrap JSON answers in ```json ... ``` fences even when
        told not to; strip them before parsing so valid payloads are not lost
        to the fallback path.
        """
        import json
        import re

        cleaned = response.strip()
        if cleaned.startswith("```"):
            cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned)
            cleaned = re.sub(r"\s*```$", "", cleaned)

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Best-effort fallback: keep the raw text so callers can display it.
            return {
                "sentiment": "mixed",
                "pros": [],
                "cons": [],
                "summary": response,
                "aspects": {}
            }

    def generate_recommendation_reason(
        self,
        user_query: str,
        anime_data: dict
    ) -> str:
        """Generate a personalized reason why an anime matches the user's request.

        Args:
            user_query: The original natural-language request.
            anime_data: Matched anime record; ``title``/``genres``/``score``
                keys are used when present.

        Returns:
            A 1-2 sentence explanation from the LLM.
        """
        prompt = f"""The user asked: "{user_query}"

This anime was matched:
- Title: {anime_data.get('title', 'Unknown')}
- Genres: {anime_data.get('genres', 'Unknown')}
- Score: {anime_data.get('score', 'N/A')}

In 1-2 sentences, explain why this anime matches what the user is looking for. Be specific about the connection."""

        return self.chat(prompt, max_tokens=150)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Module-level singleton: one GroqClient shared by all callers.
_client: Optional[GroqClient] = None


def get_llm_client() -> GroqClient:
    """Get or create LLM client instance"""
    global _client
    if _client is not None:
        return _client
    _client = GroqClient()
    return _client
|
backend/main.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""AniVerse API - Main Entry Point"""
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
# Add backend to path
|
| 8 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 9 |
+
|
| 10 |
+
from routes import search, chat, anime, auth, lists, recommendations, mal_import, manga
|
| 11 |
+
|
| 12 |
+
# Create FastAPI app with auto-generated OpenAPI docs at /docs and /redoc.
app = FastAPI(
    title="AniVerse API",
    description="AI-powered anime & manga discovery platform with semantic search, personalized recommendations, and user lists",
    version="2.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS middleware - Allow ALL origins for cross-domain requests.
# NOTE(review): wildcard origins are permissive; acceptable for a public
# read-mostly API, but confirm before exposing authenticated endpoints.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,  # Must be False when using wildcard
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers — one per feature area (see the routes package).
app.include_router(search.router)
app.include_router(chat.router)
app.include_router(anime.router)
app.include_router(auth.router)
app.include_router(lists.router)
app.include_router(recommendations.router)
app.include_router(mal_import.router)
app.include_router(manga.router)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@app.get("/")
async def root():
    """API root - health check and info"""
    # Static service descriptor: version, status, and available route prefixes.
    endpoint_map = {
        "docs": "/docs",
        "search": "/api/search",
        "chat": "/api/chat",
        "anime": "/api/anime",
        "manga": "/api/manga",
        "auth": "/api/auth",
        "lists": "/api/lists",
        "recommendations": "/api/recommendations",
    }
    return {
        "name": "AniVerse API",
        "version": "2.0.0",
        "status": "running",
        "endpoints": endpoint_map,
    }
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@app.get("/api/health")
async def health_check():
    """Health check endpoint for Docker/k8s"""
    # Liveness only — no dependency checks, so orchestrators get a fast answer.
    payload = {"status": "healthy"}
    return payload
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.get("/api/stats")
async def get_stats():
    """Get database statistics.

    Returns:
        Counts of anime rows in the source CSV and of indexed entries in the
        anime/manga vector stores.
    """
    # Local imports keep startup fast and avoid loading heavy deps unless hit.
    from embeddings.chroma_store import get_vector_store
    from embeddings.manga_chroma_store import get_manga_vector_store
    from config import DATASET_PATH
    import pandas as pd

    anime_store = get_vector_store()
    manga_store = get_manga_vector_store()
    # Only the row count is needed — reading a single column instead of the
    # whole CSV cuts I/O and memory on every request.
    df = pd.read_csv(DATASET_PATH, usecols=[0])

    return {
        "total_anime": len(df),
        "indexed_anime": anime_store.get_count(),
        "indexed_manga": manga_store.get_count(),
    }
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
if __name__ == "__main__":
    # Dev entry point: auto-reload server on 0.0.0.0:8000.
    # In production the Dockerfile/process manager should launch uvicorn instead.
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
|
backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a055086a635d2cacb3a426111c19a9b788478cd9a55baaea33036c6cbf5b2b13
|
| 3 |
+
size 29851236
|
backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9fdf8dfc32fae317f2cff4ec0a5f920a6591ff14b0f8721ad6c584b713d592dd
|
| 3 |
+
size 100
|
backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7f71f0b2f29d673b4c6a282b9af736e2a498d132f162c4840b3d9aeb1501c89
|
| 3 |
+
size 535698
|
backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b4cc8354a23c80c0abaf4485b174ea8bd44dc51c76c5d0acc1218cc1173df6e
|
| 3 |
+
size 71244
|
backend/manga_chroma_db/466eefed-add1-4ba8-932f-06a367917727/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c442cbb882eeb83457c7747aff45504845f000234c962b652e33dc5c779cb82
|
| 3 |
+
size 157876
|
backend/manga_chroma_db/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69e98a1267fe754f0ebf174c81cc5fe9823375d1fc66d7f78d462550a9ec5d68
|
| 3 |
+
size 29241344
|