Pushkar02-n committed on
Commit
1d33c61
·
verified ·
1 Parent(s): d906298

Delete src/data_ingestion

Browse files
src/data_ingestion/clean_data.py DELETED
@@ -1,160 +0,0 @@
1
- import json
2
- import pandas as pd
3
- import logging
4
-
5
# Configure root logging once at import time, then grab this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
7
-
8
-
9
class AnimeDataCleaner:
    """Cleans and prepares anime data for embeddings.

    Every method is stateless, hence ``@staticmethod`` throughout.
    """

    @staticmethod
    def load_raw_data(filepath: str = "data/raw/raw_anime.json") -> list[dict]:
        """Load the raw anime JSON dump produced by the fetcher."""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def clean_synopsis(synopsis: str) -> str:
        """Return a cleaned synopsis, or a placeholder when it is missing.

        Removes the "[Written by MAL Rewrite]" attribution footer and
        surrounding whitespace.
        """
        if not synopsis:
            return "No synopsis available."

        synopsis = synopsis.replace("[Written by MAL Rewrite]", "")
        # FIX: str.strip() returns a new string; the original discarded its
        # result, so surrounding whitespace was never actually removed.
        return synopsis.strip()

    @staticmethod
    def create_searchable_text(anime: dict) -> str:
        """
        Combine multiple fields into one searchable text.
        This is what we'll embed!

        Format: Title. Genres. Synopsis.
        """
        title = anime.get("title")
        title_en = anime.get("title_english")

        # Append the English title only when it adds information.
        title_text = title
        if title_en and title_en != title:
            title_text = f"{title} ({title_en})"

        genres = ", ".join(anime.get("genres", []))
        themes = ", ".join(anime.get("themes", []))
        demographics = ", ".join(anime.get("demographics", []))

        # ". ".join([]) is already "", so no fallback branch is needed.
        genre_parts = [p for p in [genres, themes, demographics] if p]
        genre_text = ". ".join(genre_parts)

        synopsis = AnimeDataCleaner.clean_synopsis(anime.get("synopsis", ""))

        searchable_text = f"{title_text}. {genre_text}. {synopsis}"
        return searchable_text.strip()

    @staticmethod
    def filter_valid_anime(anime_list: list[dict]) -> list[dict]:
        """Remove anime without a title or a usable synopsis (>= 50 chars)."""
        valid_anime = []

        for anime in anime_list:
            if not anime.get("title"):
                continue

            synopsis = anime.get("synopsis", "")
            # TODO: fetch a custom synopsis from online sources instead of
            # dropping short entries.
            if not synopsis or len(synopsis) < 50:
                continue

            valid_anime.append(anime)

        # FIX: the original emitted the same message via both print() and
        # logger.info(); keep only the logger (lazy %-args avoid formatting
        # when INFO is disabled).
        logger.info("Filtered %d -> %d animes",
                    len(anime_list), len(valid_anime))

        return valid_anime

    @staticmethod
    def prepare_for_embedding(anime_list: list[dict]) -> pd.DataFrame:
        """
        Prepare final dataset for embedding.

        Returns a dataframe with columns:
        - mal_id: unique identifier
        - searchable_text: what to embed
        - metadata: everything else (for filtering/display)
        """
        records = []

        for anime in anime_list:
            record = {
                "mal_id": anime["mal_id"],
                "url": anime.get("url"),
                "title": anime.get("title"),
                "title_english": anime.get("title_english"),
                "synopsis": AnimeDataCleaner.clean_synopsis(anime.get("synopsis", "")),

                # Keep these as native dicts/lists for Postgres JSONB!
                "images": anime.get("images", {}),
                "genres": anime.get("genres", []),
                "studios": anime.get("studios", []),
                "themes": anime.get("themes", []),
                "demographics": anime.get("demographics", []),

                "type": anime.get("type"),
                "episodes": anime.get("episodes"),
                "score": anime.get("score"),
                "scored_by": anime.get("scored_by"),
                "rank": anime.get("rank"),
                "popularity": anime.get("popularity"),
                "year": anime.get("year"),
                "season": anime.get("season"),
                "rating": anime.get("rating"),
                "aired_from": anime.get("aired_from"),
                "aired_to": anime.get("aired_to"),
                "favorites": anime.get("favorites"),

                # And our custom RAG field
                "searchable_text": AnimeDataCleaner.create_searchable_text(anime)
            }
            records.append(record)

        return pd.DataFrame(records)

    @staticmethod
    def save_processed_data(df: pd.DataFrame, filepath: str = "data/processed/anime_clean.csv"):
        """Persist the processed dataset as CSV plus a JSON twin."""
        df.to_csv(filepath, index=False, encoding="utf-8")
        logger.info("Saved %d anime to %s", len(df), filepath)

        json_path = filepath.replace(".csv", ".json")
        df.to_json(json_path, orient="records", indent=2, force_ascii=False)
        logger.info("Also saved to %s", json_path)
137
-
138
-
139
if __name__ == "__main__":
    # Script entry point: load raw data, filter, build the embedding
    # dataset, then persist and print a quick summary.
    cleaner = AnimeDataCleaner()

    print("Loading raw data....")
    raw = cleaner.load_raw_data("data/raw/raw_anime.json")

    kept = cleaner.filter_valid_anime(raw)

    print("\nPreparing data for embedding...")
    dataset = cleaner.prepare_for_embedding(kept)

    cleaner.save_processed_data(
        dataset, filepath="data/processed/anime_clean.csv")

    print("\nSample searchable text:")
    print(dataset.iloc[0]["searchable_text"][:500])

    print(f"\nDataset statistics:")
    print(f"Total anime: {len(dataset)}")
    print(
        f"Average text length: {dataset['searchable_text'].str.len().mean():.0f} chars")
    print(f"Score range: {dataset['score'].min():.1f} - {dataset['score'].max():.1f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data_ingestion/create_embeddings.py DELETED
@@ -1,232 +0,0 @@
1
- from sentence_transformers import SentenceTransformer
2
- # import chromadb
3
- # from chromadb.config import Settings
4
- from qdrant_client import QdrantClient, models
5
- from sqlmodel import Session, select
6
- import logging
7
- from config import settings
8
-
9
- from src.database.session import engine
10
- from src.database.models import Animes
11
-
12
# Configure logging with timestamps before handing out this module's logger.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
15
-
16
-
17
class EmbeddingPipeline:
    """Creates sentence embeddings for anime records and stores them in Qdrant."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding model and the Qdrant client.

        Args:
            model_name: HuggingFace model for embeddings.
                all-MiniLM-L6-v2: fast, good quality, 384 dims.
        """
        logger.info("Loading embedding model: %s", model_name)
        self.model = SentenceTransformer(model_name)
        self.vector_size = self.model.get_sentence_embedding_dimension() or 0

        self.client = QdrantClient(url=settings.qdrant_url,
                                   api_key=settings.qdrant_api_key,
                                   cloud_inference=True)

        # When True, the embedding step is skipped and stored vectors reused.
        self.use_existing_embeddings = False
        # FIX: the original printed "ChromaDB initialized at
        # data/embeddings/chroma_db" although the backend is Qdrant.
        logger.info("Qdrant client initialized")

    def create_or_get_collection(self, collection_name: str = "anime_collection"):
        """Create the collection, or interactively reuse/reset an existing one.

        Returns:
            The collection name (unchanged), for chaining into later steps.
        """
        if self.client.collection_exists(collection_name=collection_name):
            collection = self.client.get_collection(collection_name)
            # FIX: the original logged "Found existing collection" twice.
            logger.info(f"Found existing collection: {collection_name}")
            logger.info(f"Current count: {collection.points_count} points")

            user_input = input("Reset collection? (y/n): ")
            if user_input.lower() == "y":
                self.client.delete_collection(collection_name)
                logger.info("Collection reset")
            else:
                self.use_existing_embeddings = True
                return collection_name

        if not self.use_existing_embeddings:
            is_collection_created = self.client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(
                    size=self.vector_size,
                    distance=models.Distance.COSINE
                ))
            logger.info(
                f"Created new collection: {collection_name}: {is_collection_created}")

        return collection_name

    def fetch_data_from_postgres(self, batch_size: int = 2000):
        """Fetch anime records from PostgreSQL in batches to avoid timeouts."""
        logger.info("Fetching data from PostgreSQL in batches...")
        all_results = []

        with Session(engine) as session:
            offset = 0
            while True:
                # order_by is strictly required when using offset/limit to
                # guarantee a stable order and hence no duplicates.
                statement = (
                    select(Animes)
                    .where(Animes.searchable_text != None)  # noqa: E711 (SQL expression)
                    .order_by(Animes.id)
                    .offset(offset)
                    .limit(batch_size)
                )

                batch = session.exec(statement).all()

                if not batch:
                    break  # No more rows returned -> done

                all_results.extend(batch)
                offset += len(batch)
                logger.info("Downloaded %d rows from Supabase so far...", offset)

        logger.info("Successfully fetched a total of %d records.",
                    len(all_results))
        return all_results

    def embed_texts(self, texts: list[str], batch_size: int = 32) -> list[list[float]] | None:
        """
        Create embeddings for texts.

        Args:
            texts: List of texts to embed.
            batch_size: Process in batches for efficiency.

        Returns:
            One vector per text, or None when stored embeddings are reused.
        """
        if self.use_existing_embeddings:
            logger.info("Using existing stored embeddings.")
            # FIX: return None explicitly instead of falling off the end.
            return None

        logger.info("Embedding %d texts...", len(texts))
        embeddings = self.model.encode(
            sentences=texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )
        return embeddings.tolist()

    def store_in_QdrantDB(self, client: QdrantClient, collection_name, db_records: list[Animes], final_texts: list[str], embeddings: list[list[float]]):
        """
        Store embeddings and metadata in Qdrant.

        Args:
            client: Qdrant client (kept for interface compatibility; the
                instance's own client is used for the upsert).
            collection_name: Qdrant collection name.
            db_records: Anime rows retrieved from the PostgreSQL database.
            final_texts: Texts that were embedded, stored in the payload.
            embeddings: Pre-computed embeddings, parallel to db_records.
        """
        logger.info("Storing in QdrantDB...")

        points = []
        for i, row in enumerate(db_records):
            genres_list = row.genres if isinstance(row.genres, list) else []
            if len(genres_list) == 0:
                genres_list = ["Unknown"]

            # Qdrant uses PointStruct holding ID, vector and payload
            # (metadata + document).
            point = models.PointStruct(
                # Qdrant requires IDs to be integers or UUIDs
                id=int(row.mal_id),
                vector=embeddings[i],
                payload={
                    # Store the text here since Qdrant doesn't separate
                    # documents from metadata.
                    "document": final_texts[i],
                    "title": row.title,
                    "genres": genres_list,
                    "score": float(row.score) if row.score else 0.0,
                    "type": row.type if row.type else "Unknown",
                    "scored_by": row.scored_by if row.scored_by else 0
                }
            )
            points.append(point)

        chunk_size = 500
        # FIX: ceiling division; the original `len // chunk + 1` reported one
        # extra batch whenever len(points) was an exact multiple of chunk_size.
        total_chunks = (len(points) + chunk_size - 1) // chunk_size
        logger.info("Inserting into Qdrant in %d batches...", total_chunks)

        for i in range(0, len(points), chunk_size):
            batch = points[i: i + chunk_size]
            self.client.upsert(
                collection_name=collection_name,
                points=batch
            )
            logger.info("Inserted batch %d/%d", (i // chunk_size) + 1, total_chunks)

        logger.info("Successfully stored %d animes in Qdrant", len(points))

    def run_pipeline(self):
        """Run the complete embedding pipeline and return the collection name."""
        # 1. Fetch from DB instead of CSV
        db_records = self.fetch_data_from_postgres()
        logger.info("Loaded %d animes from Postgres", len(db_records))

        collection_name = self.create_or_get_collection()

        if not self.use_existing_embeddings:
            texts_to_embed = []
            for row in db_records:
                text = row.searchable_text if row.searchable_text else ""
                # Enrich the embedded text with studio names when available.
                if hasattr(row, 'studios') and row.studios:
                    text += f" Studio: {', '.join(row.studios)}"
                texts_to_embed.append(text)

            print(texts_to_embed[0])
            embeddings = self.embed_texts(texts_to_embed)

            if embeddings:
                self.store_in_QdrantDB(
                    self.client, collection_name, db_records, texts_to_embed, embeddings)

        logger.info("Embedding pipeline complete!")
        return collection_name
200
-
201
-
202
if __name__ == "__main__":
    # Entry point deliberately disabled for now. To run the full pipeline:
    #   pipeline = EmbeddingPipeline()
    #   collection_name = pipeline.run_pipeline()
    # A manual vector-search smoke test against the Qdrant collection
    # previously lived here; it was removed because it referenced
    # undefined variables (query_vector, limit, results).
    pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data_ingestion/fetch_anime.py DELETED
@@ -1,151 +0,0 @@
1
- import requests
2
- import time
3
- import json
4
- import logging
5
- from datetime import datetime
6
-
7
-
8
- def convert_datetime(dt: str | None):
9
- if not dt:
10
- return None
11
- return datetime.fromisoformat(dt)
12
-
13
-
14
# Set up timestamped logging, then create the module-level logger.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
17
-
18
-
19
class AnimeDataFetcher:
    """Fetches anime data from Jikan API (Unofficial MyAnimeList API)"""

    BASE_URL = "https://api.jikan.moe/v4/"

    def __init__(self):
        # Reuse one HTTP session for connection pooling across pages.
        self.session = requests.Session()

    def fetch_bulk_anime(self, total_limit: int = 10000, filename: str = "raw_anime.json"):
        """
        Fetch anime in bulk with resume and retry capabilities.

        Args:
            total_limit: Maximum number of records to accumulate.
            filename: File name under data/raw/ used both for resuming and
                for the auto-save after each page.

        Returns:
            The accumulated list of raw anime dicts (at most total_limit).
        """
        # FIX: the path was a garbled literal that ignored `filename`;
        # build it from the argument so resume and save use the same file.
        filepath = f"data/raw/{filename}"
        all_animes = []

        # --- RESUME LOGIC ---
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                all_animes = json.load(f)
            logger.info(
                f"Found existing data. Resuming from record {len(all_animes)}.")
        except (FileNotFoundError, json.JSONDecodeError):
            logger.info("No existing data found. Starting fresh.")

        # Calculate the next page to fetch (25 items per page)
        page = (len(all_animes) // 25) + 1
        max_retries = 5

        while len(all_animes) < total_limit:
            retries = 0
            success = False

            while retries < max_retries and not success:
                try:
                    logger.info(
                        f"🚀 Fetching page {page} (Progress: {len(all_animes)}/{total_limit})...")
                    response = self.session.get(
                        f"{self.BASE_URL}anime",
                        params={
                            "page": page,
                            "limit": 25,
                            "order_by": "popularity",
                            "sort": "asc"
                        },
                        timeout=30
                    )

                    if response.status_code == 429:
                        wait = 60 + (retries * 30)  # Increasing wait time
                        logger.warning(f"⚠️ Rate limit! Sleeping {wait}s...")
                        time.sleep(wait)
                        retries += 1
                        continue

                    response.raise_for_status()
                    data = response.json()
                    anime_list = data.get("data", [])

                    if not anime_list:
                        logger.info("🏁 No more anime found in API.")
                        return all_animes

                    all_animes.extend(anime_list)

                    # --- AUTO-SAVE EVERY PAGE ---
                    # Save often so we never lose more than 1 page of work.
                    self.save_raw_data(all_animes, filename=filename)

                    success = True
                    page += 1
                    time.sleep(1.2)  # Polite delay for the public API

                except Exception as e:
                    retries += 1
                    logger.error(
                        f"❌ Error on page {page}: {e}. Retry {retries}/{max_retries}")
                    time.sleep(10 * retries)

            if not success:
                logger.critical(
                    f"🛑 Giving up on page {page}. Run script again later to resume.")
                break

        return all_animes[:total_limit]

    def extract_relevant_fields(self, anime: dict) -> dict:
        """Extract only the fields we need for RAG from a raw API record."""
        return {
            "mal_id": anime.get("mal_id"),
            "url": anime.get("url", ""),
            "images": anime.get("images", {}),
            "title": anime.get("title"),
            "title_english": anime.get("title_english"),
            "synopsis": anime.get("synopsis"),
            "genres": [g["name"] for g in anime.get("genres", [])],
            "studios": [s["name"] for s in anime.get("studios", [])],
            "themes": [t["name"] for t in anime.get("themes", [])],
            "demographics": [d["name"] for d in anime.get("demographics", [])],
            "type": anime.get("type"),
            "episodes": anime.get("episodes"),
            "score": anime.get("score"),
            "scored_by": anime.get("scored_by"),
            "rank": anime.get("rank"),
            "popularity": anime.get("popularity"),
            "year": anime.get("year"),
            "rating": anime.get("rating"),
            "season": anime.get("season"),
            "aired_from": anime.get("aired", {}).get("from", ""),
            "aired_to": anime.get("aired", {}).get("to", ""),
            "favorites": anime.get("favorites")
        }

    def save_raw_data(self, anime_list: list[dict], filename: str = "raw_anime.json"):
        """Save raw anime data under data/raw/ as pretty-printed JSON."""
        # FIX: build the path from `filename` (previously a garbled literal
        # that ignored the parameter).
        filepath = f"data/raw/{filename}"
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(anime_list, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(anime_list)} anime to {filepath}")
137
-
138
-
139
if __name__ == "__main__":
    # Script entry point: bulk-fetch, slim down to the relevant fields,
    # persist, and show one sample record.
    fetcher = AnimeDataFetcher()

    logger.info("Fetching top 10000 anime from MyAnimeList...")
    fetched = fetcher.fetch_bulk_anime(total_limit=10000)

    slimmed = [fetcher.extract_relevant_fields(item) for item in fetched]

    fetcher.save_raw_data(slimmed, filename="raw_anime.json")

    print("\nSample anime: ")
    print(json.dumps(slimmed[0], indent=2, ensure_ascii=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data_ingestion/load_to_postgres.py DELETED
@@ -1,53 +0,0 @@
1
- import json
2
- import logging
3
- from sqlmodel import Session, select
4
- from src.database.session import engine
5
- from src.database.models import Animes
6
- from src.database import init_db
7
-
8
# Project-level database setup runs at import time — presumably creates the
# schema/tables; confirm against src.database.init_db.
init_db()

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
12
-
13
-
14
def load_json_data(filepath: str = "data/processed/anime_clean.json") -> list[dict]:
    """Read the cleaned anime records from a JSON file."""
    with open(filepath, encoding='utf-8') as fh:
        return json.load(fh)
18
-
19
-
20
def insert_animes_to_db(anime_list: list[dict]):
    """
    Insert a list of anime dictionaries into PostgreSQL safely.

    Existing rows (matched by mal_id) are skipped; all inserts are
    committed in a single transaction at the end.
    """
    inserted_count = 0
    skipped_count = 0
    with Session(engine) as session:
        for data in anime_list:
            try:
                existing = session.exec(
                    select(Animes).where(Animes.mal_id == data["mal_id"])
                ).first()

                if not existing:
                    new_anime = Animes(**data)
                    session.add(new_anime)
                    inserted_count += 1
                    print(f"Inserted_count: {inserted_count}/{len(anime_list)} animes")
                else:
                    skipped_count += 1

            except Exception as e:
                # FIX: the original f-string nested double quotes
                # ({data.get("mal_id")}) inside a double-quoted literal —
                # a SyntaxError on Python < 3.12. Lazy %-args also avoid
                # formatting when the record is fine.
                logger.error(
                    "Error processing anime ID: %s: %s", data.get('mal_id'), e)

        session.commit()
    logger.info(
        "Injection complete! Inserted: %d | Skipped (Duplicates): %d",
        inserted_count, skipped_count)
48
-
49
-
50
if __name__ == "__main__":
    # Load the cleaned dataset and push it into PostgreSQL.
    records = load_json_data(filepath="data/processed/anime_clean.json")
    insert_animes_to_db(records)