// Spaces: Aqso / tim — Paused
// File size: 13,230 Bytes
// 8b75dd4

// scraper.js — Anichin.cafe Gacor Scraper
// Logika: Scrape → Parse → Deduplicate → Upsert ke Firebase

require("dotenv").config();
const axios = require("axios");
const cheerio = require("cheerio");
const { getDB } = require("./firebase");

/**
 * Read an integer setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, empty,
 *   non-numeric, or smaller than `min`.
 * @param {number} min - Smallest acceptable value.
 * @returns {number} The parsed integer or `fallback`.
 */
function envInt(name, fallback, min) {
  const parsed = Number.parseInt(process.env[name], 10);
  return Number.isNaN(parsed) || parsed < min ? fallback : parsed;
}

// Site origin. Trailing slashes are stripped because paths are appended to it
// by plain string concatenation (e.g. `${BASE_URL}/anime/`).
const BASE_URL =
  (process.env.BASE_URL || "").replace(/\/+$/, "") || "https://anichin.cafe";

// Base pause between requests in milliseconds (random jitter is added on top).
const DELAY_MS = envInt("SCRAPE_DELAY_MS", 1500, 0);

// Number of parallel detail workers; must be at least 1 or nothing is scraped.
const MAX_CONCURRENT = envInt("MAX_CONCURRENT", 3, 1);

// ─── UTILITY ─────────────────────────────────────────────────────────────────

/** Return a promise that resolves (with no value) after `ms` milliseconds. */
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

/** Pause for DELAY_MS plus a random extra 0–999 ms. */
const randomDelay = () => {
  const jitterMs = Math.floor(Math.random() * 1000);
  return sleep(DELAY_MS + jitterMs);
};

// Pool of desktop Chrome user-agent strings (Windows, macOS, Linux).
const USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
];

/**
 * Pick one entry of USER_AGENTS at random.
 * @returns {string} A user-agent string.
 */
function getRandomUA() {
  const pick = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[pick];
}

/**
 * Download a page and return the response body.
 *
 * Each attempt sends browser-like headers with a randomly chosen user agent
 * and a 15 s timeout. Any error thrown by axios triggers a retry after an
 * exponentially growing pause (2 s, 4 s, 8 s, ...).
 *
 * @param {string} url - URL to request.
 * @param {number} [retries=3] - Maximum number of attempts. Values below 1
 *   (or non-numeric values) are treated as 1 so the function never returns
 *   without having tried at least once.
 * @returns {Promise<*>} `data` of the axios response.
 * @throws The error raised by the final failed attempt.
 */
async function fetchPage(url, retries = 3) {
  const maxAttempts = Math.max(1, Math.floor(Number(retries)) || 1);

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const res = await axios.get(url, {
        headers: {
          "User-Agent": getRandomUA(),
          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
          Referer: BASE_URL,
          Connection: "keep-alive",
        },
        timeout: 15000,
      });
      return res.data;
    } catch (err) {
      console.warn(`⚠️  Attempt ${attempt}/${maxAttempts} failed for ${url}: ${err.message}`);
      if (attempt === maxAttempts) throw err;
      // Exponential backoff: the wait doubles after every failed attempt.
      await sleep(2000 * 2 ** (attempt - 1));
    }
  }
}

// ─── PARSERS ─────────────────────────────────────────────────────────────────

/**
 * Parse an anime list/catalog page.
 * Anichin usually uses a WordPress + plugin structure, which is what the
 * selectors below target.
 *
 * @param {string} html - Raw HTML of a list, catalog or search-result page.
 * @returns {{animes: Array<{title: string, url: string, thumbnail: string,
 *   score: (number|null), status: string, type: string}>,
 *   nextPage: (string|null)}} One record per distinct anime URL, plus the
 *   absolute URL of the next page (null when no pagination link is found).
 */
function parseAnimeList(html) {
  const $ = cheerio.load(html);
  const animes = [];
  const seenUrls = new Set();

  // Absolute links are kept verbatim; anything else ("/anime/x/", "page/2/",
  // "//host/x") is resolved against BASE_URL rather than string-concatenated.
  const toAbsolute = (href) => {
    if (href.startsWith("http")) return href;
    try {
      return new URL(href, BASE_URL).href;
    } catch {
      return `${BASE_URL}${href}`;
    }
  };

  // Selector for the anime cards on a list page.
  $(".bsx, .bs, article.bs").each((_, el) => {
    const $el = $(el);

    const title =
      $el.find(".tt, h2, .title, a[title]").first().text().trim() ||
      $el.find("a").attr("title") ||
      "";

    const url =
      $el.find("a").first().attr("href") || "";

    const thumbnail =
      $el.find("img").attr("src") ||
      $el.find("img").attr("data-src") ||
      "";

    const score =
      parseFloat($el.find(".numscore, .score, .rating").text().trim()) || null;

    const status =
      $el.find(".statuss, .status").text().trim() || "";

    const type =
      $el.find(".typez, .type").text().trim() || "";

    if (!title || !url) return;

    // The selector list can match both a card wrapper and an element nested
    // inside it, which would yield the same anime twice; keep the first hit.
    const absoluteUrl = toAbsolute(url);
    if (seenUrls.has(absoluteUrl)) return;
    seenUrls.add(absoluteUrl);

    animes.push({
      title,
      url: absoluteUrl,
      thumbnail,
      score,
      status,
      type,
    });
  });

  // Pagination — look for the next-page URL and make it absolute so it can be
  // passed straight back to the fetcher.
  const nextHref =
    $(".next.page-numbers, a.next, .navigation .next a").attr("href");
  const nextPage = nextHref ? toAbsolute(nextHref) : null;

  return { animes, nextPage };
}

/**
 * Parse an anime detail page.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} animeUrl - URL the page was fetched from; stored as `url`.
 * @returns {Object} Detail record: every "key: value" row of the info box
 *   (lower-cased keys, string values) plus the structured fields `title`,
 *   `url`, `synopsis`, `thumbnail`, `genres`, `totalEpisodes`, `episodes`
 *   (sorted ascending by number) and `scrapedAt` (ISO timestamp). The
 *   structured fields always take precedence over info-box rows of the same
 *   name.
 */
function parseAnimeDetail(html, animeUrl) {
  const $ = cheerio.load(html);

  const title =
    $(".entry-title, h1.title, .animDetail h1").first().text().trim();

  const synopsis =
    $(".entry-content p, .synops, .desc").first().text().trim();

  const thumbnail =
    $(".thumbook img, .thumb img, .poster img").attr("src") ||
    $(".thumb img").attr("data-src") ||
    "";

  // Info box (genre, studio, status, etc.): each span is split at the first
  // colon into a lower-cased key and the remaining text as value.
  const info = {};
  $(".infox .spe span, .infoanime span, .spe span").each((_, el) => {
    const text = $(el).text();
    const [rawKey, ...val] = text.split(":");
    const key = rawKey.trim().toLowerCase();
    // Skip rows without a colon and rows whose label is empty/whitespace.
    if (key && val.length) {
      info[key] = val.join(":").trim();
    }
  });

  // Genres
  const genres = [];
  $(".genxed a, .genres a, .genre a").each((_, el) => {
    const g = $(el).text().trim();
    if (g) genres.push(g);
  });

  // Episode list
  const episodes = [];
  $("#episode_by_py li, .eplister ul li, .eps li").each((_, el) => {
    const $ep = $(el);
    const epTitle = $ep.find(".epl-title, .eptitle").text().trim();
    const epUrl = $ep.find("a").attr("href") || "";
    // Prefer the dedicated number element; otherwise fall back to the digits
    // found in the title. null when neither yields a non-zero number.
    const epNum =
      parseFloat($ep.find(".epl-num").text().trim()) ||
      parseFloat(epTitle.replace(/\D/g, "")) ||
      null;
    const epDate = $ep.find(".epl-date").text().trim() || "";

    if (epUrl) {
      episodes.push({
        number: epNum,
        title: epTitle,
        url: epUrl.startsWith("http") ? epUrl : `${BASE_URL}${epUrl}`,
        date: epDate,
      });
    }
  });

  // Sort episodes ascending (unnumbered episodes sort as 0).
  episodes.sort((a, b) => (a.number || 0) - (b.number || 0));

  return {
    // Scraped rows go first so they can never clobber the structured fields
    // below — e.g. an "Episodes: 12" row would otherwise replace the
    // `episodes` array with a string.
    ...info,
    title,
    url: animeUrl,
    synopsis,
    thumbnail,
    genres,
    totalEpisodes: episodes.length,
    episodes,
    scrapedAt: new Date().toISOString(),
  };
}

/**
 * Parse an episode page and collect its streaming links.
 *
 * @param {string} html - Raw HTML of the episode page.
 * @returns {{streamingLinks: Array<{label: string, url: string}>}}
 *   Server/mirror buttons first (labelled with their text), followed by
 *   iframe sources (labelled "embed").
 */
function parseEpisodePage(html) {
  const $ = cheerio.load(html);
  const links = [];

  // Server/mirror buttons: take href, falling back to data-src.
  const buttons = $(".mirror .btn, .serverselect a, .mirrorselect a, .soraddl a").toArray();
  for (const node of buttons) {
    const $node = $(node);
    const target = $node.attr("href") || $node.attr("data-src") || "";
    if (!target) continue;
    links.push({ label: $node.text().trim(), url: target });
  }

  // Embedded iframes.
  const frames = $("iframe[src], .entry-content iframe").toArray();
  for (const frame of frames) {
    const source = $(frame).attr("src") || "";
    if (source) {
      links.push({ label: "embed", url: source });
    }
  }

  return { streamingLinks: links };
}

// ─── SCRAPING JOBS ────────────────────────────────────────────────────────────

/**
 * Scrape the anime list, following pagination links.
 *
 * Every page's records are upserted to Firebase as soon as the page is
 * parsed. A failure on any page is logged and stops the crawl; whatever was
 * collected up to that point is still returned.
 *
 * @param {string} [startUrl] - First list page; defaults to `${BASE_URL}/anime/`.
 * @param {number} [maxPages=10] - Upper bound on the number of pages visited.
 * @returns {Promise<Array<Object>>} All records parsed across the visited pages.
 */
async function scrapeAnimeList(startUrl = `${BASE_URL}/anime/`, maxPages = 10) {
  console.log(`\n🚀 Mulai scrape list anime — max ${maxPages} halaman`);
  const db = getDB();
  const collected = [];

  for (let page = 1, pageUrl = startUrl; pageUrl && page <= maxPages; page++) {
    console.log(`📄 Halaman ${page}: ${pageUrl}`);

    try {
      const html = await fetchPage(pageUrl);
      const parsed = parseAnimeList(html);

      console.log(`   ✓ Dapet ${parsed.animes.length} anime`);
      collected.push(...parsed.animes);

      // Persist this page's records right away.
      if (parsed.animes.length > 0) {
        await batchUpsertAnimes(db, parsed.animes);
      }

      pageUrl = parsed.nextPage;

      // Only pause when another page is going to be requested.
      if (pageUrl) {
        await randomDelay();
      }
    } catch (err) {
      console.error(`❌ Gagal scrape halaman ${page}: ${err.message}`);
      break;
    }
  }

  console.log(`\n✅ List selesai — total ${collected.length} anime discrape`);
  return collected;
}

/**
 * Scrape the detail page (including the episode list) of each anime.
 *
 * Up to MAX_CONCURRENT workers pull from a shared queue. Each worker fetches
 * and parses one detail page, merges it over the list record, upserts it, and
 * pauses before taking its next item. A failure for one anime is logged and
 * does not stop the others.
 *
 * @param {Array<{title: string, url: string}>} animes - Records as produced
 *   by the list scraper.
 * @returns {Promise<Array<Object>>} Merged records that were scraped and
 *   saved successfully, in completion order.
 */
async function scrapeAnimeDetails(animes) {
  console.log(`\n🔍 Scrape detail untuk ${animes.length} anime...`);
  const db = getDB();

  const results = [];
  const queue = [...animes];
  let started = 0;

  // One worker: drains the shared queue iteratively (no recursive promise
  // chain that grows with the number of items).
  async function worker() {
    while (queue.length > 0) {
      const anime = queue.shift();
      // Number items in start order so concurrent workers never print the
      // same progress index.
      const position = ++started;

      try {
        console.log(`  [${position}/${animes.length}] ${anime.title}`);
        const html = await fetchPage(anime.url);
        const detail = parseAnimeDetail(html, anime.url);

        // Merge with the data from the list page; detail fields win.
        const merged = { ...anime, ...detail };
        await upsertAnimeDetail(db, merged);
        results.push(merged);
      } catch (err) {
        console.error(`  ❌ ${anime.title}: ${err.message}`);
      }

      // Pause only if this worker is going to make another request.
      if (queue.length > 0) await randomDelay();
    }
  }

  // Run N concurrent workers: at least one even if MAX_CONCURRENT is
  // missing or invalid, and never more than there are items.
  const workerCount = Math.max(
    1,
    Math.min(Number(MAX_CONCURRENT) || 1, animes.length)
  );
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  console.log(`\n✅ Detail selesai — ${results.length}/${animes.length} berhasil`);
  return results;
}

/**
 * Incremental update: scrape only anime that gained episodes.
 *
 * Reads up to 50 documents from the "animes" collection whose `status` is
 * "Ongoing", "ongoing" or "Airing", re-parses each detail page, and upserts
 * the fresh data only when it lists more episodes than the stored
 * `totalEpisodes`. Failures for individual anime are logged and skipped.
 *
 * @returns {Promise<void>}
 */
async function scrapeIncrementalUpdates() {
  console.log("\n🔄 Incremental update — cek episode baru...");
  const db = getDB();

  // Fetch anime whose status marks them as still airing.
  const snapshot = await db
    .collection("animes")
    .where("status", "in", ["Ongoing", "ongoing", "Airing"])
    .limit(50)
    .get();

  console.log(`   Dapet ${snapshot.size} anime ongoing`);

  const toUpdate = [];
  snapshot.forEach((doc) => toUpdate.push({ id: doc.id, ...doc.data() }));

  for (const [index, anime] of toUpdate.entries()) {
    try {
      const html = await fetchPage(anime.url);
      const fresh = parseAnimeDetail(html, anime.url);
      const knownEpisodes = anime.totalEpisodes || 0;

      // Only write when there are new episodes.
      if (fresh.totalEpisodes > knownEpisodes) {
        console.log(
          `  🆕 ${anime.title}: ${knownEpisodes} → ${fresh.totalEpisodes} eps`
        );
        await upsertAnimeDetail(db, { ...anime, ...fresh });
      }
    } catch (err) {
      console.error(`  ❌ ${anime.title}: ${err.message}`);
    }

    // Pause between requests whether or not the previous one succeeded, so a
    // run of failures does not turn into back-to-back requests. No pause is
    // needed after the final item.
    if (index < toUpdate.length - 1) await randomDelay();
  }

  console.log("✅ Incremental update selesai");
}

// ─── FIREBASE OPERATIONS ──────────────────────────────────────────────────────

/**
 * Batch-upsert basic anime records (title, url, thumbnail, ...).
 * The slug extracted from each record's URL is used as the document ID in
 * the "animes" collection; writes are merged into existing documents.
 *
 * @param {Object} db - Database handle exposing `batch()` and `collection()`.
 * @param {Array<Object>} animes - Records to write; each needs a `url`.
 * @returns {Promise<void>}
 */
async function batchUpsertAnimes(db, animes) {
  const BATCH_SIZE = 400; // stays below the Firestore limit of 500 per batch
  let batchNumber = 0;

  for (let start = 0; start < animes.length; start += BATCH_SIZE) {
    batchNumber += 1;
    const batch = db.batch();

    for (const anime of animes.slice(start, start + BATCH_SIZE)) {
      const slug = extractSlug(anime.url);
      const target = db.collection("animes").doc(slug);
      const payload = {
        ...anime,
        slug,
        updatedAt: new Date().toISOString(),
      };
      batch.set(target, payload, { merge: true });
    }

    await batch.commit();
    console.log(`   💾 Saved batch ${batchNumber}`);
  }
}

/**
 * Upsert a full anime record, storing its episodes as a subcollection.
 *
 * The main document (everything except `episodes`) is merged into
 * `animes/{slug}` with `hasDetails: true`. Each episode is merged into
 * `animes/{slug}/episodes/{epId}`, where `epId` is `ep-NNNN` for numbered
 * episodes and `ep-idx-NNNN` (position in the list) for unnumbered ones.
 *
 * @param {Object} db - Database handle exposing `batch()` and `collection()`.
 * @param {Object} anime - Record with a `url` and, optionally, an `episodes`
 *   array of `{number, title, url, date}` objects.
 * @returns {Promise<void>}
 */
async function upsertAnimeDetail(db, anime) {
  const slug = extractSlug(anime.url);
  const animeRef = db.collection("animes").doc(slug);

  // Keep episodes out of the main document so it does not grow too large.
  const { episodes, ...animeData } = anime;

  // Update the main anime document.
  await animeRef.set(
    {
      ...animeData,
      slug,
      hasDetails: true,
      updatedAt: new Date().toISOString(),
    },
    { merge: true }
  );

  // Nothing more to do without a real, non-empty episode list. A non-array
  // value (e.g. a scraped "episodes" string) is ignored instead of crashing.
  if (!Array.isArray(episodes) || episodes.length === 0) return;

  // Upsert episodes into the subcollection, in chunks below the batch limit.
  const BATCH_SIZE = 400;
  for (let i = 0; i < episodes.length; i += BATCH_SIZE) {
    const batch = db.batch();
    episodes.slice(i, i + BATCH_SIZE).forEach((ep, offset) => {
      // Unnumbered episodes get an ID from their own position in the full
      // list (not the chunk start, which made them all share one ID), with a
      // distinct prefix so they cannot collide with a real episode number.
      const epId = ep.number
        ? `ep-${String(ep.number).padStart(4, "0")}`
        : `ep-idx-${String(i + offset).padStart(4, "0")}`;
      const epRef = animeRef.collection("episodes").doc(epId);
      batch.set(epRef, { ...ep, animeSlug: slug }, { merge: true });
    });
    await batch.commit();
  }
}

/**
 * Extract a slug from a URL.
 * e.g. https://anichin.cafe/anime/one-piece/ → one-piece
 *
 * For an absolute URL the slug is the last non-empty path segment. When the
 * input has no path segment (site root) or is not an absolute URL, the whole
 * input is turned into a slug by replacing every character outside
 * `[a-z0-9-]` with "-" and lower-casing, so the result never contains "/".
 *
 * @param {string} url - URL (or arbitrary string) to derive a slug from.
 * @returns {string} The slug; "" for null/undefined input.
 */
function extractSlug(url) {
  const raw = typeof url === "string" ? url : String(url == null ? "" : url);

  try {
    const segments = new URL(raw).pathname.split("/").filter(Boolean);
    if (segments.length > 0) return segments[segments.length - 1];
  } catch {
    // Not an absolute URL — fall through to the sanitising fallback.
  }

  return raw.replace(/[^a-z0-9-]/gi, "-").toLowerCase();
}

// ─── SEARCH ───────────────────────────────────────────────────────────────────

/**
 * Run a site search and return the parsed result cards.
 *
 * @param {string} query - Search text; URL-encoded into the `s` parameter.
 * @returns {Promise<Array<Object>>} Records parsed from the result page
 *   (pagination is not followed).
 */
async function scrapeSearch(query) {
  const encodedQuery = encodeURIComponent(query);
  console.log(`🔍 Search: "${query}"`);

  const resultPage = await fetchPage(`${BASE_URL}/?s=${encodedQuery}`);
  return parseAnimeList(resultPage).animes;
}

// ─── EXPORT ───────────────────────────────────────────────────────────────────

// Public API: the four scraping jobs, followed by the lower-level fetch,
// parse and slug helpers so they can be used on their own.
module.exports = {
  scrapeAnimeList,
  scrapeAnimeDetails,
  scrapeIncrementalUpdates,
  scrapeSearch,
  fetchPage,
  parseAnimeList,
  parseAnimeDetail,
  parseEpisodePage,
  extractSlug,
};

// When run directly (`node scraper.js`) rather than required as a module:
// scrape up to 5 list pages, then the details of the first 10 anime.
if (require.main === module) {
  const runCli = async () => {
    const animes = await scrapeAnimeList(`${BASE_URL}/anime/`, 5);
    await scrapeAnimeDetails(animes.slice(0, 10));
  };

  runCli().catch((err) => {
    console.error("Fatal:", err);
    process.exit(1);
  });
}