// scraper.js — Anichin.cafe Gacor Scraper // Logika: Scrape → Parse → Deduplicate → Upsert ke Firebase require("dotenv").config(); const axios = require("axios"); const cheerio = require("cheerio"); const { getDB } = require("./firebase"); const BASE_URL = process.env.BASE_URL || "https://anichin.cafe"; const DELAY_MS = parseInt(process.env.SCRAPE_DELAY_MS || "1500"); const MAX_CONCURRENT = parseInt(process.env.MAX_CONCURRENT || "3"); // ─── UTILITY ───────────────────────────────────────────────────────────────── const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); const randomDelay = () => sleep(DELAY_MS + Math.floor(Math.random() * 1000)); const USER_AGENTS = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", ]; function getRandomUA() { return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)]; } async function fetchPage(url, retries = 3) { for (let attempt = 1; attempt <= retries; attempt++) { try { const res = await axios.get(url, { headers: { "User-Agent": getRandomUA(), Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7", Referer: BASE_URL, Connection: "keep-alive", }, timeout: 15000, }); return res.data; } catch (err) { const isLast = attempt === retries; console.warn(`⚠️ Attempt ${attempt}/${retries} failed for ${url}: ${err.message}`); if (!isLast) await sleep(2000 * attempt); // exponential backoff else throw err; } } } // ─── PARSERS ───────────────────────────────────────────────────────────────── /** * Parse halaman list/catalog anime * Anichin biasanya pake struktur WordPress + plugin */ function parseAnimeList(html) { const $ = cheerio.load(html); const animes = []; // Selector untuk card anime di halaman list $(".bsx, .bs, article.bs").each((_, el) => { const $el = $(el); const title = $el.find(".tt, h2, .title, a[title]").first().text().trim() || $el.find("a").attr("title") || ""; const url = $el.find("a").first().attr("href") || ""; const thumbnail = $el.find("img").attr("src") || $el.find("img").attr("data-src") || ""; const score = parseFloat($el.find(".numscore, .score, .rating").text().trim()) || null; const status = $el.find(".statuss, .status").text().trim() || ""; const type = $el.find(".typez, .type").text().trim() || ""; if (title && url) { animes.push({ title, url: url.startsWith("http") ? url : `${BASE_URL}${url}`, thumbnail, score, status, type, }); } }); // Pagination — cari next page URL const nextPage = $(".next.page-numbers, a.next, .navigation .next a").attr("href") || null; return { animes, nextPage }; } /** * Parse halaman detail anime */ function parseAnimeDetail(html, animeUrl) { const $ = cheerio.load(html); const title = $(".entry-title, h1.title, .animDetail h1").first().text().trim(); const synopsis = $(".entry-content p, .synops, .desc").first().text().trim(); const thumbnail = $(".thumbook img, .thumb img, .poster img").attr("src") || $(".thumb img").attr("data-src") || ""; // Info box (Genre, Studio, Status, dll) const info = {}; $(".infox .spe span, .infoanime span, .spe span").each((_, el) => { const text = $(el).text(); const [key, ...val] = text.split(":"); if (key && val.length) { info[key.trim().toLowerCase()] = val.join(":").trim(); } }); // Genres const genres = []; $(".genxed a, .genres a, .genre a").each((_, el) => { const g = $(el).text().trim(); if (g) genres.push(g); }); // Episodes list const episodes = []; $("#episode_by_py li, .eplister ul li, .eps li").each((_, el) => { const $ep = $(el); const epTitle = $ep.find(".epl-title, .eptitle").text().trim(); const epUrl = $ep.find("a").attr("href") || ""; const epNum = parseFloat($ep.find(".epl-num").text().trim()) || parseFloat(epTitle.replace(/\D/g, "")) || null; const epDate = $ep.find(".epl-date").text().trim() || ""; if (epUrl) { episodes.push({ number: epNum, title: epTitle, url: epUrl.startsWith("http") ? epUrl : `${BASE_URL}${epUrl}`, date: epDate, }); } }); // Sort episodes ascending episodes.sort((a, b) => (a.number || 0) - (b.number || 0)); return { title, url: animeUrl, synopsis, thumbnail, genres, totalEpisodes: episodes.length, episodes, ...info, scrapedAt: new Date().toISOString(), }; } /** * Parse halaman episode (ambil streaming links) */ function parseEpisodePage(html) { const $ = cheerio.load(html); const streamingLinks = []; // Tombol server/mirror $(".mirror .btn, .serverselect a, .mirrorselect a, .soraddl a").each((_, el) => { const $el = $(el); const label = $el.text().trim(); const href = $el.attr("href") || $el.attr("data-src") || ""; if (href) streamingLinks.push({ label, url: href }); }); // iFrame embed links $("iframe[src], .entry-content iframe").each((_, el) => { const src = $(el).attr("src") || ""; if (src) streamingLinks.push({ label: "embed", url: src }); }); return { streamingLinks }; } // ─── SCRAPING JOBS ──────────────────────────────────────────────────────────── /** * Scrape semua anime dari halaman list dengan pagination */ async function scrapeAnimeList(startUrl = `${BASE_URL}/anime/`, maxPages = 10) { console.log(`\n🚀 Mulai scrape list anime — max ${maxPages} halaman`); const db = getDB(); const allAnimes = []; let currentUrl = startUrl; let page = 1; while (currentUrl && page <= maxPages) { console.log(`📄 Halaman ${page}: ${currentUrl}`); try { const html = await fetchPage(currentUrl); const { animes, nextPage } = parseAnimeList(html); console.log(` ✓ Dapet ${animes.length} anime`); allAnimes.push(...animes); // Batch upsert ke Firebase if (animes.length > 0) { await batchUpsertAnimes(db, animes); } currentUrl = nextPage; page++; if (currentUrl) await randomDelay(); } catch (err) { console.error(`❌ Gagal scrape halaman ${page}: ${err.message}`); break; } } console.log(`\n✅ List selesai — total ${allAnimes.length} anime discrape`); return allAnimes; } /** * Scrape detail + episode tiap anime */ async function scrapeAnimeDetails(animes) { console.log(`\n🔍 Scrape detail untuk ${animes.length} anime...`); const db = getDB(); // Process dengan concurrency limit const results = []; const queue = [...animes]; let active = 0; let done = 0; async function processNext() { if (queue.length === 0) return; const anime = queue.shift(); active++; try { console.log(` [${done + 1}/${animes.length}] ${anime.title}`); const html = await fetchPage(anime.url); const detail = parseAnimeDetail(html, anime.url); // Merge dengan data dari list const merged = { ...anime, ...detail }; await upsertAnimeDetail(db, merged); results.push(merged); } catch (err) { console.error(` ❌ ${anime.title}: ${err.message}`); } done++; active--; await randomDelay(); await processNext(); } // Jalankan N concurrent workers const workers = Array.from({ length: MAX_CONCURRENT }, () => processNext()); await Promise.all(workers); console.log(`\n✅ Detail selesai — ${results.length}/${animes.length} berhasil`); return results; } /** * Scrape episode baru saja (incremental update) * Cek Firebase → bandingkan episode count → scrape yang kurang */ async function scrapeIncrementalUpdates() { console.log("\n🔄 Incremental update — cek episode baru..."); const db = getDB(); // Ambil anime yang statusnya "Ongoing" dari Firebase const snapshot = await db .collection("animes") .where("status", "in", ["Ongoing", "ongoing", "Airing"]) .limit(50) .get(); console.log(` Dapet ${snapshot.size} anime ongoing`); const toUpdate = []; snapshot.forEach((doc) => toUpdate.push({ id: doc.id, ...doc.data() })); for (const anime of toUpdate) { try { const html = await fetchPage(anime.url); const fresh = parseAnimeDetail(html, anime.url); // Hanya update jika ada episode baru if (fresh.totalEpisodes > (anime.totalEpisodes || 0)) { console.log( ` 🆕 ${anime.title}: ${anime.totalEpisodes} → ${fresh.totalEpisodes} eps` ); await upsertAnimeDetail(db, { ...anime, ...fresh }); } await randomDelay(); } catch (err) { console.error(` ❌ ${anime.title}: ${err.message}`); } } console.log("✅ Incremental update selesai"); } // ─── FIREBASE OPERATIONS ────────────────────────────────────────────────────── /** * Batch upsert anime dasar (title, url, thumbnail, dll) * Pake slug dari URL sebagai document ID */ async function batchUpsertAnimes(db, animes) { const BATCH_SIZE = 400; // Firestore max 500 per batch for (let i = 0; i < animes.length; i += BATCH_SIZE) { const batch = db.batch(); const chunk = animes.slice(i, i + BATCH_SIZE); chunk.forEach((anime) => { const slug = extractSlug(anime.url); const ref = db.collection("animes").doc(slug); batch.set( ref, { ...anime, slug, updatedAt: new Date().toISOString(), }, { merge: true } ); }); await batch.commit(); console.log(` 💾 Saved batch ${i / BATCH_SIZE + 1}`); } } /** * Upsert detail anime lengkap beserta episodes sebagai subcollection */ async function upsertAnimeDetail(db, anime) { const slug = extractSlug(anime.url); const animeRef = db.collection("animes").doc(slug); // Pisah episodes dari main doc (biar nggak oversize) const { episodes, ...animeData } = anime; // Update main anime document await animeRef.set( { ...animeData, slug, hasDetails: true, updatedAt: new Date().toISOString(), }, { merge: true } ); // Upsert episodes ke subcollection if (episodes && episodes.length > 0) { const BATCH_SIZE = 400; for (let i = 0; i < episodes.length; i += BATCH_SIZE) { const batch = db.batch(); episodes.slice(i, i + BATCH_SIZE).forEach((ep) => { const epId = `ep-${String(ep.number || i).padStart(4, "0")}`; const epRef = animeRef.collection("episodes").doc(epId); batch.set(epRef, { ...ep, animeSlug: slug }, { merge: true }); }); await batch.commit(); } } } /** * Extract slug dari URL * e.g. https://anichin.cafe/anime/one-piece/ → one-piece */ function extractSlug(url) { try { const parts = new URL(url).pathname .split("/") .filter(Boolean); return parts[parts.length - 1] || url; } catch { return url.replace(/[^a-z0-9-]/gi, "-").toLowerCase(); } } // ─── SEARCH ─────────────────────────────────────────────────────────────────── async function scrapeSearch(query) { const searchUrl = `${BASE_URL}/?s=${encodeURIComponent(query)}`; console.log(`🔍 Search: "${query}"`); const html = await fetchPage(searchUrl); const { animes } = parseAnimeList(html); return animes; } // ─── EXPORT ─────────────────────────────────────────────────────────────────── module.exports = { scrapeAnimeList, scrapeAnimeDetails, scrapeIncrementalUpdates, scrapeSearch, fetchPage, parseAnimeList, parseAnimeDetail, parseEpisodePage, extractSlug, }; // Kalau dirun langsung: node scraper.js if (require.main === module) { (async () => { try { const animes = await scrapeAnimeList(`${BASE_URL}/anime/`, 5); await scrapeAnimeDetails(animes.slice(0, 10)); // detail 10 anime pertama } catch (err) { console.error("Fatal:", err); process.exit(1); } })(); }