| | |
| | |
| |
|
| | require("dotenv").config(); |
| | const axios = require("axios"); |
| | const cheerio = require("cheerio"); |
| | const { getDB } = require("./firebase"); |
| |
|
// Scraper configuration — all values overridable via environment variables.
const BASE_URL = process.env.BASE_URL || "https://anichin.cafe";
// Base delay between requests in ms; random jitter is added on top (see randomDelay).
// FIX: parseInt now gets an explicit radix so values like "0x.." or leading zeros
// cannot be misparsed.
const DELAY_MS = parseInt(process.env.SCRAPE_DELAY_MS || "1500", 10);
// Number of parallel workers used when scraping detail pages.
const MAX_CONCURRENT = parseInt(process.env.MAX_CONCURRENT || "3", 10);
| |
|
| | |
| |
|
| | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); |
| |
|
// Sleep for the base delay plus up to 1s of random jitter so request
// timing does not look machine-regular to the target site.
function randomDelay() {
  const jitter = Math.floor(Math.random() * 1000);
  return sleep(DELAY_MS + jitter);
}
| |
|
// Pool of desktop Chrome user-agent strings; rotating them makes the
// scraper's traffic look less uniform.
const USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
];

// Pick one user-agent from the pool uniformly at random.
function getRandomUA() {
  const index = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[index];
}
| |
|
/**
 * GET `url` and return the raw response body, retrying on failure.
 *
 * Sends browser-like headers with a rotating user-agent. Between failed
 * attempts it waits a linearly increasing backoff (2s * attempt); the
 * final failure's error is rethrown to the caller.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [retries=3] - Total number of attempts.
 * @returns {Promise<string>} Response body (HTML).
 * @throws The last axios error if every attempt fails.
 */
async function fetchPage(url, retries = 3) {
  let attempt = 0;
  while (attempt < retries) {
    attempt++;
    try {
      const response = await axios.get(url, {
        headers: {
          "User-Agent": getRandomUA(),
          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
          Referer: BASE_URL,
          Connection: "keep-alive",
        },
        timeout: 15000,
      });
      return response.data;
    } catch (err) {
      console.warn(`⚠️ Attempt ${attempt}/${retries} failed for ${url}: ${err.message}`);
      if (attempt === retries) throw err;
      await sleep(2000 * attempt); // linear backoff before the next try
    }
  }
}
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
/**
 * Parse an anime list/search-results page into summary records.
 *
 * Selector lists cover several theme variants of the target site, so a
 * card is matched by whichever wrapper/class the current theme uses.
 *
 * @param {string} html - Listing page HTML.
 * @returns {{animes: Array<object>, nextPage: string|null}} Summaries plus
 *   the absolute URL of the next page (null when there is none).
 */
function parseAnimeList(html) {
  const $ = cheerio.load(html);
  const animes = [];

  $(".bsx, .bs, article.bs").each((_, el) => {
    const $el = $(el);

    const title =
      $el.find(".tt, h2, .title, a[title]").first().text().trim() ||
      $el.find("a").attr("title") ||
      "";

    const url =
      $el.find("a").first().attr("href") || "";

    // Lazy-loaded images keep the real URL in data-src.
    const thumbnail =
      $el.find("img").attr("src") ||
      $el.find("img").attr("data-src") ||
      "";

    // Missing/unparseable score becomes null rather than NaN.
    const score =
      parseFloat($el.find(".numscore, .score, .rating").text().trim()) || null;

    const status =
      $el.find(".statuss, .status").text().trim() || "";

    const type =
      $el.find(".typez, .type").text().trim() || "";

    if (title && url) {
      animes.push({
        title,
        url: url.startsWith("http") ? url : `${BASE_URL}${url}`,
        thumbnail,
        score,
        status,
        type,
      });
    }
  });

  // FIX: normalize the pagination href to an absolute URL, matching the
  // treatment of anime URLs above — previously a relative "next" href was
  // returned raw and would break callers that fetch it directly.
  const rawNext =
    $(".next.page-numbers, a.next, .navigation .next a").attr("href") || null;
  const nextPage =
    rawNext && !rawNext.startsWith("http") ? `${BASE_URL}${rawNext}` : rawNext;

  return { animes, nextPage };
}
| |
|
| | |
| | |
| | |
/**
 * Parse an anime detail page into a structured record.
 *
 * Metadata rows ("Status: Ongoing", "Studio: X", ...) are flattened into
 * lowercase keys and spread onto the result AFTER the explicit fields,
 * so a same-named metadata key overrides them — presumably intentional;
 * verify against consumers.
 *
 * @param {string} html - Detail page HTML.
 * @param {string} animeUrl - Canonical URL of the page (stored as `url`).
 * @returns {object} Detail record including a number-sorted `episodes` array.
 */
function parseAnimeDetail(html, animeUrl) {
  const $ = cheerio.load(html);

  const title =
    $(".entry-title, h1.title, .animDetail h1").first().text().trim();

  const synopsis =
    $(".entry-content p, .synops, .desc").first().text().trim();

  // Lazy-loaded posters keep the real URL in data-src.
  const thumbnail =
    $(".thumbook img, .thumb img, .poster img").attr("src") ||
    $(".thumb img").attr("data-src") ||
    "";

  // "Key: Value" metadata rows → info object with lowercase keys.
  const info = {};
  $(".infox .spe span, .infoanime span, .spe span").each((_, el) => {
    const text = $(el).text();
    const [key, ...val] = text.split(":");
    if (key && val.length) {
      // Re-join on ":" so values containing colons (e.g. URLs) survive.
      info[key.trim().toLowerCase()] = val.join(":").trim();
    }
  });

  const genres = [];
  $(".genxed a, .genres a, .genre a").each((_, el) => {
    const g = $(el).text().trim();
    if (g) genres.push(g);
  });

  const episodes = [];
  $("#episode_by_py li, .eplister ul li, .eps li").each((_, el) => {
    const $ep = $(el);
    const epTitle = $ep.find(".epl-title, .eptitle").text().trim();
    const epUrl = $ep.find("a").attr("href") || "";
    // BUG FIX: the old fallback stripped every non-digit
    // (epTitle.replace(/\D/g, "")), which concatenated unrelated digit runs
    // ("Episode 12 (2024)" → 122024) and destroyed decimals
    // ("Episode 1.5" → 15). Take the first numeric token instead.
    const titleNum = (epTitle.match(/\d+(?:\.\d+)?/) || [])[0];
    const epNum =
      parseFloat($ep.find(".epl-num").text().trim()) ||
      parseFloat(titleNum) ||
      null;
    const epDate = $ep.find(".epl-date").text().trim() || "";

    if (epUrl) {
      episodes.push({
        number: epNum,
        title: epTitle,
        url: epUrl.startsWith("http") ? epUrl : `${BASE_URL}${epUrl}`,
        date: epDate,
      });
    }
  });

  // Ascending by episode number; unnumbered episodes (null) sort first.
  episodes.sort((a, b) => (a.number || 0) - (b.number || 0));

  return {
    title,
    url: animeUrl,
    synopsis,
    thumbnail,
    genres,
    totalEpisodes: episodes.length,
    episodes,
    ...info,
    scrapedAt: new Date().toISOString(),
  };
}
| |
|
| | |
| | |
| | |
/**
 * Extract streaming links from an episode page: mirror/server selector
 * buttons plus any embedded player iframes.
 *
 * @param {string} html - Episode page HTML.
 * @returns {{streamingLinks: Array<{label: string, url: string}>}}
 */
function parseEpisodePage(html) {
  const $ = cheerio.load(html);
  const streamingLinks = [];

  // Mirror/server buttons; the URL may live in href or data-src.
  $(".mirror .btn, .serverselect a, .mirrorselect a, .soraddl a").each((_, el) => {
    const $link = $(el);
    const target = $link.attr("href") || $link.attr("data-src") || "";
    if (target) {
      streamingLinks.push({ label: $link.text().trim(), url: target });
    }
  });

  // Direct player embeds.
  $("iframe[src], .entry-content iframe").each((_, el) => {
    const embedSrc = $(el).attr("src") || "";
    if (embedSrc) {
      streamingLinks.push({ label: "embed", url: embedSrc });
    }
  });

  return { streamingLinks };
}
| |
|
| | |
| |
|
| | |
| | |
| | |
/**
 * Crawl the paginated anime list starting at `startUrl`, persisting each
 * page's results to Firestore as it goes (so a mid-crawl failure keeps
 * everything scraped so far).
 *
 * @param {string} [startUrl] - First list page (defaults to BASE_URL/anime/).
 * @param {number} [maxPages=10] - Hard cap on the number of pages visited.
 * @returns {Promise<Array<object>>} All anime summaries collected.
 */
async function scrapeAnimeList(startUrl = `${BASE_URL}/anime/`, maxPages = 10) {
  console.log(`\n🚀 Mulai scrape list anime — max ${maxPages} halaman`);
  const db = getDB();
  const allAnimes = [];

  let nextUrl = startUrl;
  for (let page = 1; nextUrl && page <= maxPages; page++) {
    console.log(`📄 Halaman ${page}: ${nextUrl}`);

    try {
      const html = await fetchPage(nextUrl);
      const { animes, nextPage } = parseAnimeList(html);

      console.log(`  → Dapet ${animes.length} anime`);
      allAnimes.push(...animes);

      // Persist page-by-page instead of all at once at the end.
      if (animes.length > 0) {
        await batchUpsertAnimes(db, animes);
      }

      nextUrl = nextPage;

      // Be polite between page fetches (skip after the last page).
      if (nextUrl) await randomDelay();
    } catch (err) {
      console.error(`❌ Gagal scrape halaman ${page}: ${err.message}`);
      break;
    }
  }

  console.log(`\n✅ List selesai — total ${allAnimes.length} anime discrape`);
  return allAnimes;
}
| |
|
| | |
| | |
| | |
/**
 * Fetch and persist detail pages for the given anime summaries using a
 * small pool of concurrent workers (MAX_CONCURRENT).
 *
 * Individual failures are logged and skipped; the function resolves with
 * only the successfully scraped records.
 *
 * FIXES vs. the previous version:
 *  - workers drain the queue in a loop instead of awaited recursion, which
 *    kept an unbounded chain of pending frames alive for long queues;
 *  - removed the `active` counter, which was written but never read;
 *  - the politeness delay is skipped once the queue is empty.
 *
 * @param {Array<object>} animes - Summaries (need at least title + url).
 * @returns {Promise<Array<object>>} Merged summary+detail records.
 */
async function scrapeAnimeDetails(animes) {
  console.log(`\n🔍 Scrape detail untuk ${animes.length} anime...`);
  const db = getDB();

  const results = [];
  const queue = [...animes];
  let done = 0;

  // Each worker repeatedly pulls the next anime off the shared queue.
  async function worker() {
    while (queue.length > 0) {
      const anime = queue.shift();

      try {
        console.log(` [${done + 1}/${animes.length}] ${anime.title}`);
        const html = await fetchPage(anime.url);
        const detail = parseAnimeDetail(html, anime.url);

        // Detail fields win over the (possibly stale) list fields.
        const merged = { ...anime, ...detail };
        await upsertAnimeDetail(db, merged);
        results.push(merged);
      } catch (err) {
        console.error(` ❌ ${anime.title}: ${err.message}`);
      }

      done++;
      // Throttle only when there is more work left.
      if (queue.length > 0) await randomDelay();
    }
  }

  const workers = Array.from({ length: MAX_CONCURRENT }, () => worker());
  await Promise.all(workers);

  console.log(`\n✅ Detail selesai — ${results.length}/${animes.length} berhasil`);
  return results;
}
| |
|
| | |
| | |
| | |
| | |
/**
 * Re-scrape up to 50 ongoing anime and persist any whose episode count
 * grew since the last scrape. Runs sequentially with a polite delay
 * between requests; per-anime failures are logged and skipped.
 */
async function scrapeIncrementalUpdates() {
  console.log("\n🔄 Incremental update — cek episode baru...");
  const db = getDB();

  // Only ongoing/airing shows can gain new episodes.
  const snapshot = await db
    .collection("animes")
    .where("status", "in", ["Ongoing", "ongoing", "Airing"])
    .limit(50)
    .get();

  console.log(` Dapet ${snapshot.size} anime ongoing`);

  const candidates = [];
  snapshot.forEach((doc) => candidates.push({ id: doc.id, ...doc.data() }));

  for (const anime of candidates) {
    try {
      const html = await fetchPage(anime.url);
      const fresh = parseAnimeDetail(html, anime.url);

      // Write back only when the episode count actually increased.
      if (fresh.totalEpisodes > (anime.totalEpisodes || 0)) {
        console.log(
          ` 🆕 ${anime.title}: ${anime.totalEpisodes} → ${fresh.totalEpisodes} eps`
        );
        await upsertAnimeDetail(db, { ...anime, ...fresh });
      }

      await randomDelay();
    } catch (err) {
      console.error(` ❌ ${anime.title}: ${err.message}`);
    }
  }

  console.log("✅ Incremental update selesai");
}
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
/**
 * Upsert anime summaries into the `animes` collection using Firestore
 * batched writes, 400 per commit (under the 500-operation batch limit).
 * Documents are keyed by the slug derived from each anime's URL and
 * merged, so existing fields are preserved.
 *
 * @param {FirebaseFirestore.Firestore} db
 * @param {Array<object>} animes - Summaries with at least a `url`.
 */
async function batchUpsertAnimes(db, animes) {
  const BATCH_SIZE = 400;
  let batchNo = 0;

  for (let start = 0; start < animes.length; start += BATCH_SIZE) {
    const batch = db.batch();

    for (const anime of animes.slice(start, start + BATCH_SIZE)) {
      const slug = extractSlug(anime.url);
      const ref = db.collection("animes").doc(slug);
      batch.set(
        ref,
        {
          ...anime,
          slug,
          updatedAt: new Date().toISOString(),
        },
        { merge: true }
      );
    }

    await batch.commit();
    batchNo++;
    console.log(` 💾 Saved batch ${batchNo}`);
  }
}
| |
|
| | |
| | |
| | |
/**
 * Upsert one anime's detail document plus its `episodes` subcollection.
 *
 * The parent doc is merged (episodes stripped off first); episodes are
 * written in Firestore batches of 400 and keyed `ep-NNNN` by episode
 * number, falling back to the episode's list position when the number
 * could not be parsed.
 *
 * @param {FirebaseFirestore.Firestore} db
 * @param {object} anime - Merged summary+detail record (may carry `episodes`).
 */
async function upsertAnimeDetail(db, anime) {
  const slug = extractSlug(anime.url);
  const animeRef = db.collection("animes").doc(slug);

  // Episodes live in a subcollection, not on the parent document.
  const { episodes, ...animeData } = anime;

  await animeRef.set(
    {
      ...animeData,
      slug,
      hasDetails: true,
      updatedAt: new Date().toISOString(),
    },
    { merge: true }
  );

  if (episodes && episodes.length > 0) {
    const BATCH_SIZE = 400;
    for (let i = 0; i < episodes.length; i += BATCH_SIZE) {
      const batch = db.batch();
      episodes.slice(i, i + BATCH_SIZE).forEach((ep, offset) => {
        // BUG FIX: the old fallback was `ep.number || i`, the CHUNK start
        // index — every unnumbered episode in a chunk collided on the same
        // doc ID and overwrote the others. Use the episode's own global
        // position, and `??` so a legitimate number 0 is not discarded.
        const fallbackId = i + offset;
        const epId = `ep-${String(ep.number ?? fallbackId).padStart(4, "0")}`;
        const epRef = animeRef.collection("episodes").doc(epId);
        batch.set(epRef, { ...ep, animeSlug: slug }, { merge: true });
      });
      await batch.commit();
    }
  }
}
| |
|
| | |
| | |
| | |
| | |
/**
 * Derive a Firestore document ID from a URL: the last non-empty path
 * segment. When the input is not a parseable URL, it is sanitized into
 * slug characters instead; a URL with an empty path falls back to the
 * raw input.
 *
 * @param {string} url
 * @returns {string} Document-ID-safe slug.
 */
function extractSlug(url) {
  try {
    const segments = new URL(url).pathname.split("/").filter(Boolean);
    return segments.pop() || url;
  } catch {
    // Not a valid URL — replace everything outside [a-z0-9-] and lowercase.
    return url.replace(/[^a-z0-9-]/gi, "-").toLowerCase();
  }
}
| |
|
| | |
| |
|
/**
 * Scrape the site's search results for `query`.
 *
 * @param {string} query - Raw search text (URL-encoded here).
 * @returns {Promise<Array<object>>} Matching anime summaries.
 */
async function scrapeSearch(query) {
  const searchUrl = `${BASE_URL}/?s=${encodeURIComponent(query)}`;
  console.log(`🔍 Search: "${query}"`);

  const { animes } = parseAnimeList(await fetchPage(searchUrl));
  return animes;
}
| |
|
| | |
| |
|
// Public API: the high-level scrapers plus the low-level fetch/parse
// helpers (exported so they can be unit-tested and reused individually).
module.exports = {
  scrapeAnimeList,
  scrapeAnimeDetails,
  scrapeIncrementalUpdates,
  scrapeSearch,
  fetchPage,
  parseAnimeList,
  parseAnimeDetail,
  parseEpisodePage,
  extractSlug,
};
| |
|
| | |
// CLI entry point: `node <this file>` runs a small crawl — up to 5 list
// pages, then detail scrapes for the first 10 anime found.
if (require.main === module) {
  (async () => {
    try {
      const animes = await scrapeAnimeList(`${BASE_URL}/anime/`, 5);
      await scrapeAnimeDetails(animes.slice(0, 10));
    } catch (err) {
      // Non-zero exit so cron/schedulers register the failure.
      console.error("Fatal:", err);
      process.exit(1);
    }
  })();
}
| |
|