tim / scraper.js
Aqso's picture
Upload scraper.js
8b75dd4 verified
// scraper.js — Anichin.cafe Gacor Scraper
// Pipeline: Scrape → Parse → Deduplicate → Upsert to Firebase
require("dotenv").config();
const axios = require("axios");
const cheerio = require("cheerio");
const { getDB } = require("./firebase");
/**
 * Read an integer setting from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, empty,
 *   not an integer, or below `min`.
 * @param {number} [min=0] - Smallest accepted value.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readIntEnv(name, fallback, min = 0) {
  const parsed = Number.parseInt(process.env[name], 10);
  return Number.isInteger(parsed) && parsed >= min ? parsed : fallback;
}

// Root of the site being scraped; also used as Referer and to build list/search URLs.
const BASE_URL = process.env.BASE_URL || "https://anichin.cafe";
// Base pause between requests, in milliseconds (random jitter is added on top).
const DELAY_MS = readIntEnv("SCRAPE_DELAY_MS", 1500, 0);
// Number of detail pages fetched in parallel; must be at least 1 or nothing gets scraped.
const MAX_CONCURRENT = readIntEnv("MAX_CONCURRENT", 3, 1);
// ─── UTILITY ─────────────────────────────────────────────────────────────────
/**
 * Return a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Pause for DELAY_MS plus a random 0-999 ms of jitter.
 *
 * @returns {Promise<void>} The promise returned by sleep().
 */
function randomDelay() {
  const jitterMs = Math.floor(Math.random() * 1000);
  return sleep(DELAY_MS + jitterMs);
}
// Pool of Chrome 121 User-Agent strings (Windows, macOS, Linux) rotated across requests.
const USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
];

/**
 * Pick one entry of USER_AGENTS at random.
 *
 * @returns {string} A User-Agent header value.
 */
function getRandomUA() {
  const index = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[index];
}
/**
 * Download a page and return the response body.
 *
 * Every attempt sends a User-Agent from getRandomUA() plus browser-like
 * headers and uses a 15 s timeout. A failed attempt is logged and retried
 * after a linearly growing pause (2 s, 4 s, 6 s, ...); the error of the final
 * attempt is rethrown.
 *
 * @param {string} url - Address to request.
 * @param {number} [retries=3] - Maximum number of attempts. Values below 1 or
 *   non-numeric values are treated as 1, so the function never resolves
 *   `undefined` without having made a request.
 * @returns {Promise<*>} The `data` field of the axios response.
 * @throws The error raised by the last failed attempt.
 */
async function fetchPage(url, retries = 3) {
  const maxAttempts = Math.max(1, Math.floor(Number(retries)) || 1);

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const res = await axios.get(url, {
        headers: {
          "User-Agent": getRandomUA(),
          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
          "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
          Referer: BASE_URL,
          Connection: "keep-alive",
        },
        timeout: 15000,
      });
      return res.data;
    } catch (err) {
      console.warn(`⚠️ Attempt ${attempt}/${maxAttempts} failed for ${url}: ${err.message}`);
      if (attempt === maxAttempts) throw err;
      // Linear backoff: wait longer after each consecutive failure.
      await sleep(2000 * attempt);
    }
  }
}
// ─── PARSERS ─────────────────────────────────────────────────────────────────
/**
 * Parse a list/catalog page into anime cards plus the next-page link.
 *
 * Cards are matched with `.bsx, .bs, article.bs`; a card is kept only when
 * both a title and a link were found. Links that are not already absolute
 * http(s) URLs (root-relative, path-relative or protocol-relative) are
 * resolved against BASE_URL, and the same is done for the pagination link so
 * the returned `nextPage` can be fetched directly.
 *
 * @param {string} html - Raw HTML of the list page.
 * @returns {{animes: Array<{title: string, url: string, thumbnail: string,
 *   score: (number|null), status: string, type: string}>,
 *   nextPage: (string|null)}} Parsed cards and the next page URL (null when
 *   no pagination link is present).
 */
function parseAnimeList(html) {
  const $ = cheerio.load(html);
  const animes = [];

  // Absolute http(s) links are returned untouched; everything else is
  // resolved the way a browser would resolve it against BASE_URL.
  const toAbsolute = (href) => {
    if (!href || href.startsWith("http")) return href;
    try {
      return new URL(href, BASE_URL).href;
    } catch {
      return `${BASE_URL}${href}`;
    }
  };

  // One iteration per anime card on the page
  $(".bsx, .bs, article.bs").each((_, el) => {
    const $el = $(el);
    const title =
      $el.find(".tt, h2, .title, a[title]").first().text().trim() ||
      $el.find("a").attr("title") ||
      "";
    const url = $el.find("a").first().attr("href") || "";
    const thumbnail =
      $el.find("img").attr("src") ||
      $el.find("img").attr("data-src") ||
      "";
    const score =
      parseFloat($el.find(".numscore, .score, .rating").text().trim()) || null;
    const status = $el.find(".statuss, .status").text().trim() || "";
    const type = $el.find(".typez, .type").text().trim() || "";

    if (title && url) {
      animes.push({
        title,
        url: toAbsolute(url),
        thumbnail,
        score,
        status,
        type,
      });
    }
  });

  // Pagination: look for the "next page" link
  const nextHref =
    $(".next.page-numbers, a.next, .navigation .next a").attr("href") || null;
  const nextPage = nextHref ? toAbsolute(nextHref) : null;

  return { animes, nextPage };
}
/**
 * Parse an anime detail page.
 *
 * Besides the fixed fields (title, synopsis, thumbnail, genres, episodes),
 * every "Key: value" span of the info box is copied onto the result under the
 * lower-cased key. The fixed fields always win over info-box keys, so a span
 * such as "Episodes: 12" can never replace the parsed `episodes` array.
 * Spans whose key is blank are ignored.
 *
 * @param {string} html - Raw HTML of the detail page.
 * @param {string} animeUrl - URL the page was fetched from; stored as `url`.
 * @returns {Object} Detail record with `title`, `url`, `synopsis`,
 *   `thumbnail`, `genres`, `totalEpisodes`, `episodes` (sorted ascending by
 *   number, unnumbered ones counted as 0), `scrapedAt` (ISO timestamp) and the
 *   info-box entries.
 */
function parseAnimeDetail(html, animeUrl) {
  const $ = cheerio.load(html);

  // Absolute http(s) links are returned untouched; everything else is
  // resolved against BASE_URL.
  const toAbsolute = (href) => {
    if (!href || href.startsWith("http")) return href;
    try {
      return new URL(href, BASE_URL).href;
    } catch {
      return `${BASE_URL}${href}`;
    }
  };

  const title =
    $(".entry-title, h1.title, .animDetail h1").first().text().trim();
  const synopsis =
    $(".entry-content p, .synops, .desc").first().text().trim();
  const thumbnail =
    $(".thumbook img, .thumb img, .poster img").attr("src") ||
    $(".thumb img").attr("data-src") ||
    "";

  // Info box (genre, studio, status, ...): "Key: value" spans
  const info = {};
  $(".infox .spe span, .infoanime span, .spe span").each((_, el) => {
    const [rawKey, ...rest] = $(el).text().split(":");
    const key = rawKey.trim().toLowerCase();
    if (key && rest.length) {
      // Re-join so values that themselves contain ":" (e.g. times) stay intact
      info[key] = rest.join(":").trim();
    }
  });

  // Genres
  const genres = [];
  $(".genxed a, .genres a, .genre a").each((_, el) => {
    const g = $(el).text().trim();
    if (g) genres.push(g);
  });

  // Episode list
  const episodes = [];
  $("#episode_by_py li, .eplister ul li, .eps li").each((_, el) => {
    const $ep = $(el);
    const epTitle = $ep.find(".epl-title, .eptitle").text().trim();
    const epUrl = $ep.find("a").attr("href") || "";
    // Prefer the explicit number cell, then the digits found in the title
    const epNum =
      parseFloat($ep.find(".epl-num").text().trim()) ||
      parseFloat(epTitle.replace(/\D/g, "")) ||
      null;
    const epDate = $ep.find(".epl-date").text().trim() || "";

    if (epUrl) {
      episodes.push({
        number: epNum,
        title: epTitle,
        url: toAbsolute(epUrl),
        date: epDate,
      });
    }
  });

  // Sort episodes ascending
  episodes.sort((a, b) => (a.number || 0) - (b.number || 0));

  return {
    // Spread first: scraped keys must not clobber the fields parsed above
    ...info,
    title,
    url: animeUrl,
    synopsis,
    thumbnail,
    genres,
    totalEpisodes: episodes.length,
    episodes,
    scrapedAt: new Date().toISOString(),
  };
}
/**
 * Parse an episode page and collect its streaming links.
 *
 * Two sources are read, in this order: the server/mirror buttons (labelled
 * with the button text, URL taken from `href` or `data-src`) and embedded
 * iframes (labelled "embed", URL taken from `src`). Entries without a URL are
 * dropped.
 *
 * @param {string} html - Raw HTML of the episode page.
 * @returns {{streamingLinks: Array<{label: string, url: string}>}}
 */
function parseEpisodePage(html) {
  const $ = cheerio.load(html);
  const streamingLinks = [];

  const addLink = (label, url) => {
    if (url) streamingLinks.push({ label, url });
  };

  // Server / mirror buttons
  $(".mirror .btn, .serverselect a, .mirrorselect a, .soraddl a").each((_, button) => {
    const $button = $(button);
    addLink(
      $button.text().trim(),
      $button.attr("href") || $button.attr("data-src") || ""
    );
  });

  // Embedded player iframes
  $("iframe[src], .entry-content iframe").each((_, frame) => {
    addLink("embed", $(frame).attr("src") || "");
  });

  return { streamingLinks };
}
// ─── SCRAPING JOBS ────────────────────────────────────────────────────────────
/**
 * Walk the paginated anime list, storing every page's cards in Firebase.
 *
 * Starting at `startUrl`, each page is fetched, parsed and (when it contains
 * cards) upserted in one batch; the crawl then follows the page's next link
 * after a random delay. It stops when there is no next link, when `maxPages`
 * pages have been processed, or at the first page that fails (the failure is
 * logged, not thrown).
 *
 * @param {string} [startUrl] - First list page; defaults to `${BASE_URL}/anime/`.
 * @param {number} [maxPages=10] - Upper bound on the number of pages visited.
 * @returns {Promise<Array<Object>>} All cards collected before the crawl ended.
 */
async function scrapeAnimeList(startUrl = `${BASE_URL}/anime/`, maxPages = 10) {
  console.log(`\nπŸš€ Mulai scrape list anime β€” max ${maxPages} halaman`);
  const db = getDB();
  const collected = [];
  let nextUrl = startUrl;

  for (let pageNo = 1; nextUrl && pageNo <= maxPages; pageNo++) {
    console.log(`πŸ“„ Halaman ${pageNo}: ${nextUrl}`);
    try {
      const markup = await fetchPage(nextUrl);
      const { animes: pageAnimes, nextPage } = parseAnimeList(markup);
      console.log(` βœ“ Dapet ${pageAnimes.length} anime`);
      collected.push(...pageAnimes);

      // Persist this page's cards before moving on
      if (pageAnimes.length > 0) {
        await batchUpsertAnimes(db, pageAnimes);
      }

      nextUrl = nextPage;
      if (nextUrl) {
        await randomDelay();
      }
    } catch (err) {
      console.error(`❌ Gagal scrape halaman ${pageNo}: ${err.message}`);
      break;
    }
  }

  console.log(`\nβœ… List selesai β€” total ${collected.length} anime discrape`);
  return collected;
}
/**
 * Fetch, parse and store the detail page (including episodes) of each anime.
 *
 * Work is pulled from a shared queue by up to MAX_CONCURRENT workers. Each
 * worker loops instead of recursing, and only pauses (random delay) when more
 * items are still waiting, so no time is wasted after the final item. A
 * failure for one anime is logged and does not stop the others.
 *
 * @param {Array<{title: string, url: string}>} animes - Cards as returned by
 *   the list scraper.
 * @returns {Promise<Array<Object>>} Merged list + detail records of the anime
 *   that were scraped and stored successfully.
 */
async function scrapeAnimeDetails(animes) {
  console.log(`\nπŸ” Scrape detail untuk ${animes.length} anime...`);
  const db = getDB();

  const results = [];
  const queue = [...animes];
  let done = 0;

  async function worker() {
    while (queue.length > 0) {
      const anime = queue.shift();
      try {
        console.log(` [${done + 1}/${animes.length}] ${anime.title}`);
        const html = await fetchPage(anime.url);
        const detail = parseAnimeDetail(html, anime.url);
        // Detail fields take precedence over the data from the list page
        const merged = { ...anime, ...detail };
        await upsertAnimeDetail(db, merged);
        results.push(merged);
      } catch (err) {
        console.error(` ❌ ${anime.title}: ${err.message}`);
      }
      done++;
      // Throttle only when this worker is going to issue another request
      if (queue.length > 0) {
        await randomDelay();
      }
    }
  }

  // Guard against a NaN/zero setting, which would start no workers at all
  const limit =
    Number.isInteger(MAX_CONCURRENT) && MAX_CONCURRENT > 0 ? MAX_CONCURRENT : 1;
  const workerCount = Math.min(limit, queue.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  console.log(`\nβœ… Detail selesai β€” ${results.length}/${animes.length} berhasil`);
  return results;
}
/**
 * Incremental update: look for new episodes of ongoing anime.
 *
 * Loads up to 50 documents from the `animes` collection whose `status` is
 * "Ongoing", "ongoing" or "Airing", re-scrapes each detail page and writes it
 * back only when the freshly parsed episode count is higher than the stored
 * one. A random delay separates consecutive anime whether or not the previous
 * one succeeded, so a failing page does not cause back-to-back requests.
 * Per-anime failures are logged and skipped.
 *
 * @returns {Promise<void>}
 */
async function scrapeIncrementalUpdates() {
  console.log("\nπŸ”„ Incremental update β€” cek episode baru...");
  const db = getDB();

  // Fetch the anime currently marked as ongoing
  const snapshot = await db
    .collection("animes")
    .where("status", "in", ["Ongoing", "ongoing", "Airing"])
    .limit(50)
    .get();
  console.log(` Dapet ${snapshot.size} anime ongoing`);

  const toUpdate = [];
  snapshot.forEach((doc) => toUpdate.push({ id: doc.id, ...doc.data() }));

  for (let index = 0; index < toUpdate.length; index++) {
    const anime = toUpdate[index];
    try {
      const html = await fetchPage(anime.url);
      const fresh = parseAnimeDetail(html, anime.url);
      const previousCount = anime.totalEpisodes || 0;

      // Only write when new episodes appeared
      if (fresh.totalEpisodes > previousCount) {
        console.log(
          ` πŸ†• ${anime.title}: ${previousCount} β†’ ${fresh.totalEpisodes} eps`
        );
        await upsertAnimeDetail(db, { ...anime, ...fresh });
      }
    } catch (err) {
      console.error(` ❌ ${anime.title}: ${err.message}`);
    }

    // Pause before the next anime, after successes and failures alike
    if (index < toUpdate.length - 1) {
      await randomDelay();
    }
  }

  console.log("βœ… Incremental update selesai");
}
// ─── FIREBASE OPERATIONS ──────────────────────────────────────────────────────
/**
 * Upsert basic anime records (title, url, thumbnail, ...) in batches.
 *
 * Each record is written with merge semantics to `animes/<slug>`, where the
 * slug is derived from the record's URL, and gets `slug` and `updatedAt`
 * fields added. Records are committed in groups of at most 400.
 *
 * @param {Object} db - Database handle exposing `batch()` and `collection()`.
 * @param {Array<Object>} animes - Records to store; each needs a `url`.
 * @returns {Promise<void>}
 */
async function batchUpsertAnimes(db, animes) {
  const BATCH_SIZE = 400; // stays below the 500-writes-per-batch limit noted by the original author
  let batchNumber = 0;

  for (let offset = 0; offset < animes.length; offset += BATCH_SIZE) {
    const batch = db.batch();

    for (const anime of animes.slice(offset, offset + BATCH_SIZE)) {
      const slug = extractSlug(anime.url);
      const payload = {
        ...anime,
        slug,
        updatedAt: new Date().toISOString(),
      };
      batch.set(db.collection("animes").doc(slug), payload, { merge: true });
    }

    await batch.commit();
    batchNumber += 1;
    console.log(` πŸ’Ύ Saved batch ${batchNumber}`);
  }
}
/**
 * Upsert a full anime record, storing its episodes in a subcollection.
 *
 * The main document `animes/<slug>` receives every field except `episodes`
 * (kept out so the document stays small), plus `slug`, `hasDetails` and
 * `updatedAt`. Each episode is then merged into
 * `animes/<slug>/episodes/<id>` in batches of at most 400.
 *
 * Episode ids: numbered episodes use `ep-<number padded to 4>`. Episodes
 * without a number get an id derived from their own URL slug (or, failing
 * that, their position in the list), so several unnumbered episodes no longer
 * overwrite each other.
 *
 * @param {Object} db - Database handle exposing `batch()` and `collection()`.
 * @param {Object} anime - Record with at least `url`; `episodes` is optional
 *   and ignored unless it is a non-empty array.
 * @returns {Promise<void>}
 */
async function upsertAnimeDetail(db, anime) {
  const slug = extractSlug(anime.url);
  const animeRef = db.collection("animes").doc(slug);

  // Keep episodes out of the main document
  const { episodes, ...animeData } = anime;

  await animeRef.set(
    {
      ...animeData,
      slug,
      hasDetails: true,
      updatedAt: new Date().toISOString(),
    },
    { merge: true }
  );

  // Anything that is not a real list (missing, or a scraped string) is skipped
  if (!Array.isArray(episodes) || episodes.length === 0) return;

  const episodeId = (ep, position) => {
    if (Number.isFinite(ep.number)) {
      return `ep-${String(ep.number).padStart(4, "0")}`;
    }
    // Strip characters that are unsafe in a document id (notably "/")
    const urlSlug = ep.url
      ? String(extractSlug(ep.url)).replace(/[^A-Za-z0-9._-]/g, "-")
      : "";
    return urlSlug
      ? `ep-${urlSlug}`
      : `ep-idx-${String(position).padStart(4, "0")}`;
  };

  const BATCH_SIZE = 400;
  for (let start = 0; start < episodes.length; start += BATCH_SIZE) {
    const batch = db.batch();
    episodes.slice(start, start + BATCH_SIZE).forEach((ep, offset) => {
      const epRef = animeRef
        .collection("episodes")
        .doc(episodeId(ep, start + offset));
      batch.set(epRef, { ...ep, animeSlug: slug }, { merge: true });
    });
    await batch.commit();
  }
}
/**
 * Derive a slug from a URL, for use as a document id.
 *
 * For a parseable URL the last non-empty path segment is returned unchanged,
 * e.g. https://anichin.cafe/anime/one-piece/ -> one-piece. When the URL has
 * no path segment, or the input cannot be parsed as a URL, the whole input is
 * sanitized instead: every character other than letters, digits and "-"
 * becomes "-" and the result is lower-cased, so the fallback never contains
 * "/" (which is not allowed in a Firestore document id).
 *
 * @param {string} url - Absolute URL (or arbitrary text).
 * @returns {string} The slug; an empty string for null/undefined input.
 */
function extractSlug(url) {
  const raw = url == null ? "" : String(url);

  try {
    const segments = new URL(raw).pathname.split("/").filter(Boolean);
    const last = segments[segments.length - 1];
    if (last) return last;
  } catch {
    // Not a parseable URL: fall through to the sanitized form below
  }

  return raw.replace(/[^a-z0-9-]/gi, "-").toLowerCase();
}
// ─── SEARCH ───────────────────────────────────────────────────────────────────
/**
 * Run a site search and return the matching anime cards.
 *
 * Fetches `${BASE_URL}/?s=<url-encoded query>` and parses the result with the
 * list-page parser; pagination of the search results is not followed.
 *
 * @param {string} query - Search text.
 * @returns {Promise<Array<Object>>} Cards found on the first result page.
 */
async function scrapeSearch(query) {
  const encodedQuery = encodeURIComponent(query);
  const searchUrl = `${BASE_URL}/?s=${encodedQuery}`;
  console.log(`πŸ” Search: "${query}"`);

  const resultsHtml = await fetchPage(searchUrl);
  return parseAnimeList(resultsHtml).animes;
}
// ─── EXPORT ───────────────────────────────────────────────────────────────────
// Public API of this module.
module.exports = {
// Scraping jobs (these read from the site and write to Firebase)
scrapeAnimeList,
scrapeAnimeDetails,
scrapeIncrementalUpdates,
// Search job (fetches and parses only; nothing is stored)
scrapeSearch,
// Lower-level helpers: page fetching, HTML parsers and slug extraction
fetchPage,
parseAnimeList,
parseAnimeDetail,
parseEpisodePage,
extractSlug,
};
// When executed directly (`node scraper.js`): scrape up to 5 list pages, then
// the details of the first 10 anime found. Any error is reported as fatal and
// the process exits with status 1.
if (require.main === module) {
  const main = async () => {
    const animes = await scrapeAnimeList(`${BASE_URL}/anime/`, 5);
    const firstTen = animes.slice(0, 10);
    await scrapeAnimeDetails(firstTen);
  };

  main().catch((err) => {
    console.error("Fatal:", err);
    process.exit(1);
  });
}