Spaces:

Aqso
/

tim

Paused

App Files Files Community

tim / scraper.js

Aqso's picture

Upload scraper.js

8b75dd4 verified 24 days ago

history blame contribute delete

13.2 kB

	// scraper.js — Anichin.cafe Gacor Scraper
	// Logika: Scrape → Parse → Deduplicate → Upsert ke Firebase

	require("dotenv").config();
	const axios = require("axios");
	const cheerio = require("cheerio");
	const { getDB } = require("./firebase");

	// Root URL of the site being scraped; override with the BASE_URL env var.
	const BASE_URL = process.env.BASE_URL || "https://anichin.cafe";

	/**
	 * Read an integer setting from the environment.
	 *
	 * @param {string} name - Environment variable name.
	 * @param {number} fallback - Value used when the variable is unset, not a
	 *   number, or below `min`.
	 * @param {number} min - Smallest accepted value.
	 * @returns {number} The parsed integer, or `fallback`.
	 */
	function readIntEnv(name, fallback, min) {
	  const parsed = Number.parseInt(process.env[name] || "", 10);
	  // NaN would otherwise leak into setTimeout() and Array.from({ length }).
	  return Number.isInteger(parsed) && parsed >= min ? parsed : fallback;
	}

	// Base delay between requests, in milliseconds (random jitter is added on top).
	const DELAY_MS = readIntEnv("SCRAPE_DELAY_MS", 1500, 0);
	// Number of detail pages fetched in parallel; must be at least 1.
	const MAX_CONCURRENT = readIntEnv("MAX_CONCURRENT", 3, 1);

	// ─── UTILITY ─────────────────────────────────────────────────────────────────

	// Resolve (with no value) once `ms` milliseconds have elapsed.
	const sleep = (ms) =>
	  new Promise((resolve) => {
	    setTimeout(resolve, ms);
	  });

	// Wait for DELAY_MS plus a random jitter of 0-999 ms.
	const randomDelay = () => {
	  const waitMs = DELAY_MS + Math.floor(Math.random() * 1000);
	  return sleep(waitMs);
	};

	// Pool of desktop browser User-Agent strings rotated across requests.
	const USER_AGENTS = [
	  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
	  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
	  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
	];

	/**
	 * Pick one entry of USER_AGENTS uniformly at random.
	 *
	 * @returns {string} A User-Agent header value.
	 */
	function getRandomUA() {
	  const pick = Math.floor(Math.random() * USER_AGENTS.length);
	  return USER_AGENTS[pick];
	}

	/**
	 * Download a page with browser-like headers, retrying on failure.
	 *
	 * @param {string} url - Absolute URL to fetch.
	 * @param {number} [retries=3] - Maximum number of attempts. Values below 1
	 *   (or non-integers) are treated as a single attempt so the function never
	 *   resolves to `undefined` without having made a request.
	 * @returns {Promise<*>} The response body (`res.data` from axios).
	 * @throws The error of the last failed attempt.
	 */
	async function fetchPage(url, retries = 3) {
	  const maxAttempts = Number.isInteger(retries) && retries > 0 ? retries : 1;

	  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
	    try {
	      const res = await axios.get(url, {
	        headers: {
	          "User-Agent": getRandomUA(),
	          // The trailing wildcard must be "*/*"; a bare "/" is not a valid media range.
	          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	          "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
	          Referer: BASE_URL,
	          Connection: "keep-alive",
	        },
	        timeout: 15000,
	      });
	      return res.data;
	    } catch (err) {
	      console.warn(`⚠️ Attempt ${attempt}/${maxAttempts} failed for ${url}: ${err.message}`);
	      if (attempt === maxAttempts) throw err;
	      // Linear backoff: 2 s after the first failure, 4 s after the second, ...
	      await sleep(2000 * attempt);
	    }
	  }
	}

	// ─── PARSERS ─────────────────────────────────────────────────────────────────

	/**
	 * Parse an anime list/catalog page.
	 *
	 * Several alternative selectors are tried for each field; per the original
	 * author's note the site usually follows a WordPress theme + plugin layout.
	 *
	 * @param {string} html - Raw HTML of the list page.
	 * @returns {{animes: Array<{title: string, url: string, thumbnail: string,
	 *   score: (number|null), status: string, type: string}>,
	 *   nextPage: (string|null)}} Parsed cards plus the href of the "next page"
	 *   link exactly as it appears in the markup, or null when there is none.
	 */
	function parseAnimeList(html) {
	  const $ = cheerio.load(html);
	  const animes = [];

	  // Absolute URLs pass through untouched; anything else is resolved against
	  // BASE_URL so hrefs without a leading slash do not get glued onto the host.
	  const toAbsolute = (href) => {
	    if (href.startsWith("http")) return href;
	    try {
	      return new URL(href, BASE_URL).href;
	    } catch {
	      return `${BASE_URL}${href}`;
	    }
	  };

	  // Selector for the anime cards on the list page
	  $(".bsx, .bs, article.bs").each((_, el) => {
	    const $el = $(el);

	    const title =
	      $el.find(".tt, h2, .title, a[title]").first().text().trim() ||
	      $el.find("a").attr("title") ||
	      "";

	    const url = $el.find("a").first().attr("href") || "";

	    const thumbnail =
	      $el.find("img").attr("src") ||
	      $el.find("img").attr("data-src") ||
	      "";

	    // An unparseable (or zero) score is stored as null.
	    const score =
	      parseFloat($el.find(".numscore, .score, .rating").text().trim()) || null;

	    const status = $el.find(".statuss, .status").text().trim() || "";

	    const type = $el.find(".typez, .type").text().trim() || "";

	    // Cards without both a title and a link are skipped.
	    if (title && url) {
	      animes.push({
	        title,
	        url: toAbsolute(url),
	        thumbnail,
	        score,
	        status,
	        type,
	      });
	    }
	  });

	  // Pagination: look for the next-page URL
	  const nextPage =
	    $(".next.page-numbers, a.next, .navigation .next a").attr("href") || null;

	  return { animes, nextPage };
	}

	/**
	 * Parse an anime detail page.
	 *
	 * @param {string} html - Raw HTML of the detail page.
	 * @param {string} animeUrl - URL the page was fetched from; stored as `url`.
	 * @returns {object} Detail record with `title`, `url`, `synopsis`,
	 *   `thumbnail`, `genres`, `totalEpisodes`, `episodes` (sorted ascending by
	 *   number) and `scrapedAt` (ISO timestamp), plus every "key: value" pair
	 *   found in the info box under its lower-cased key. Info-box pairs never
	 *   override the structured fields listed above.
	 */
	function parseAnimeDetail(html, animeUrl) {
	  const $ = cheerio.load(html);

	  // Absolute URLs pass through untouched; anything else is resolved against
	  // BASE_URL so hrefs without a leading slash do not get glued onto the host.
	  const toAbsolute = (href) => {
	    if (href.startsWith("http")) return href;
	    try {
	      return new URL(href, BASE_URL).href;
	    } catch {
	      return `${BASE_URL}${href}`;
	    }
	  };

	  const title = $(".entry-title, h1.title, .animDetail h1").first().text().trim();

	  const synopsis = $(".entry-content p, .synops, .desc").first().text().trim();

	  const thumbnail =
	    $(".thumbook img, .thumb img, .poster img").attr("src") ||
	    $(".thumb img").attr("data-src") ||
	    "";

	  // Info box (genre, studio, status, etc.): each span reads "Key: value".
	  const info = {};
	  $(".infox .spe span, .infoanime span, .spe span").each((_, el) => {
	    const [rawKey, ...rest] = $(el).text().split(":");
	    // Trim before testing so a whitespace-only label is not stored as "".
	    const key = rawKey.trim().toLowerCase();
	    if (key && rest.length) {
	      // Re-join so values that themselves contain ":" stay intact.
	      info[key] = rest.join(":").trim();
	    }
	  });

	  // Genres
	  const genres = [];
	  $(".genxed a, .genres a, .genre a").each((_, el) => {
	    const g = $(el).text().trim();
	    if (g) genres.push(g);
	  });

	  // Episode list
	  const episodes = [];
	  $("#episode_by_py li, .eplister ul li, .eps li").each((_, el) => {
	    const $ep = $(el);
	    const epTitle = $ep.find(".epl-title, .eptitle").text().trim();
	    const epUrl = $ep.find("a").attr("href") || "";
	    // Prefer the dedicated number element; otherwise use the digits of the title.
	    const epNum =
	      parseFloat($ep.find(".epl-num").text().trim()) ||
	      parseFloat(epTitle.replace(/\D/g, "")) ||
	      null;
	    const epDate = $ep.find(".epl-date").text().trim() || "";

	    // Entries without a link are skipped.
	    if (epUrl) {
	      episodes.push({
	        number: epNum,
	        title: epTitle,
	        url: toAbsolute(epUrl),
	        date: epDate,
	      });
	    }
	  });

	  // Sort episodes ascending; unnumbered ones are treated as 0.
	  episodes.sort((a, b) => (a.number || 0) - (b.number || 0));

	  return {
	    // Spread first: an info-box label such as "Episodes" or "Title" must not
	    // replace the structured fields below (a string in `episodes` would break
	    // the code that iterates over it).
	    ...info,
	    title,
	    url: animeUrl,
	    synopsis,
	    thumbnail,
	    genres,
	    totalEpisodes: episodes.length,
	    episodes,
	    scrapedAt: new Date().toISOString(),
	  };
	}

	/**
	 * Parse an episode page and collect its streaming links.
	 *
	 * @param {string} html - Raw HTML of the episode page.
	 * @returns {{streamingLinks: Array<{label: string, url: string}>}} Links
	 *   from server/mirror buttons (labelled with the button text) followed by
	 *   iframe sources (labelled "embed").
	 */
	function parseEpisodePage(html) {
	  const $ = cheerio.load(html);

	  const streamingLinks = [];

	  // Server/mirror buttons
	  $(".mirror .btn, .serverselect a, .mirrorselect a, .soraddl a").each((_, el) => {
	    const $el = $(el);
	    const label = $el.text().trim();
	    // Fall back to data-src when the element has no href.
	    const href = $el.attr("href") || $el.attr("data-src") || "";
	    if (href) streamingLinks.push({ label, url: href });
	  });

	  // iframe embed links
	  $("iframe[src], .entry-content iframe").each((_, el) => {
	    const src = $(el).attr("src") || "";
	    if (src) streamingLinks.push({ label: "embed", url: src });
	  });

	  return { streamingLinks };
	}

	// ─── SCRAPING JOBS ────────────────────────────────────────────────────────────

	/**
	 * Walk the paginated anime list, saving every page to Firebase as it goes.
	 *
	 * Stops when there is no next-page link, when `maxPages` pages have been
	 * processed, or on the first page that fails.
	 *
	 * @param {string} [startUrl] - First list page; defaults to `${BASE_URL}/anime/`.
	 * @param {number} [maxPages=10] - Upper bound on the number of pages fetched.
	 * @returns {Promise<Array<object>>} Every anime parsed across all pages.
	 */
	async function scrapeAnimeList(startUrl = `${BASE_URL}/anime/`, maxPages = 10) {
	  console.log(`\n🚀 Mulai scrape list anime — max ${maxPages} halaman`);
	  const db = getDB();
	  const collected = [];
	  let pageUrl = startUrl;
	  let pageNo = 1;

	  while (pageUrl && pageNo <= maxPages) {
	    console.log(`📄 Halaman ${pageNo}: ${pageUrl}`);

	    try {
	      const parsed = parseAnimeList(await fetchPage(pageUrl));
	      const found = parsed.animes;

	      console.log(` ✓ Dapet ${found.length} anime`);
	      collected.push(...found);

	      // Persist this page right away (batched upsert).
	      if (found.length !== 0) {
	        await batchUpsertAnimes(db, found);
	      }

	      pageUrl = parsed.nextPage;
	      pageNo += 1;

	      // Only pause when another page is about to be requested.
	      if (pageUrl) {
	        await randomDelay();
	      }
	    } catch (err) {
	      console.error(`❌ Gagal scrape halaman ${pageNo}: ${err.message}`);
	      break;
	    }
	  }

	  console.log(`\n✅ List selesai — total ${collected.length} anime discrape`);
	  return collected;
	}

	/**
	 * Scrape the detail page (including the episode list) of each anime and
	 * upsert the merged record to Firebase.
	 *
	 * Up to MAX_CONCURRENT workers pull from a shared queue; each worker pauses
	 * with randomDelay() between its own requests. A failure on one anime is
	 * logged and does not stop the others.
	 *
	 * @param {Array<{title: string, url: string}>} animes - Entries from the list scrape.
	 * @returns {Promise<Array<object>>} Merged list + detail records that were
	 *   saved successfully.
	 */
	async function scrapeAnimeDetails(animes) {
	  console.log(`\n🔍 Scrape detail untuk ${animes.length} anime...`);
	  const db = getDB();

	  const results = [];
	  const queue = [...animes];
	  let started = 0;

	  // A loop instead of self-recursion: no await chain that grows with the queue.
	  async function worker() {
	    while (queue.length > 0) {
	      const anime = queue.shift();
	      // Number items as they are dequeued so parallel workers print distinct positions.
	      started++;

	      try {
	        console.log(` [${started}/${animes.length}] ${anime.title}`);
	        const html = await fetchPage(anime.url);
	        const detail = parseAnimeDetail(html, anime.url);

	        // Merge with the data from the list page; detail fields win.
	        const merged = { ...anime, ...detail };
	        await upsertAnimeDetail(db, merged);
	        results.push(merged);
	      } catch (err) {
	        console.error(` ❌ ${anime.title}: ${err.message}`);
	      }

	      // No need to wait once there is nothing left to request.
	      if (queue.length > 0) await randomDelay();
	    }
	  }

	  // A NaN or non-positive setting would start zero workers and silently
	  // process nothing, so fall back to a single worker.
	  const workerCount =
	    Number.isInteger(MAX_CONCURRENT) && MAX_CONCURRENT > 0 ? MAX_CONCURRENT : 1;

	  // Run N concurrent workers
	  const workers = Array.from({ length: workerCount }, () => worker());
	  await Promise.all(workers);

	  console.log(`\n✅ Detail selesai — ${results.length}/${animes.length} berhasil`);
	  return results;
	}

	/**
	 * Incremental update: re-scrape ongoing anime and save those with new episodes.
	 *
	 * Reads up to 50 documents from the "animes" collection whose status is
	 * "Ongoing", "ongoing" or "Airing", re-parses each detail page, and upserts
	 * only when the fresh episode count exceeds the stored one.
	 *
	 * @returns {Promise<void>}
	 */
	async function scrapeIncrementalUpdates() {
	  console.log("\n🔄 Incremental update — cek episode baru...");
	  const db = getDB();

	  // Fetch anime whose status marks them as still airing
	  const snapshot = await db
	    .collection("animes")
	    .where("status", "in", ["Ongoing", "ongoing", "Airing"])
	    .limit(50)
	    .get();

	  console.log(` Dapet ${snapshot.size} anime ongoing`);

	  const toUpdate = [];
	  snapshot.forEach((doc) => toUpdate.push({ id: doc.id, ...doc.data() }));

	  for (const anime of toUpdate) {
	    try {
	      const html = await fetchPage(anime.url);
	      const fresh = parseAnimeDetail(html, anime.url);

	      // A missing stored count is treated as 0.
	      const knownCount = anime.totalEpisodes || 0;

	      // Only update when there are new episodes
	      if (fresh.totalEpisodes > knownCount) {
	        console.log(
	          ` 🆕 ${anime.title}: ${knownCount} → ${fresh.totalEpisodes} eps`
	        );
	        await upsertAnimeDetail(db, { ...anime, ...fresh });
	      }
	    } catch (err) {
	      console.error(` ❌ ${anime.title}: ${err.message}`);
	    }

	    // Pause after failures too, so an erroring site is not hit back-to-back.
	    await randomDelay();
	  }

	  console.log("✅ Incremental update selesai");
	}

	// ─── FIREBASE OPERATIONS ──────────────────────────────────────────────────────

	/**
	 * Upsert basic anime records (title, url, thumbnail, ...) in batches.
	 *
	 * Each record is written with merge semantics to `animes/<slug>`, where the
	 * slug is derived from the record's URL.
	 *
	 * @param {object} db - Database handle as returned by getDB().
	 * @param {Array<object>} animes - Records to write; each needs a `url`.
	 * @returns {Promise<void>}
	 */
	async function batchUpsertAnimes(db, animes) {
	  const BATCH_SIZE = 400; // Firestore allows at most 500 writes per batch
	  let batchNo = 1;

	  for (let start = 0; start < animes.length; start += BATCH_SIZE) {
	    const writer = db.batch();

	    for (const anime of animes.slice(start, start + BATCH_SIZE)) {
	      const slug = extractSlug(anime.url);
	      const target = db.collection("animes").doc(slug);
	      const payload = {
	        ...anime,
	        slug,
	        updatedAt: new Date().toISOString(),
	      };
	      writer.set(target, payload, { merge: true });
	    }

	    await writer.commit();
	    console.log(` 💾 Saved batch ${batchNo}`);
	    batchNo += 1;
	  }
	}

	/**
	 * Upsert a full anime record, storing its episodes in a subcollection.
	 *
	 * The main document goes to `animes/<slug>` (merge write, with
	 * `hasDetails: true`); each episode goes to `animes/<slug>/episodes/<id>`.
	 *
	 * @param {object} db - Database handle as returned by getDB().
	 * @param {object} anime - Detail record; `url` is required, `episodes` is
	 *   an optional array of `{ number, title, url, date }`.
	 * @returns {Promise<void>}
	 */
	async function upsertAnimeDetail(db, anime) {
	  const slug = extractSlug(anime.url);
	  const animeRef = db.collection("animes").doc(slug);

	  // Keep episodes out of the main document so it does not grow oversized.
	  const { episodes, ...animeData } = anime;

	  // Update the main anime document
	  await animeRef.set(
	    {
	      ...animeData,
	      slug,
	      hasDetails: true,
	      updatedAt: new Date().toISOString(),
	    },
	    { merge: true }
	  );

	  // Nothing more to do without a real episode list.
	  if (!Array.isArray(episodes) || episodes.length === 0) return;

	  // Upsert episodes into the subcollection
	  const BATCH_SIZE = 400;
	  for (let start = 0; start < episodes.length; start += BATCH_SIZE) {
	    const batch = db.batch();
	    episodes.slice(start, start + BATCH_SIZE).forEach((ep, offset) => {
	      // Unnumbered episodes are keyed by their position in the whole list.
	      // Using the batch start here would give every unnumbered episode of a
	      // chunk the same ID and make them overwrite one another. The separate
	      // "idx" prefix keeps these IDs from colliding with numbered episodes.
	      const hasNumber = Boolean(ep.number) || ep.number === 0;
	      const epId = hasNumber
	        ? `ep-${String(ep.number).padStart(4, "0")}`
	        : `ep-idx-${String(start + offset).padStart(4, "0")}`;
	      const epRef = animeRef.collection("episodes").doc(epId);
	      batch.set(epRef, { ...ep, animeSlug: slug }, { merge: true });
	    });
	    await batch.commit();
	  }
	}

	/**
	 * Derive a slug from a URL: the last non-empty path segment.
	 * e.g. https://anichin.cafe/anime/one-piece/ → one-piece
	 *
	 * When the input is not a parseable URL, or the URL has no path segments,
	 * the whole input is sanitized instead (every character other than
	 * letters, digits and "-" becomes "-", then lower-cased) so the result
	 * never contains "/".
	 *
	 * @param {string} url - URL (or arbitrary string) to derive the slug from.
	 * @returns {string} The slug; "" for null/undefined input.
	 */
	function extractSlug(url) {
	  const raw = url == null ? "" : String(url);
	  const sanitize = (text) => text.replace(/[^a-z0-9-]/gi, "-").toLowerCase();

	  try {
	    const parts = new URL(raw).pathname.split("/").filter(Boolean);
	    // A bare origin has no segments; returning the raw URL would put "/" in the slug.
	    return parts.length > 0 ? parts[parts.length - 1] : sanitize(raw);
	  } catch {
	    return sanitize(raw);
	  }
	}

	// ─── SEARCH ───────────────────────────────────────────────────────────────────

	/**
	 * Run a site search and return the parsed result cards.
	 *
	 * @param {string} query - Search text; URL-encoded before being sent.
	 * @returns {Promise<Array<object>>} Anime entries found on the result page.
	 */
	async function scrapeSearch(query) {
	  const encodedQuery = encodeURIComponent(query);
	  const target = `${BASE_URL}/?s=${encodedQuery}`;
	  console.log(`🔍 Search: "${query}"`);

	  const page = await fetchPage(target);
	  const parsed = parseAnimeList(page);
	  return parsed.animes;
	}

	// ─── EXPORT ───────────────────────────────────────────────────────────────────

	// Public API: the scraping jobs, plus the fetch/parse/slug helpers so they
	// can be used on their own.
	module.exports = {
	scrapeAnimeList,
	scrapeAnimeDetails,
	scrapeIncrementalUpdates,
	scrapeSearch,
	fetchPage,
	parseAnimeList,
	parseAnimeDetail,
	parseEpisodePage,
	extractSlug,
	};

	// When run directly (node scraper.js): scrape 5 list pages, then the
	// details of the first 10 anime. Any failure is reported and exits with 1.
	if (require.main === module) {
	  const runCli = async () => {
	    const animes = await scrapeAnimeList(`${BASE_URL}/anime/`, 5);
	    await scrapeAnimeDetails(animes.slice(0, 10));
	  };

	  runCli().catch((err) => {
	    console.error("Fatal:", err);
	    process.exit(1);
	  });
	}