vyles committed on
Commit
c6f192a
·
verified ·
1 Parent(s): d38c2dc

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +22 -0
  2. index.js +419 -0
  3. package.json +22 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM node:18-alpine

# CHROME_BIN is read by index.js and passed to Playwright as executablePath,
# so the Chromium package installed below is the browser that gets launched.
# NOTE(review): DEBIAN_FRONTEND looks unused on an Alpine (apk) base image;
# kept unchanged -- confirm before removing.
ENV CHROME_BIN=/usr/bin/chromium \
    TZ=Asia/Jakarta \
    DEBIAN_FRONTEND=noninteractive

RUN apk add --no-cache \
    chromium \
    nss \
    libc6-compat \
    glib \
    libgcc \
    libstdc++ \
    font-noto \
    fontconfig \
    ca-certificates

WORKDIR /app

# WORKDIR does not define a $WORKDIR variable, so the destinations are written
# explicitly; relative paths resolve against the WORKDIR set above.
# Copy the manifests first so the dependency layer is cached across code edits.
COPY package*.json ./
RUN npm i
COPY . .

EXPOSE 7860
CMD ["node", "."]
index.js ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import express from 'express';
2
+ import { chromium } from 'playwright';
3
+ import cors from 'cors';
4
+ import bodyParser from 'body-parser';
5
+
6
const app = express();
// `||` (not `??`) on purpose: an empty PORT variable should also fall back.
const PORT = process.env.PORT || 7860;

// Application middleware
app.set('json spaces', 2);
app.use(bodyParser.urlencoded({ extended: true }));
// Exactly one JSON body parser. A second `express.json({ limit: '500mb' })`
// used to be registered after this one, but it never took effect: the first
// parser either parses the body (and later parsers skip already-parsed
// requests) or rejects it for exceeding its own default limit. Raise the limit
// here if larger request bodies are ever needed.
app.use(bodyParser.json());
app.use(cors());
15
+
16
+ // Configuration constants
17
+ const DEFAULT_TIMEOUT = 30000; // passed to Playwright as the page default, navigation and goto timeout
18
+ const MAX_CONCURRENT_PAGES = 3; // number of extraction slots used by processUrlsConcurrently
19
+ const CONTENT_SELECTORS = [ // tried in this order when looking for the main content element
20
+ 'main', 'article', '[role="main"]', '.content', '#content',
21
+ '.post-content', '.entry-content', '.article-content', '.page-content',
22
+ '.main-content', '[itemprop="articleBody"]', '.story-body', '.article-body',
23
+ '.detail__body-text', '.detail__body', '.itp_bodycontent'
24
+ ];
25
+
26
+ // Resource-blocking configuration
27
+ const BLOCKED_RESOURCES = ['font', 'media', 'websocket', 'image']; // compared against request.resourceType()
28
+ const BLOCKED_PATTERNS = [ // matched as plain substrings of the lower-cased request URL
29
+ 'google-analytics', 'doubleclick', 'facebook', 'twitter',
30
+ 'analytics', 'ads', 'tracking', 'pixel'
31
+ ];
32
/**
 * Installs a catch-all route on the page that aborts unnecessary requests.
 *
 * A request is aborted when its resource type is listed in BLOCKED_RESOURCES
 * or when its URL contains one of the BLOCKED_PATTERNS substrings
 * (case-insensitive). Every other request continues unchanged.
 *
 * The main frame's own navigation request is never aborted: the patterns are
 * plain substrings ("ads" also matches "/downloads/"), so without this
 * exemption the target page itself could be blocked and `page.goto` would fail.
 *
 * @param {import('playwright').Page} page - Playwright page to configure
 * @returns {Promise<void>} Resolves once the route handler is registered
 */
async function setupOptimizedRouting(page) {
  // True only for the navigation request of the page's top-level frame.
  const isMainFrameNavigation = (request) => {
    if (!request.isNavigationRequest()) {
      return false;
    }
    try {
      return request.frame() === page.mainFrame();
    } catch {
      // The frame is not always available for a request; treat that as
      // "not the main frame" so the normal blocking rules apply.
      return false;
    }
  };

  await page.route('**/*', async (route) => {
    const request = route.request();
    const resourceType = request.resourceType();
    const requestUrl = request.url().toLowerCase();

    const isBlockedResource = BLOCKED_RESOURCES.includes(resourceType);
    const isBlockedPattern = BLOCKED_PATTERNS.some((pattern) => requestUrl.includes(pattern));
    const shouldAbort =
      !isMainFrameNavigation(request) && (isBlockedResource || isBlockedPattern);

    try {
      await (shouldAbort ? route.abort() : route.continue());
    } catch {
      // abort()/continue() reject when the request can no longer be handled,
      // typically because the page or context was closed while it was pending.
      // Awaiting and ignoring keeps that from becoming an unhandled rejection.
    }
  });
}
54
+
55
/**
 * Scrolls the page down in fixed steps on a timer, then jumps back to the top.
 * Intended to trigger lazily loaded content before extraction.
 *
 * Scrolling stops as soon as the accumulated distance reaches the current
 * `document.body.scrollHeight` or `distance * maxScrolls`, whichever comes
 * first. The height is re-read on every tick, so a page that grows while being
 * scrolled keeps scrolling until the step limit.
 *
 * @param {import('playwright').Page} page - Playwright page to scroll
 * @param {Object} [options]
 * @param {number} [options.distance=300] - Pixels scrolled per step
 * @param {number} [options.maxScrolls=20] - Maximum number of steps
 * @param {number} [options.delay=50] - Milliseconds between steps
 * @returns {Promise<void>} Resolves after the page has been scrolled back to the top
 */
async function optimizedScroll(page, { distance = 300, maxScrolls = 20, delay = 50 } = {}) {
  // The callback runs inside the page, so the settings travel as its argument.
  await page.evaluate(({ step, limit, interval }) => {
    return new Promise((resolve) => {
      let scrolled = 0;

      const timer = setInterval(() => {
        // A document without <body> has nothing to scroll. Treating its height
        // as 0 ends the loop on the first tick; dereferencing a null body here
        // would throw on every tick and the promise would never settle.
        const scrollHeight = document.body?.scrollHeight ?? 0;
        window.scrollBy(0, step);
        scrolled += step;

        if (scrolled >= scrollHeight || scrolled >= step * limit) {
          clearInterval(timer);
          window.scrollTo(0, 0);
          resolve();
        }
      }, interval);
    });
  }, { step: distance, limit: maxScrolls, interval: delay });
}
82
+
83
/**
 * Loads a URL in the given Playwright page and extracts its main textual content.
 *
 * Steps: apply the default timeouts, install request blocking, navigate
 * (retrying once with `waitUntil: 'load'` if the first attempt fails), wait
 * briefly for the body and for any known content container, scroll the page,
 * then collect title, meta description, headings, paragraphs and main text
 * inside the page.
 *
 * This function never throws: any failure is reported as `success: false`.
 *
 * @param {string} url - URL to extract content from
 * @param {import('playwright').Page} page - Playwright page to use
 * @returns {Promise<Object>} `{ url, success: true, content, extractedAt }` on
 *   success, `{ url, success: false, error, extractedAt }` on failure
 */
async function extractContentFromUrl(url, page) {
  try {
    page.setDefaultNavigationTimeout(DEFAULT_TIMEOUT);
    page.setDefaultTimeout(DEFAULT_TIMEOUT);

    await setupOptimizedRouting(page);

    // Navigate to the target page; on failure retry once waiting for `load`.
    try {
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: DEFAULT_TIMEOUT
      });
    } catch {
      await page.goto(url, {
        waitUntil: 'load',
        timeout: DEFAULT_TIMEOUT
      });
    }

    // Best-effort wait for a visible body, with a shorter timeout.
    try {
      await page.waitForSelector('body', { state: 'visible', timeout: 5000 });
    } catch {
      // Deliberately ignored: extraction is still attempted without a
      // visible body and reports its own failure if the page is unusable.
    }

    // Give the page a moment to settle.
    await page.waitForTimeout(1500);

    // Wait up to 2 s for ANY known content container, not just the first
    // selector in the list. 'attached' matches how extraction looks elements up
    // (document.querySelector). A timeout is expected on pages that use none of
    // the selectors; extraction has its own fallback for those.
    await page
      .waitForSelector(CONTENT_SELECTORS.join(', '), { state: 'attached', timeout: 2000 })
      .catch(() => {});

    // Scroll to trigger lazily loaded content.
    await optimizedScroll(page);
    await page.waitForTimeout(500);

    // Extract the content. This callback runs inside the page.
    const content = await page.evaluate((selectors) => {
      const cleanText = (text) => text ? text.replace(/\s+/g, ' ').trim() : '';
      // Text of an element with script/style/navigation chrome removed.
      const getTextContent = (element) => {
        if (!element) return '';
        const clone = element.cloneNode(true);
        clone.querySelectorAll('script, style, noscript, iframe, nav, header, footer, aside').forEach(el => el.remove());
        return cleanText(clone.textContent || clone.innerText || '');
      };

      const title = document.title || '';
      const metaDescription = document.querySelector('meta[name="description"]')?.content ||
        document.querySelector('meta[property="og:description"]')?.content || '';

      const headings = {
        h1: Array.from(document.querySelectorAll('h1')).map(h1 => getTextContent(h1)).filter(Boolean).slice(0, 5),
        h2: Array.from(document.querySelectorAll('h2')).map(h2 => getTextContent(h2)).filter(Boolean).slice(0, 10)
      };

      const paragraphs = Array.from(document.querySelectorAll('p'))
        .map(p => getTextContent(p))
        .filter(text => text.length > 20)
        .slice(0, 50);

      // Find the main content element: first selector with enough text wins.
      let mainContent = null;
      for (const selector of selectors) {
        const element = document.querySelector(selector);
        if (element && getTextContent(element).length > 100) {
          mainContent = element;
          break;
        }
      }

      // Fallback when no selector matched: the container with the most text.
      if (!mainContent) {
        const contentCandidates = Array.from(document.querySelectorAll('div, section, article'))
          .map(el => ({ element: el, text: getTextContent(el) }))
          .filter(candidate => candidate.text.length > 200)
          .sort((a, b) => b.text.length - a.text.length);

        mainContent = contentCandidates[0]?.element || document.body;
      }

      const mainText = getTextContent(mainContent);
      // document.body can be null (documents without <body>), hence the `?.`.
      const allText = mainText || paragraphs.slice(0, 20).join(' ') || document.body?.innerText || '';
      const wordCount = allText.split(/\s+/).filter(word => word.length > 0).length;

      return {
        title: title.trim(),
        metaDescription: metaDescription.trim(),
        headings,
        paragraphs: paragraphs.slice(0, 20),
        mainText: mainText.slice(0, 10000),
        wordCount,
        hasContent: wordCount > 50,
        url: window.location.href
      };
    }, CONTENT_SELECTORS);

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString()
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString()
    };
  }
}
201
+
202
+ // Route API
203
/**
 * Splits the input into URLs that are acceptable for extraction and those
 * that are not.
 *
 * A value is valid only if it is a string that parses as an absolute URL with
 * the `http:` or `https:` scheme. Other schemes (`file:`, `javascript:`,
 * `data:`, ...) are rejected because every accepted URL is opened in a
 * headless browser on the server.
 * NOTE(review): http(s) URLs that point at internal hosts are still accepted;
 * restrict hosts separately if this service can reach private networks.
 *
 * @param {Array<*>} urls - Candidate URLs, in request order
 * @returns {{validUrls: string[], invalidUrls: Array<*>}} Both lists keep the
 *   input order and contain the original values
 */
function validateUrls(urls) {
  const validUrls = [];
  const invalidUrls = [];

  for (const url of urls) {
    (isHttpUrl(url) ? validUrls : invalidUrls).push(url);
  }

  return { validUrls, invalidUrls };
}

// True when `value` is a string that parses as an absolute http(s) URL.
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}
224
+
225
/**
 * Extracts content from several URLs with at most MAX_CONCURRENT_PAGES
 * extractions running at the same time.
 *
 * Each URL is processed in its own browser context, which is closed afterwards.
 * Results are stored by URL index, so the returned array has exactly one entry
 * per input URL, in input order. (Storing them by worker slot would let later
 * URLs overwrite earlier results once there are more URLs than slots.)
 *
 * @param {string[]} urls - URLs to process
 * @param {import('playwright').Browser} browser - Playwright browser instance
 * @returns {Promise<Object[]>} One extraction result per URL; failures are
 *   reported as `{ url, success: false, error, extractedAt }`
 */
async function processUrlsConcurrently(urls, browser) {
  const contextOptions = {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    viewport: { width: 1920, height: 1080 },
    locale: 'en-US',
    timezoneId: 'America/New_York',
    ignoreHTTPSErrors: true
  };

  const results = new Array(urls.length);
  let nextIndex = 0;

  // Each worker repeatedly claims the next unprocessed index. Claiming is
  // synchronous, so two workers can never take the same URL.
  const worker = async () => {
    while (nextIndex < urls.length) {
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await extractInIsolatedContext(urls[index], browser, contextOptions);
    }
  };

  const workerCount = Math.min(MAX_CONCURRENT_PAGES, urls.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  return results;
}

// Runs one extraction in a fresh browser context and always closes that context.
async function extractInIsolatedContext(url, browser, contextOptions) {
  let context;
  try {
    context = await browser.newContext(contextOptions);
    const page = await context.newPage();
    return await extractContentFromUrl(url, page);
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString()
    };
  } finally {
    if (context) {
      // The outcome for this URL is already decided; a failure while closing
      // the context must not replace it or abort the other workers.
      await context.close().catch(() => {});
    }
  }
}
296
+
297
/**
 * POST /extract-content
 *
 * Expects a JSON body `{ "urls": [...] }` with 1 to 10 entries. Responds with
 * 400 when the array is missing, empty, too long, or contains an entry rejected
 * by validateUrls. Otherwise launches a browser for this request, extracts
 * every URL, and responds with per-URL results plus summary statistics. The
 * browser is always closed afterwards.
 */
app.post('/extract-content', async (req, res) => {
  const { urls } = req.body;

  // Input validation
  if (!urls || !Array.isArray(urls) || urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls yang tidak kosong.'
    });
  }

  if (urls.length > 10) {
    return res.status(400).json({
      success: false,
      message: 'Maksimal 10 URLs per request.'
    });
  }

  const { validUrls, invalidUrls } = validateUrls(urls);

  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid.',
      invalidUrls
    });
  }

  let browser;
  try {
    browser = await chromium.launch({
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        '--disable-blink-features=AutomationControlled',
        // NOTE(review): this flag and the next weaken browser isolation while
        // loading arbitrary user-supplied pages; confirm they are required.
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding'
      ],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    // Process the URLs with bounded concurrency.
    const results = await processUrlsConcurrently(validUrls, browser);
    const successCount = results.filter(r => r.success).length;
    const failCount = results.filter(r => !r.success).length;
    const emptyContentCount = results.filter(r => r.success && (!r.content.hasContent || r.content.wordCount < 50)).length;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs.`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount,
        emptyContent: emptyContentCount
      },
      results
    });
  } catch (error) {
    res.status(500).json({
      success: false,
      message: 'Terjadi kesalahan saat memproses URLs.',
      error: error.message
    });
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent. Letting this escape the async
        // handler would surface as an unhandled promise rejection, so it is
        // logged instead.
        console.error('Failed to close browser:', closeError);
      }
    }
  }
});
375
/**
 * GET /
 *
 * Returns a short JSON description of the API: a name, the request hostname
 * prefixed with "https://", and the list of available endpoints.
 */
app.get('/', (req, res) => {
  const endpoints = {
    'POST /extract-content': 'Ekstrak konten dari URL',
    'GET /': 'Informasi API'
  };

  const payload = {
    success: true,
    message: 'API Ekstrak Konten',
    hostname: `https://${req.hostname}`,
    endpoints
  };

  res.json(payload);
});
390
/**
 * Handler for requests that matched no route (404).
 * Responds in the same JSON shape as the other error responses.
 */
app.use((req, res) => {
  res.status(404).json({
    success: false,
    message: 'Endpoint tidak ditemukan.'
  });
});

/**
 * Global error-handling middleware. Registered last so that it also covers
 * errors raised by the 404 handler above.
 *
 * Uses the HTTP status carried by the error when it is a valid 4xx/5xx code
 * (for example the 400 attached to a malformed JSON body) and falls back to
 * 500 otherwise. The error message is always included in the response.
 */
app.use((err, req, res, next) => {
  // Once the response has started it cannot be replaced; delegate to
  // Express's default handler, which closes the connection.
  if (res.headersSent) {
    return next(err);
  }

  const reported = err.status ?? err.statusCode;
  const status = Number.isInteger(reported) && reported >= 400 && reported < 600
    ? reported
    : 500;

  res.status(status).json({
    success: false,
    message: 'Terjadi kesalahan internal server.',
    error: err.message
  });
});
413
+ /**
414
+ * Starts the Express server on the configured port (PORT)
415
+ * Logs a message once the server is listening
416
+ */
417
+ app.listen(PORT, () => {
418
+ console.log(`🚀 API berjalan pada port ${PORT}`);
419
+ });
package.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "content-extractor",
3
+ "version": "1.0.0",
4
+ "description": "API untuk mengekstrak konten dari multiple URL",
5
+ "main": "index.js",
6
+ "type": "module",
7
+ "scripts": {
8
+ "start": "node index.js"
9
+ },
10
+ "author": "Vyles",
11
+ "license": "MIT",
12
+ "dependencies": {
13
+ "body-parser": "^1.20.2",
14
+ "cors": "^2.8.5",
15
+ "express": "^4.18.2",
16
+ "morgan": "^1.10.0",
17
+ "path-to-regexp": "^6.2.1",
18
+ "pdfkit": "^0.13.0",
19
+ "playwright": "^1.40.0",
20
+ "serve-favicon": "^2.5.0"
21
+ }
22
+ }