// LT-ML/backend/controllers/search.controller.js
// WokoVN's picture
// Upload 75 files
// bb57163 verified
import braveService from '../services/brave.service.js';
import scrapeService from '../services/scrape.service.js';
import geminiService from '../services/gemini.service.js';
import pLimit from 'p-limit';
// Scrape budget: at most 5 URLs per request, and at most 3 scrapes in flight
// at once (axios is fast; the Puppeteer fallback is slow).
const MAX_SCRAPE_URLS = 5;
const SCRAPE_CONCURRENCY = 3;
const scrapeLimit = pLimit(SCRAPE_CONCURRENCY);
/**
 * Races `promise` against a timer and rejects if the timer fires first.
 *
 * The timer is cancelled as soon as the race settles, so a promise that
 * finishes early does not leave a pending timeout scheduled for the rest of
 * `ms`. The underlying work is NOT cancelled on timeout; only the wait is
 * abandoned.
 *
 * @param {Promise<*>} promise - Work to wait for.
 * @param {number} ms - Maximum time to wait, in milliseconds.
 * @param {string} label - Name included in the timeout error message.
 * @returns {Promise<*>} Settles like `promise`, or rejects with
 *   `Error("Timeout after <ms>ms: <label>")` when the deadline passes first.
 */
const withTimeout = (promise, ms, label) => {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(
      () => reject(new Error(`Timeout after ${ms}ms: ${label}`)),
      ms
    );
  });
  // Always release the timer, whichever side of the race won.
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
};
// Upper bound, in milliseconds, for the whole scrape phase of scrapeAndSummarize.
const SCRAPE_TIMEOUT_MS = 90000;

/**
 * Collects the optional search parameters that were actually supplied.
 *
 * @param {*} language - Passed through as `options.language` when truthy.
 * @param {*} freshness - Passed through as `options.freshness` when truthy.
 * @returns {object} Options object containing only the supplied keys.
 */
const buildSearchOptions = (language, freshness) => {
  const options = {};
  if (language) options.language = language;
  if (freshness) options.freshness = freshness;
  return options;
};

/**
 * Builds the article record handed to the summarizer from one scrape result.
 * Throws when `scraped` is nullish or `url` cannot be parsed by `new URL`;
 * callers treat that as a failed scrape.
 *
 * @param {object} scraped - Scrape result; `title` and `text` are read.
 * @param {string} url - Address the content was scraped from.
 * @param {string} fallbackTitle - Title used when the scrape has none.
 * @returns {{title: string, source: string, content: string, url: string}}
 */
const buildArticle = (scraped, url, fallbackTitle) => ({
  title: scraped.title || fallbackTitle,
  source: new URL(url).hostname,
  content: scraped.text || '',
  url,
});

/**
 * Logs a summarization failure and, for the failure modes that have a
 * dedicated client response, sends that response.
 *
 * @param {*} err - Value thrown by the summarizer call.
 * @param {object} res - Response object (`status(...).json(...)` is used).
 * @returns {boolean} `true` when a response was sent; `false` when the
 *   caller should rethrow `err`.
 */
const respondToGeminiError = (err, res) => {
  // Thrown values are not guaranteed to carry a string `message`.
  const message = String(err?.message ?? '');

  console.error('[GEMINI API] ERROR:', err?.message);
  console.error('[GEMINI API] Status:', err?.response?.status);
  console.error('[GEMINI API] Details:', err?.response?.data);

  if (message.includes('PROHIBITED_CONTENT')) {
    res.status(400).json({
      error: 'Prohibited content',
      api: 'Google Gemini API',
      details: 'Nội dung không phù hợp. Gemini AI đã chặn nội dung này vì vi phạm tiêu chuẩn an toàn.'
    });
    return true;
  }

  if (err?.response?.status === 429 || message.includes('429')) {
    res.status(429).json({
      error: 'Rate limit exceeded',
      api: 'Google Gemini API',
      details: 'Vuot qua gioi han API Gemini. Vui long thu lai sau it phut.'
    });
    return true;
  }

  return false;
};

/**
 * HTTP handlers for search, search-and-summarize, and scrape-and-summarize.
 *
 * None of the methods read `this`, so they keep working when passed around
 * unbound.
 */
class SearchController {
  /**
   * Runs a search and returns the search service's result as JSON.
   *
   * Responds 400 when `query` is missing (including when the request has no
   * body) and 500 when the search call fails.
   *
   * @param {object} req - Request; reads `body.query`, `body.language`, `body.freshness`.
   * @param {object} res - Response.
   */
  async search(req, res) {
    try {
      const { query, language, freshness } = req.body ?? {};
      if (!query) {
        return res.status(400).json({ error: 'Query is required' });
      }

      const data = await braveService.search(
        query,
        buildSearchOptions(language, freshness)
      );
      res.json(data);
    } catch (error) {
      console.error('Search Error:', error.message);
      res.status(500).json({
        error: 'Failed to perform search',
        details: error.response?.data || error.message
      });
    }
  }

  /**
   * Searches, scrapes the top results and returns one combined summary.
   *
   * Responses: 400 (missing query or prohibited content), 404 (no result
   * URLs), 429 (search or summarizer rate limit), 500 (nothing could be
   * scraped, or any other failure).
   *
   * @param {object} req - Request; reads `body.query`, `body.language`, `body.freshness`.
   * @param {object} res - Response.
   */
  async searchAndSummarize(req, res) {
    try {
      const { query, language, freshness } = req.body ?? {};
      if (!query) {
        return res.status(400).json({ error: 'Query is required' });
      }
      const options = buildSearchOptions(language, freshness);

      // 1. Search with Brave
      console.log('[BRAVE API] Starting search request...');
      let searchData;
      try {
        searchData = await braveService.search(query, options);
        console.log('[BRAVE API] Search successful, found results');
      } catch (err) {
        console.error('[BRAVE API] ERROR:', err.message);
        console.error('[BRAVE API] Status:', err.response?.status);
        console.error('[BRAVE API] Details:', err.response?.data);
        if (err.response?.status === 429) {
          return res.status(429).json({
            error: 'Rate limit exceeded',
            api: 'Brave Search API',
            details: 'Vuot qua gioi han API Brave Search. Vui long thu lai sau it phut.'
          });
        }
        throw err;
      }

      // 2. Pick URLs from the results (news first, web results as fallback)
      const newsResults = searchData.news?.results;
      const webResults = searchData.web?.results;
      let urls = [];
      if (newsResults && newsResults.length > 0) {
        urls = newsResults.slice(0, MAX_SCRAPE_URLS).map((r) => r.url);
      } else if (webResults && webResults.length > 0) {
        urls = webResults
          .filter((r) => r.type === 'search_result')
          .slice(0, MAX_SCRAPE_URLS)
          .map((r) => r.url);
      }
      if (urls.length === 0) {
        return res.status(404).json({ error: 'Khong tim thay ket qua nao' });
      }

      // 3. Scrape every URL through the shared limiter, so this endpoint obeys
      //    the same concurrency cap as scrapeAndSummarize. A failed scrape
      //    yields null and is dropped rather than failing the request.
      console.log(`[PUPPETEER] Scraping ${urls.length} articles...`);
      const scrapeResults = await Promise.all(
        urls.map((url, index) =>
          scrapeLimit(async () => {
            try {
              const scraped = await scrapeService.scrapeUrl(url);
              console.log(`[PUPPETEER] Successfully scraped: ${url}`);
              return buildArticle(scraped, url, `Bai ${index + 1}`);
            } catch (err) {
              console.error(`[PUPPETEER] Failed to scrape ${url}:`, err.message);
              return null;
            }
          })
        )
      );
      const articles = scrapeResults.filter((a) => a !== null);
      console.log(`[PUPPETEER] Successfully scraped ${articles.length}/${urls.length} articles`);
      if (articles.length === 0) {
        return res.status(500).json({ error: 'Khong the scrape duoc bai bao nao' });
      }

      // 4. Send everything to Gemini for summarization
      console.log(`[GEMINI API] Starting summarization of ${articles.length} articles...`);
      let summary;
      try {
        summary = await geminiService.summarizeMultipleNews(articles, query);
        console.log('[GEMINI API] Summarization successful');
      } catch (err) {
        if (respondToGeminiError(err, res)) return;
        throw err;
      }

      console.log('[SUCCESS] Request completed successfully');
      res.json({
        summary: summary.summary,
        totalArticles: summary.totalArticles,
        // Scraped content is not echoed back, only the article metadata.
        articles: articles.map(({ title, source, url }) => ({ title, source, url }))
      });
    } catch (error) {
      console.error('[ERROR] Search and Summarize Error:', error.message);
      console.error('[ERROR] Stack:', error.stack);
      res.status(500).json({
        error: 'Failed to search and summarize',
        details: error.response?.data?.error || error.message
      });
    }
  }

  /**
   * Scrapes caller-supplied URLs and returns one combined summary.
   *
   * Entries of `body.urls` that are not non-empty strings are ignored; the
   * remainder is capped at MAX_SCRAPE_URLS. Responses: 400 (no usable URLs or
   * prohibited content), 429 (summarizer rate limit), 500 (nothing could be
   * scraped, the scrape phase exceeded SCRAPE_TIMEOUT_MS, or any other
   * failure).
   *
   * @param {object} req - Request; reads `body.urls` (array) and optional `body.query`.
   * @param {object} res - Response.
   */
  async scrapeAndSummarize(req, res) {
    const tTotal = Date.now();
    try {
      const { urls: rawUrls, query } = req.body ?? {};
      if (!Array.isArray(rawUrls) || rawUrls.length === 0) {
        return res.status(400).json({ error: 'URLs array is required' });
      }

      // The body is untrusted: anything that is not a non-empty string would
      // crash the string handling below, so reject it up front.
      const usableUrls = rawUrls.filter(
        (u) => typeof u === 'string' && u.trim() !== ''
      );
      if (usableUrls.length === 0) {
        return res.status(400).json({ error: 'URLs must be non-empty strings' });
      }

      // Cap to MAX_SCRAPE_URLS to avoid long waits
      const urls = usableUrls.slice(0, MAX_SCRAPE_URLS);
      const queryPreview = typeof query === 'string' ? query.substring(0, 40) : query;

      console.log(`\n${'='.repeat(60)}`);
      console.log(`[REQUEST] scrapeAndSummarize — ${urls.length} URLs, query="${queryPreview}"`);
      urls.forEach((u, i) => console.log(` [${i+1}] ${u.substring(0, 80)}`));

      // 1. Scrape URLs through the shared limiter, bounded by an overall
      //    SCRAPE_TIMEOUT_MS (90s) deadline. On timeout the in-flight scrapes
      //    are not cancelled; the request just stops waiting for them.
      const tScrapeStart = Date.now();
      console.log(`[SCRAPE] ⏳ Starting scrape of ${urls.length} URLs (concurrency: 3)...`);
      const scrapeWork = Promise.all(
        urls.map((url, index) =>
          scrapeLimit(async () => {
            const t = Date.now();
            try {
              const scraped = await scrapeService.scrapeUrl(url);
              console.log(`[SCRAPE] ✅ [${index+1}/${urls.length}] ${Date.now()-t}ms — ${url.substring(0, 60)}`);
              return buildArticle(scraped, url, `Bài ${index + 1}`);
            } catch (err) {
              console.error(`[SCRAPE] ❌ [${index+1}/${urls.length}] ${Date.now()-t}ms — ${url.substring(0, 60)}: ${err.message}`);
              return null;
            }
          })
        )
      );
      const rawArticles = await withTimeout(scrapeWork, SCRAPE_TIMEOUT_MS, 'scrapeAndSummarize');
      const articles = rawArticles.filter((a) => a !== null);
      console.log(`[SCRAPE] 🏁 Done: ${articles.length}/${urls.length} OK in ${Date.now()-tScrapeStart}ms (total elapsed: ${Date.now()-tTotal}ms)`);
      if (articles.length === 0) {
        return res.status(500).json({ error: 'Khong the scrape duoc bai bao nao' });
      }

      // 2. Send to Gemini
      const tGeminiStart = Date.now();
      console.log(`[GEMINI] ⏳ Summarizing ${articles.length} articles...`);
      let summary;
      try {
        summary = await geminiService.summarizeMultipleNews(articles, query || '');
        console.log(`[GEMINI] ✅ Done in ${Date.now()-tGeminiStart}ms (total elapsed: ${Date.now()-tTotal}ms)`);
      } catch (err) {
        if (respondToGeminiError(err, res)) return;
        throw err;
      }

      console.log(`[SUCCESS] 🏁 scrapeAndSummarize done in ${Date.now()-tTotal}ms total`);
      console.log('='.repeat(60));
      res.json({
        summary: summary.summary,
        totalArticles: summary.totalArticles
      });
    } catch (error) {
      console.error('[ERROR] Scrape and Summarize Error:', error.message);
      console.error('[ERROR] Stack:', error.stack);
      res.status(500).json({
        error: 'Failed to scrape and summarize',
        details: error.message
      });
    }
  }
}
// Export one shared controller instance; none of the methods above read `this`.
export default new SearchController();