Kraft102 committed on
Commit
5ec082b
·
1 Parent(s): e523b2e

feat(extension): add google auth check and advanced crawl options (recursive, directory, keywords)

Browse files
apps/backend/src/mcp/ingestionHandlers.ts CHANGED
@@ -72,32 +72,39 @@ export async function ingestionStartHandler(params: any): Promise<any> {
72
 
73
  export async function ingestionCrawlHandler(params: any): Promise<any> {
74
  await initializeAdapters();
75
- const url = params.url;
76
-
77
  if (!url) {
78
  return { success: false, error: 'URL is required' };
79
  }
80
 
81
  try {
82
- // Force the Web Crawler adapter to handle this URL
83
- // In a real system, the engine might route based on pattern, but here we invoke directly or rely on the engine identifying it's a URL
84
- // Since we registered WebCrawler, ingestionEngine.ingestFrom(url) should work if the adapter's fetch handles it.
85
- // However, standard ingestFrom typically takes an adapter ID or similar if it's scanning.
86
- // But WebCrawlerAdapter.fetch takes a source string.
87
- // We need to ensure dataIngestionEngine routes "http..." strings to WebCrawlerAdapter.
88
-
89
- // For now, we can manually invoke the adapter if we can access it, OR simpler:
90
- // We trust ingestFrom to iterate adapters or we specifically use the crawler instance if we exposed it.
91
- // Let's assume ingestFrom(url) works if we modified DataIngestionEngine to handle specific inputs,
92
- // OR we just perform the crawl here directly using a new instance if needed, but better to go through engine.
93
-
94
- const count = await dataIngestionEngine.ingestFrom(url);
 
 
 
 
 
 
95
  return {
96
  success: true,
97
- message: `Crawled and ingested ${url}`,
98
  count
99
  };
100
  } catch (error: any) {
 
101
  return { success: false, error: error.message };
102
  }
103
  }
@@ -106,7 +113,7 @@ export async function ingestionHarvestHandler(params: any): Promise<any> {
106
  // Trigger the autonomous harvester
107
  // We don't await the full process to prevent timeout, but we start it.
108
  autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
109
-
110
  return {
111
  success: true,
112
  message: 'Autonomous Harvest Sequence Initiated',
@@ -119,7 +126,7 @@ export async function ingestionStatusHandler(params: any): Promise<any> {
119
  // Get real stats from Vector Store
120
  const vectorStore = getNeo4jVectorStore();
121
  const stats = await vectorStore.getStatistics();
122
-
123
  const engineStatus = dataIngestionEngine.getStatus();
124
 
125
  return {
 
72
 
73
  export async function ingestionCrawlHandler(params: any): Promise<any> {
74
  await initializeAdapters();
75
+ const { url, mode, depth, keywords } = params;
76
+
77
  if (!url) {
78
  return { success: false, error: 'URL is required' };
79
  }
80
 
81
  try {
82
+ const crawler = dataIngestionEngine.getAdapter('Web Crawler') as WebCrawlerAdapter;
83
+
84
+ if (!crawler) {
85
+ return { success: false, error: 'Web Crawler adapter not initialized' };
86
+ }
87
+
88
+ // Configure crawler with request parameters
89
+ crawler.configure({
90
+ mode: mode || 'single',
91
+ maxDepth: depth !== undefined ? depth : 1,
92
+ keywords: keywords || []
93
+ });
94
+
95
+ // Add URL to crawler queue
96
+ crawler.addUrls([url]);
97
+
98
+ // Trigger specific ingestion for the crawler
99
+ const count = await dataIngestionEngine.ingestFrom('Web Crawler');
100
+
101
  return {
102
  success: true,
103
+ message: `Crawled ${url} (Mode: ${mode || 'single'}, Depth: ${depth}, Keywords: ${keywords?.length || 0})`,
104
  count
105
  };
106
  } catch (error: any) {
107
+ console.error('Crawl handler error:', error);
108
  return { success: false, error: error.message };
109
  }
110
  }
 
113
  // Trigger the autonomous harvester
114
  // We don't await the full process to prevent timeout, but we start it.
115
  autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
116
+
117
  return {
118
  success: true,
119
  message: 'Autonomous Harvest Sequence Initiated',
 
126
  // Get real stats from Vector Store
127
  const vectorStore = getNeo4jVectorStore();
128
  const stats = await vectorStore.getStatistics();
129
+
130
  const engineStatus = dataIngestionEngine.getStatus();
131
 
132
  return {
apps/backend/src/services/ingestion/WebCrawlerAdapter.ts CHANGED
@@ -1,6 +1,13 @@
1
  import axios from 'axios';
2
  import * as cheerio from 'cheerio';
3
  import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
 
 
 
 
 
 
 
4
 
5
  export class WebCrawlerAdapter implements DataSourceAdapter {
6
  name = 'Web Crawler';
@@ -8,27 +15,92 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
8
 
9
  private visitedUrls: Set<string> = new Set();
10
  private urlsToFetch: string[] = [];
 
 
 
 
 
11
 
12
- constructor() {}
 
 
 
 
 
 
13
 
14
  /** Add URLs to fetch queue */
15
  addUrls(urls: string[]): void {
16
- this.urlsToFetch = urls.filter(u => u.startsWith('http'));
 
17
  }
18
 
19
  async fetch(): Promise<any[]> {
20
  const results: any[] = [];
21
- for (const url of this.urlsToFetch) {
 
 
 
 
 
 
 
 
 
22
  if (this.visitedUrls.has(url)) continue;
23
  this.visitedUrls.add(url);
24
 
25
- console.log(`🕷️ Crawling: ${url}`);
 
26
  try {
27
  const response = await axios.get(url, {
28
  timeout: 10000,
29
- headers: { 'User-Agent': 'WidgeTDC-Crawler/1.0' }
30
  });
31
- results.push({ url, html: response.data });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  } catch (error: any) {
33
  console.error(`Crawl failed for ${url}:`, error.message);
34
  }
@@ -57,7 +129,8 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
57
  metadata: {
58
  url: item.url,
59
  description: description,
60
- crawledAt: new Date().toISOString()
 
61
  },
62
  timestamp: new Date()
63
  };
 
1
  import axios from 'axios';
2
  import * as cheerio from 'cheerio';
3
  import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
4
+ import { URL } from 'url';
5
+
6
/** Options controlling how the web crawler traverses and filters pages. */
interface CrawlerConfig {
  // 'single' fetches only the queued URL(s); 'directory' additionally follows
  // same-host links under the seed URL's directory path; 'recursive' follows
  // any same-host link.
  mode: 'single' | 'directory' | 'recursive';
  // Maximum link depth to follow from a seed URL (seeds start at depth 0).
  maxDepth: number;
  // When non-empty, only pages whose body text contains at least one of
  // these keywords (case-insensitive) are kept.
  keywords: string[];
}
11
 
12
  export class WebCrawlerAdapter implements DataSourceAdapter {
13
  name = 'Web Crawler';
 
15
 
16
  private visitedUrls: Set<string> = new Set();
17
  private urlsToFetch: string[] = [];
18
+ private config: CrawlerConfig = {
19
+ mode: 'single',
20
+ maxDepth: 1,
21
+ keywords: []
22
+ };
23
 
24
+ constructor() { }
25
+
26
+ configure(config: Partial<CrawlerConfig>) {
27
+ if (config.mode) this.config.mode = config.mode;
28
+ if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
29
+ if (config.keywords) this.config.keywords = config.keywords;
30
+ }
31
 
32
  /** Add URLs to fetch queue */
33
  addUrls(urls: string[]): void {
34
+ const validUrls = urls.filter(u => u.startsWith('http'));
35
+ this.urlsToFetch.push(...validUrls);
36
  }
37
 
38
  async fetch(): Promise<any[]> {
39
  const results: any[] = [];
40
+ const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));
41
+
42
+ // Clear initial queue as we are processing it now
43
+ this.urlsToFetch = [];
44
+
45
+ while (queue.length > 0) {
46
+ const item = queue.shift();
47
+ if (!item) break;
48
+ const { url, depth } = item;
49
+
50
  if (this.visitedUrls.has(url)) continue;
51
  this.visitedUrls.add(url);
52
 
53
+ console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);
54
+
55
  try {
56
  const response = await axios.get(url, {
57
  timeout: 10000,
58
+ headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
59
  });
60
+
61
+ const html = response.data;
62
+ const $ = cheerio.load(html);
63
+ const textContent = $('body').text().toLowerCase();
64
+
65
+ // Check keywords if configured
66
+ if (this.config.keywords.length > 0) {
67
+ const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase()));
68
+ if (!hasKeyword) {
69
+ console.log(`⏭️ Skipping ${url} - no matching keywords`);
70
+ continue;
71
+ }
72
+ }
73
+
74
+ results.push({ url, html });
75
+
76
+ // Find matching links if we haven't reached max depth
77
+ if (depth < this.config.maxDepth && this.config.mode !== 'single') {
78
+ const baseUrl = new URL(url);
79
+ const links = $('a[href]').map((_, el) => $(el).attr('href')).get();
80
+
81
+ for (const link of links) {
82
+ try {
83
+ const absoluteUrl = new URL(link, url).toString();
84
+
85
+ // Domain constraint
86
+ if (new URL(absoluteUrl).hostname !== baseUrl.hostname) continue;
87
+
88
+ // Mode constraint: Directory
89
+ if (this.config.mode === 'directory') {
90
+ const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
91
+ const linkPath = new URL(absoluteUrl).pathname;
92
+ if (!linkPath.startsWith(basePath)) continue;
93
+ }
94
+
95
+ if (!this.visitedUrls.has(absoluteUrl)) {
96
+ queue.push({ url: absoluteUrl, depth: depth + 1 });
97
+ }
98
+ } catch (e) {
99
+ // Invalid URL, ignore
100
+ }
101
+ }
102
+ }
103
+
104
  } catch (error: any) {
105
  console.error(`Crawl failed for ${url}:`, error.message);
106
  }
 
129
  metadata: {
130
  url: item.url,
131
  description: description,
132
+ crawledAt: new Date().toISOString(),
133
+ keywordsMatched: this.config.keywords
134
  },
135
  timestamp: new Date()
136
  };
browser-extension/background.js CHANGED
@@ -34,7 +34,7 @@ async function handleIngestion(data) {
34
  try {
35
  // Option 1: Send raw HTML (heavy)
36
  // Option 2: Send URL and let backend crawl (cleaner) -> We use this via ingestion.crawl
37
-
38
  const response = await fetch(`${API_BASE}/api/mcp/route`, {
39
  method: 'POST',
40
  headers: { 'Content-Type': 'application/json' },
@@ -68,7 +68,10 @@ async function handleCrawl(data) {
68
  tool: 'ingestion.crawl',
69
  params: {
70
  url: url,
71
- depth: depth || 2
 
 
 
72
  }
73
  })
74
  });
 
34
  try {
35
  // Option 1: Send raw HTML (heavy)
36
  // Option 2: Send URL and let backend crawl (cleaner) -> We use this via ingestion.crawl
37
+
38
  const response = await fetch(`${API_BASE}/api/mcp/route`, {
39
  method: 'POST',
40
  headers: { 'Content-Type': 'application/json' },
 
68
  tool: 'ingestion.crawl',
69
  params: {
70
  url: url,
71
+ depth: depth || 1,
72
+ mode: data.mode || 'single',
73
+ keywords: data.keywords || [],
74
+ user: data.user
75
  }
76
  })
77
  });
browser-extension/manifest.json CHANGED
@@ -7,10 +7,12 @@
7
  "activeTab",
8
  "storage",
9
  "contextMenus",
10
- "tabs"
 
11
  ],
12
  "host_permissions": [
13
  "http://localhost:3000/*",
 
14
  "https://*.widgetdc.com/*"
15
  ],
16
  "background": {
 
7
  "activeTab",
8
  "storage",
9
  "contextMenus",
10
+ "tabs",
11
+ "identity"
12
  ],
13
  "host_permissions": [
14
  "http://localhost:3000/*",
15
+ "http://localhost:3001/*",
16
  "https://*.widgetdc.com/*"
17
  ],
18
  "background": {
browser-extension/popup.html CHANGED
@@ -113,24 +113,59 @@
113
  </div>
114
 
115
  <div class="content">
116
- <div class="card">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  <div id="currentUrl" class="url-display">Loading...</div>
118
 
119
  <button id="btnIngest" class="btn-primary">
120
  <span>📥</span> Ingest Current Page
121
  </button>
122
 
123
- <div class="range-container">
124
- <label>Crawl Depth: <span id="depthVal">1</span></label>
125
- <input type="range" id="depthRange" min="1" max="3" value="1">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  </div>
127
 
128
  <button id="btnCrawl" class="btn-secondary">
129
- <span>🕸️</span> Crawl Site
130
  </button>
131
  </div>
132
 
133
- <div class="card">
134
  <label>Auto-Monitoring</label>
135
  <button id="btnMonitor" class="btn-secondary">
136
  <span>👀</span> Add to Watch Feed
 
113
  </div>
114
 
115
  <div class="content">
116
+ <!-- Auth Status -->
117
+ <div id="authSection" class="card" style="display:none; border-color: #3b82f6;">
118
+ <div style="display:flex; align-items:center; gap:8px;">
119
+ <span id="userAvatar" style="font-size:16px;">👤</span>
120
+ <span id="userEmail" style="font-size:12px; color:#cbd5e1;">Detecting user...</span>
121
+ </div>
122
+ </div>
123
+
124
+ <div id="loginWarning" class="card" style="border-color: #ef4444; background: rgba(239,68,68,0.1);">
125
+ <div style="font-size: 12px; text-align: center;">
126
+ Please sign in to Chrome to use this extension.
127
+ </div>
128
+ </div>
129
+
130
+ <div class="card" id="mainControls" style="opacity: 0.5; pointer-events: none;">
131
  <div id="currentUrl" class="url-display">Loading...</div>
132
 
133
  <button id="btnIngest" class="btn-primary">
134
  <span>📥</span> Ingest Current Page
135
  </button>
136
 
137
+ <div style="margin: 12px 0; border-top: 1px solid rgba(255,255,255,0.1); padding-top: 12px;">
138
+ <label style="color: #60a5fa; font-weight: bold; margin-bottom: 8px;">Advanced Crawl</label>
139
+
140
+ <!-- Mode Selection -->
141
+ <div style="margin-bottom: 10px;">
142
+ <label>Mode</label>
143
+ <select id="crawlMode" style="width: 100%; padding: 6px; background: rgba(0,0,0,0.3); color: white; border: 1px solid rgba(255,255,255,0.2); border-radius: 4px;">
144
+ <option value="single">Single Page</option>
145
+ <option value="directory">This Directory Only</option>
146
+ <option value="recursive">Recursive (Deep)</option>
147
+ </select>
148
+ </div>
149
+
150
+ <!-- Keywords -->
151
+ <div style="margin-bottom: 10px;">
152
+ <label>Keywords (comma separated)</label>
153
+ <input type="text" id="crawlKeywords" placeholder="e.g. documentation, api, guide" style="width: 100%; padding: 6px; background: rgba(0,0,0,0.3); color: white; border: 1px solid rgba(255,255,255,0.2); border-radius: 4px; box-sizing: border-box;">
154
+ </div>
155
+
156
+ <!-- Depth -->
157
+ <div class="range-container">
158
+ <label>Depth Limit: <span id="depthVal">1</span></label>
159
+ <input type="range" id="depthRange" min="1" max="5" value="1">
160
+ </div>
161
  </div>
162
 
163
  <button id="btnCrawl" class="btn-secondary">
164
+ <span>🕸️</span> Start Crawl
165
  </button>
166
  </div>
167
 
168
+ <div class="card" id="monitorControls" style="opacity: 0.5; pointer-events: none;">
169
  <label>Auto-Monitoring</label>
170
  <button id="btnMonitor" class="btn-secondary">
171
  <span>👀</span> Add to Watch Feed
browser-extension/popup.js CHANGED
@@ -8,13 +8,46 @@ document.addEventListener('DOMContentLoaded', async () => {
8
  const depthVal = document.getElementById('depthVal');
9
  const statusMsg = document.getElementById('statusMsg');
10
 
11
- // Get current tab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
13
  if (tab?.url) {
14
  currentUrlEl.textContent = tab.url;
15
  }
16
 
17
- // Check connection
18
  chrome.runtime.sendMessage({ action: 'check_connection' }, (response) => {
19
  if (response && response.success) {
20
  statusDot.classList.add('connected');
@@ -30,9 +63,13 @@ document.addEventListener('DOMContentLoaded', async () => {
30
  // Handle Ingest
31
  btnIngest.addEventListener('click', () => {
32
  setStatus('Processing...', 'info');
33
- chrome.runtime.sendMessage({
34
- action: 'ingest_page',
35
- data: { url: tab.url, title: tab.title }
 
 
 
 
36
  }, (response) => {
37
  if (response && response.success) {
38
  setStatus('Page ingested successfully', 'success');
@@ -44,13 +81,21 @@ document.addEventListener('DOMContentLoaded', async () => {
44
 
45
  // Handle Crawl
46
  btnCrawl.addEventListener('click', () => {
 
 
47
  setStatus('Starting crawler...', 'info');
48
- chrome.runtime.sendMessage({
49
- action: 'crawl_site',
50
- data: { url: tab.url, depth: parseInt(depthRange.value) }
 
 
 
 
 
 
51
  }, (response) => {
52
  if (response && response.success) {
53
- setStatus(`Crawl started (Depth: ${depthRange.value})`, 'success');
54
  } else {
55
  setStatus('Crawl failed to start', 'error');
56
  }
 
8
  const depthVal = document.getElementById('depthVal');
9
  const statusMsg = document.getElementById('statusMsg');
10
 
11
+ // Auth Elements
12
+ const authSection = document.getElementById('authSection');
13
+ const userEmail = document.getElementById('userEmail');
14
+ const loginWarning = document.getElementById('loginWarning');
15
+ const mainControls = document.getElementById('mainControls');
16
+ const monitorControls = document.getElementById('monitorControls');
17
+
18
+ // Crawl Options
19
+ const crawlMode = document.getElementById('crawlMode');
20
+ const crawlKeywords = document.getElementById('crawlKeywords');
21
+
22
+ let currentUser = null;
23
+
24
+ // 1. Check Identity
25
+ chrome.identity.getProfileUserInfo({ accountStatus: 'ANY' }, (userInfo) => {
26
+ if (userInfo.email) {
27
+ currentUser = userInfo;
28
+ authSection.style.display = 'block';
29
+ userEmail.textContent = userInfo.email;
30
+ loginWarning.style.display = 'none';
31
+ mainControls.style.opacity = '1';
32
+ mainControls.style.pointerEvents = 'auto';
33
+ monitorControls.style.opacity = '1';
34
+ monitorControls.style.pointerEvents = 'auto';
35
+ } else {
36
+ // Not logged in
37
+ mainControls.style.opacity = '0.5';
38
+ mainControls.style.pointerEvents = 'none';
39
+ monitorControls.style.opacity = '0.5';
40
+ monitorControls.style.pointerEvents = 'none';
41
+ }
42
+ });
43
+
44
+ // 2. Get current tab
45
  const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
46
  if (tab?.url) {
47
  currentUrlEl.textContent = tab.url;
48
  }
49
 
50
+ // 3. Check connection
51
  chrome.runtime.sendMessage({ action: 'check_connection' }, (response) => {
52
  if (response && response.success) {
53
  statusDot.classList.add('connected');
 
63
  // Handle Ingest
64
  btnIngest.addEventListener('click', () => {
65
  setStatus('Processing...', 'info');
66
+ chrome.runtime.sendMessage({
67
+ action: 'ingest_page',
68
+ data: {
69
+ url: tab.url,
70
+ title: tab.title,
71
+ user: currentUser
72
+ }
73
  }, (response) => {
74
  if (response && response.success) {
75
  setStatus('Page ingested successfully', 'success');
 
81
 
82
  // Handle Crawl
83
  btnCrawl.addEventListener('click', () => {
84
+ const keywords = crawlKeywords.value.split(',').map(k => k.trim()).filter(k => k.length > 0);
85
+
86
  setStatus('Starting crawler...', 'info');
87
+ chrome.runtime.sendMessage({
88
+ action: 'crawl_site',
89
+ data: {
90
+ url: tab.url,
91
+ depth: parseInt(depthRange.value),
92
+ mode: crawlMode.value,
93
+ keywords: keywords,
94
+ user: currentUser
95
+ }
96
  }, (response) => {
97
  if (response && response.success) {
98
+ setStatus(`Crawl started (${crawlMode.value}, Depth: ${depthRange.value})`, 'success');
99
  } else {
100
  setStatus('Crawl failed to start', 'error');
101
  }