/**
 * Curated ingestion script for the sindresorhus/awesome repository.
 *
 * Clones (or updates) the repository under `data/repos/awesome`, then feeds a
 * hand-picked list of README files to the knowledge acquisition service, one
 * file at a time, logging the per-file result.
 */
import * as dotenv from 'dotenv';
import * as fs from 'fs/promises';
import * as path from 'path';
import { execSync } from 'child_process';

// NOTE(review): when this file runs as a native ES module, all import
// declarations are evaluated before this statement, so the service modules
// below would be loaded before the .env file is applied. Confirm the module
// target, or that those modules read their configuration lazily.
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });

import { knowledgeAcquisition } from '../services/KnowledgeAcquisitionService.js';
import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
import { getPgVectorStore } from '../platform/vector/PgVectorStoreAdapter.js';

const REPO_URL = 'https://github.com/sindresorhus/awesome.git';
const CLONE_DIR = path.resolve(process.cwd(), 'data', 'repos', 'awesome');
const MAX_FILE_BYTES = 2 * 1024 * 1024; // 2MB safety cap

/** One file of the cloned repository that should be ingested. */
interface CuratedFile {
  /** Title stored in the ingestion metadata. */
  title: string;
  /** Path of the file relative to the repository root. */
  relPath: string;
  /** Category stored in the ingestion metadata. */
  category: string;
}

// Curated subset of relevant sublists for WidgeTDC (security, OSINT, AI/agents, data, DevOps)
const CURATED_FILES: CuratedFile[] = [
  { title: 'Awesome OSS Catalog (root)', relPath: 'README.md', category: 'awesome-list' },
  { title: 'Security', relPath: 'security/README.md', category: 'security' },
  { title: 'Big Data', relPath: 'big-data/README.md', category: 'data-pipelines' },
  { title: 'Data Science', relPath: 'data-science/README.md', category: 'data-science' },
  { title: 'Machine Learning', relPath: 'machine-learning/README.md', category: 'machine-learning' },
  { title: 'NLP', relPath: 'nlp/README.md', category: 'nlp' },
  { title: 'DevOps', relPath: 'devops/README.md', category: 'devops' },
  { title: 'Sysadmin', relPath: 'sysadmin/README.md', category: 'sysadmin' },
  { title: 'Databases', relPath: 'databases/README.md', category: 'databases' },
  { title: 'Analytics', relPath: 'analytics/README.md', category: 'analytics' },
  { title: 'Privacy', relPath: 'privacy/README.md', category: 'privacy' },
  { title: 'Cryptography', relPath: 'cryptography/README.md', category: 'cryptography' },
  { title: 'Incident Response', relPath: 'incident-response/README.md', category: 'incident-response' },
  { title: 'Threat Intelligence', relPath: 'threat-intelligence/README.md', category: 'threat-intel' },
  { title: 'Forensics', relPath: 'forensics/README.md', category: 'forensics' },
  { title: 'Networking', relPath: 'networking/README.md', category: 'networking' },
  { title: 'Cloud', relPath: 'cloud/README.md', category: 'cloud' },
  { title: 'GraphQL', relPath: 'graphql/README.md', category: 'api' },
  { title: 'Selfhosted', relPath: 'selfhosted/README.md', category: 'selfhosted' },
  { title: 'Research', relPath: 'research/README.md', category: 'research' },
  { title: 'Open Source Alternatives', relPath: 'opensource-alternatives/README.md', category: 'oss-alternatives' }
];

/**
 * Reports whether `target` can be accessed on disk.
 *
 * @param target - Path to probe.
 * @returns `true` when `fs.access` succeeds, `false` when it rejects for any reason.
 */
async function pathExists(target: string): Promise<boolean> {
  try {
    await fs.access(target);
    return true;
  } catch {
    return false;
  }
}

/**
 * Makes sure a checkout of the awesome repository exists at `CLONE_DIR`.
 *
 * Runs `git pull` when a `.git` directory is already present there, otherwise
 * performs a shallow (`--depth=1`) clone. Git output is forwarded to this
 * process's stdio.
 *
 * @throws Whatever `execSync` throws when the git command fails.
 */
async function ensureRepo(): Promise<void> {
  await fs.mkdir(path.dirname(CLONE_DIR), { recursive: true });

  const repoExists = await pathExists(path.join(CLONE_DIR, '.git'));
  const cmd = repoExists
    ? `git -C "${CLONE_DIR}" pull`
    : `git clone --depth=1 "${REPO_URL}" "${CLONE_DIR}"`;

  console.log(repoExists ? '🔄 Pulling latest awesome list…' : '📥 Cloning awesome list…');
  execSync(cmd, { stdio: 'inherit' });
  console.log('✅ Repository ready at', CLONE_DIR);
}

/**
 * Script entry point: prepares the repository, initializes the vector store
 * and ingests every entry of `CURATED_FILES` sequentially.
 *
 * Entries whose file is missing or larger than `MAX_FILE_BYTES` are skipped
 * with a warning. A result with `success === false` is logged and the loop
 * continues. Any thrown error aborts the run and is logged as fatal.
 *
 * The Neo4j adapter is always closed before the process exits. The exit
 * status is 0 on a completed run and 1 when a fatal error occurred or the
 * adapter could not be closed.
 */
async function ingest(): Promise<void> {
  console.log('🚀 Starting curated ingestion of sindresorhus/awesome…');

  let exitCode = 0;

  try {
    await ensureRepo();

    console.log('🔌 Connecting to databases...');
    const vectorStore = getPgVectorStore();
    await vectorStore.initialize();

    for (const entry of CURATED_FILES) {
      const targetFile = path.join(CLONE_DIR, entry.relPath);

      if (!(await pathExists(targetFile))) {
        console.warn(`⚠️ Skipping missing file: ${entry.relPath}`);
        continue;
      }

      const stat = await fs.stat(targetFile);
      if (stat.size > MAX_FILE_BYTES) {
        console.warn(`⚠️ Skipping oversized file (${stat.size} bytes): ${entry.relPath}`);
        continue;
      }

      console.log(`📄 Ingesting ${entry.title} (${entry.relPath})…`);
      // The service receives the file path as `content` together with type 'file'.
      const result = await knowledgeAcquisition.acquire({
        type: 'file',
        content: targetFile,
        metadata: { title: entry.title, source: REPO_URL, category: entry.category }
      });

      if (result.success) {
        console.log('✅ Ingestion Successful!');
        console.log('-----------------------------------');
        console.log(`📄 Source ID: ${result.sourceId}`);
        console.log(`🧩 Chunks: ${result.chunks}`);
        console.log(`🏷️ Entities: ${result.entitiesExtracted}`);
        console.log(`🔢 Vectors: ${result.vectorsStored}`);
        console.log(`🕸️ Graph Nodes: ${result.graphNodesCreated}`);
        console.log(`⏱️ Duration: ${result.duration}ms`);
        console.log('-----------------------------------');
      } else {
        console.error('❌ Ingestion failed', result.errors);
      }
    }
  } catch (error: unknown) {
    console.error('💥 Fatal error during ingestion:', error);
    // Report the failure to the caller (shell, CI) instead of exiting with 0.
    exitCode = 1;
  } finally {
    try {
      await neo4jAdapter.close();
    } catch (closeError: unknown) {
      // A failing close must not prevent the explicit exit below.
      console.error('⚠️ Failed to close Neo4j connection:', closeError);
      exitCode = 1;
    }
    process.exit(exitCode);
  }
}

// `ingest` handles its own errors and terminates the process itself.
void ingest();