// Script: curated ingestion of the sindresorhus/awesome list into the
// WidgeTDC knowledge stores (vector + graph).
import * as dotenv from 'dotenv';
import * as fs from 'fs/promises';
import * as path from 'path';
import { execSync } from 'child_process';

// Load the backend .env before the service modules below read process.env.
// NOTE(review): the relative imports use .js extensions, which suggests native
// ESM (NodeNext). Under ESM, static imports are hoisted and evaluated BEFORE
// this dotenv.config() call, so the service singletons may initialize without
// the env loaded; under CommonJS emit the order shown here is honored. Confirm
// the compile target — if ESM, prefer `node -r dotenv/config` with
// DOTENV_CONFIG_PATH, or dynamic import() of the services.
// Also cwd-dependent: resolves apps/backend/.env relative to where the script
// is launched, not relative to this file.
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });

// Project services: ingestion pipeline, graph store, and vector store.
import { knowledgeAcquisition } from '../services/KnowledgeAcquisitionService.js';
import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
import { getPgVectorStore } from '../platform/vector/PgVectorStoreAdapter.js';
| const REPO_URL = 'https://github.com/sindresorhus/awesome.git'; | |
| const CLONE_DIR = path.resolve(process.cwd(), 'data', 'repos', 'awesome'); | |
| const MAX_FILE_BYTES = 2 * 1024 * 1024; // 2MB safety cap | |
| // Curated subset of relevant sublists for WidgeTDC (security, OSINT, AI/agents, data, DevOps) | |
| const CURATED_FILES: { title: string; relPath: string; category: string }[] = [ | |
| { title: 'Awesome OSS Catalog (root)', relPath: 'README.md', category: 'awesome-list' }, | |
| { title: 'Security', relPath: 'security/README.md', category: 'security' }, | |
| { title: 'Big Data', relPath: 'big-data/README.md', category: 'data-pipelines' }, | |
| { title: 'Data Science', relPath: 'data-science/README.md', category: 'data-science' }, | |
| { title: 'Machine Learning', relPath: 'machine-learning/README.md', category: 'machine-learning' }, | |
| { title: 'NLP', relPath: 'nlp/README.md', category: 'nlp' }, | |
| { title: 'DevOps', relPath: 'devops/README.md', category: 'devops' }, | |
| { title: 'Sysadmin', relPath: 'sysadmin/README.md', category: 'sysadmin' }, | |
| { title: 'Databases', relPath: 'databases/README.md', category: 'databases' }, | |
| { title: 'Analytics', relPath: 'analytics/README.md', category: 'analytics' }, | |
| { title: 'Privacy', relPath: 'privacy/README.md', category: 'privacy' }, | |
| { title: 'Cryptography', relPath: 'cryptography/README.md', category: 'cryptography' }, | |
| { title: 'Incident Response', relPath: 'incident-response/README.md', category: 'incident-response' }, | |
| { title: 'Threat Intelligence', relPath: 'threat-intelligence/README.md', category: 'threat-intel' }, | |
| { title: 'Forensics', relPath: 'forensics/README.md', category: 'forensics' }, | |
| { title: 'Networking', relPath: 'networking/README.md', category: 'networking' }, | |
| { title: 'Cloud', relPath: 'cloud/README.md', category: 'cloud' }, | |
| { title: 'GraphQL', relPath: 'graphql/README.md', category: 'api' }, | |
| { title: 'Selfhosted', relPath: 'selfhosted/README.md', category: 'selfhosted' }, | |
| { title: 'Research', relPath: 'research/README.md', category: 'research' }, | |
| { title: 'Open Source Alternatives', relPath: 'opensource-alternatives/README.md', category: 'oss-alternatives' } | |
| ]; | |
| async function ensureRepo(): Promise<void> { | |
| await fs.mkdir(path.dirname(CLONE_DIR), { recursive: true }); | |
| const repoExists = await fs | |
| .access(path.join(CLONE_DIR, '.git')) | |
| .then(() => true) | |
| .catch(() => false); | |
| const cmd = repoExists | |
| ? `git -C "${CLONE_DIR}" pull` | |
| : `git clone --depth=1 "${REPO_URL}" "${CLONE_DIR}"`; | |
| console.log(repoExists ? '🔄 Pulling latest awesome list…' : '📥 Cloning awesome list…'); | |
| execSync(cmd, { stdio: 'inherit' }); | |
| console.log('✅ Repository ready at', CLONE_DIR); | |
| } | |
| async function ingest(): Promise<void> { | |
| console.log('🚀 Starting curated ingestion of sindresorhus/awesome…'); | |
| try { | |
| await ensureRepo(); | |
| console.log('🔌 Connecting to databases...'); | |
| const vectorStore = getPgVectorStore(); | |
| await vectorStore.initialize(); | |
| for (const entry of CURATED_FILES) { | |
| const targetFile = path.join(CLONE_DIR, entry.relPath); | |
| const fileExists = await fs | |
| .access(targetFile) | |
| .then(() => true) | |
| .catch(() => false); | |
| if (!fileExists) { | |
| console.warn(`⚠️ Skipping missing file: ${entry.relPath}`); | |
| continue; | |
| } | |
| const stat = await fs.stat(targetFile); | |
| if (stat.size > MAX_FILE_BYTES) { | |
| console.warn(`⚠️ Skipping oversized file (${stat.size} bytes): ${entry.relPath}`); | |
| continue; | |
| } | |
| console.log(`📄 Ingesting ${entry.title} (${entry.relPath})…`); | |
| const result = await knowledgeAcquisition.acquire({ | |
| type: 'file', | |
| content: targetFile, | |
| metadata: { | |
| title: entry.title, | |
| source: REPO_URL, | |
| category: entry.category | |
| } | |
| }); | |
| if (result.success) { | |
| console.log('✅ Ingestion Successful!'); | |
| console.log('-----------------------------------'); | |
| console.log(`📄 Source ID: ${result.sourceId}`); | |
| console.log(`🧩 Chunks: ${result.chunks}`); | |
| console.log(`🏷️ Entities: ${result.entitiesExtracted}`); | |
| console.log(`🔢 Vectors: ${result.vectorsStored}`); | |
| console.log(`🕸️ Graph Nodes: ${result.graphNodesCreated}`); | |
| console.log(`⏱️ Duration: ${result.duration}ms`); | |
| console.log('-----------------------------------'); | |
| } else { | |
| console.error('❌ Ingestion failed', result.errors); | |
| } | |
| } | |
| } catch (error) { | |
| console.error('💥 Fatal error during ingestion:', error); | |
| } finally { | |
| await neo4jAdapter.close(); | |
| process.exit(0); | |
| } | |
| } | |
| ingest(); | |