// Script: curated ingestion of the sindresorhus/awesome list into the
// WidgeTDC knowledge stores (vector + graph).
import * as dotenv from 'dotenv';
import * as fs from 'fs/promises';
import * as path from 'path';
import { execSync } from 'child_process';

// Load the backend .env before the service modules below read process.env.
// NOTE(review): the relative imports use .js extensions, which suggests native
// ESM (NodeNext). Under ESM, static imports are hoisted and evaluated BEFORE
// this dotenv.config() call, so the service singletons may initialize without
// the env loaded; under CommonJS emit the order shown here is honored. Confirm
// the compile target — if ESM, prefer `node -r dotenv/config` with
// DOTENV_CONFIG_PATH, or dynamic import() of the services.
// Also cwd-dependent: resolves apps/backend/.env relative to where the script
// is launched, not relative to this file.
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });

// Project services: ingestion pipeline, graph store, and vector store.
import { knowledgeAcquisition } from '../services/KnowledgeAcquisitionService.js';
import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
import { getPgVectorStore } from '../platform/vector/PgVectorStoreAdapter.js';
| const REPO_URL = 'https://github.com/sindresorhus/awesome.git'; | |
| const CLONE_DIR = path.resolve(process.cwd(), 'data', 'repos', 'awesome'); | |
| const MAX_FILE_BYTES = 2 * 1024 * 1024; // 2MB safety cap | |
| // Curated subset of relevant sublists for WidgeTDC (security, OSINT, AI/agents, data, DevOps) | |
| const CURATED_FILES: { title: string; relPath: string; category: string }[] = [ | |
| { title: 'Awesome OSS Catalog (root)', relPath: 'README.md', category: 'awesome-list' }, | |
| { title: 'Security', relPath: 'security/README.md', category: 'security' }, | |
| { title: 'Big Data', relPath: 'big-data/README.md', category: 'data-pipelines' }, | |
| { title: 'Data Science', relPath: 'data-science/README.md', category: 'data-science' }, | |
| { title: 'Machine Learning', relPath: 'machine-learning/README.md', category: 'machine-learning' }, | |
| { title: 'NLP', relPath: 'nlp/README.md', category: 'nlp' }, | |
| { title: 'DevOps', relPath: 'devops/README.md', category: 'devops' }, | |
| { title: 'Sysadmin', relPath: 'sysadmin/README.md', category: 'sysadmin' }, | |
| { title: 'Databases', relPath: 'databases/README.md', category: 'databases' }, | |
| { title: 'Analytics', relPath: 'analytics/README.md', category: 'analytics' }, | |
| { title: 'Privacy', relPath: 'privacy/README.md', category: 'privacy' }, | |
| { title: 'Cryptography', relPath: 'cryptography/README.md', category: 'cryptography' }, | |
| { title: 'Incident Response', relPath: 'incident-response/README.md', category: 'incident-response' }, | |
| { title: 'Threat Intelligence', relPath: 'threat-intelligence/README.md', category: 'threat-intel' }, | |
| { title: 'Forensics', relPath: 'forensics/README.md', category: 'forensics' }, | |
| { title: 'Networking', relPath: 'networking/README.md', category: 'networking' }, | |
| { title: 'Cloud', relPath: 'cloud/README.md', category: 'cloud' }, | |
| { title: 'GraphQL', relPath: 'graphql/README.md', category: 'api' }, | |
| { title: 'Selfhosted', relPath: 'selfhosted/README.md', category: 'selfhosted' }, | |
| { title: 'Research', relPath: 'research/README.md', category: 'research' }, | |
| { title: 'Open Source Alternatives', relPath: 'opensource-alternatives/README.md', category: 'oss-alternatives' } | |
| ]; | |
| async function ensureRepo(): Promise<void> { | |
| await fs.mkdir(path.dirname(CLONE_DIR), { recursive: true }); | |
| const repoExists = await fs | |
| .access(path.join(CLONE_DIR, '.git')) | |
| .then(() => true) | |
| .catch(() => false); | |
| const cmd = repoExists | |
| ? `git -C "${CLONE_DIR}" pull` | |
| : `git clone --depth=1 "${REPO_URL}" "${CLONE_DIR}"`; | |
| console.log(repoExists ? '🔄 Pulling latest awesome list…' : '📥 Cloning awesome list…'); | |
| execSync(cmd, { stdio: 'inherit' }); | |
| console.log('✅ Repository ready at', CLONE_DIR); | |
| } | |
| async function ingest(): Promise<void> { | |
| console.log('🚀 Starting curated ingestion of sindresorhus/awesome…'); | |
| try { | |
| await ensureRepo(); | |
| console.log('🔌 Connecting to databases...'); | |
| const vectorStore = getPgVectorStore(); | |
| await vectorStore.initialize(); | |
| for (const entry of CURATED_FILES) { | |
| const targetFile = path.join(CLONE_DIR, entry.relPath); | |
| const fileExists = await fs | |
| .access(targetFile) | |
| .then(() => true) | |
| .catch(() => false); | |
| if (!fileExists) { | |
| console.warn(`⚠️ Skipping missing file: ${entry.relPath}`); | |
| continue; | |
| } | |
| const stat = await fs.stat(targetFile); | |
| if (stat.size > MAX_FILE_BYTES) { | |
| console.warn(`⚠️ Skipping oversized file (${stat.size} bytes): ${entry.relPath}`); | |
| continue; | |
| } | |
| console.log(`📄 Ingesting ${entry.title} (${entry.relPath})…`); | |
| const result = await knowledgeAcquisition.acquire({ | |
| type: 'file', | |
| content: targetFile, | |
| metadata: { | |
| title: entry.title, | |
| source: REPO_URL, | |
| category: entry.category | |
| } | |
| }); | |
| if (result.success) { | |
| console.log('✅ Ingestion Successful!'); | |
| console.log('-----------------------------------'); | |
| console.log(`📄 Source ID: ${result.sourceId}`); | |
| console.log(`🧩 Chunks: ${result.chunks}`); | |
| console.log(`🏷️ Entities: ${result.entitiesExtracted}`); | |
| console.log(`🔢 Vectors: ${result.vectorsStored}`); | |
| console.log(`🕸️ Graph Nodes: ${result.graphNodesCreated}`); | |
| console.log(`⏱️ Duration: ${result.duration}ms`); | |
| console.log('-----------------------------------'); | |
| } else { | |
| console.error('❌ Ingestion failed', result.errors); | |
| } | |
| } | |
| } catch (error) { | |
| console.error('💥 Fatal error during ingestion:', error); | |
| } finally { | |
| await neo4jAdapter.close(); | |
| process.exit(0); | |
| } | |
| } | |
| ingest(); | |