File size: 5,527 Bytes
529090e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import { execFileSync, execSync } from 'child_process';
import * as dotenv from 'dotenv';
import * as fs from 'fs/promises';
import * as path from 'path';

dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });

import { knowledgeAcquisition } from '../services/KnowledgeAcquisitionService.js';
import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
import { getPgVectorStore } from '../platform/vector/PgVectorStoreAdapter.js';

/** Git URL of the upstream awesome list that is mirrored locally. */
const REPO_URL = 'https://github.com/sindresorhus/awesome.git';

/** Local checkout location: <cwd>/data/repos/awesome. */
const CLONE_DIR = path.resolve(process.cwd(), 'data', 'repos', 'awesome');

/** Files larger than this (2 MiB) are skipped as a safety cap. */
const MAX_FILE_BYTES = 2 * 1024 * 1024; // 2MB safety cap

/** One curated README to ingest: display title, repo-relative path, KB category. */
interface CuratedEntry {
    title: string;
    relPath: string;
    category: string;
}

// Curated subset of relevant sublists for WidgeTDC (security, OSINT, AI/agents, data, DevOps)
const CURATED_FILES: CuratedEntry[] = [
    { title: 'Awesome OSS Catalog (root)', relPath: 'README.md', category: 'awesome-list' },
    { title: 'Security', relPath: 'security/README.md', category: 'security' },
    { title: 'Big Data', relPath: 'big-data/README.md', category: 'data-pipelines' },
    { title: 'Data Science', relPath: 'data-science/README.md', category: 'data-science' },
    { title: 'Machine Learning', relPath: 'machine-learning/README.md', category: 'machine-learning' },
    { title: 'NLP', relPath: 'nlp/README.md', category: 'nlp' },
    { title: 'DevOps', relPath: 'devops/README.md', category: 'devops' },
    { title: 'Sysadmin', relPath: 'sysadmin/README.md', category: 'sysadmin' },
    { title: 'Databases', relPath: 'databases/README.md', category: 'databases' },
    { title: 'Analytics', relPath: 'analytics/README.md', category: 'analytics' },
    { title: 'Privacy', relPath: 'privacy/README.md', category: 'privacy' },
    { title: 'Cryptography', relPath: 'cryptography/README.md', category: 'cryptography' },
    { title: 'Incident Response', relPath: 'incident-response/README.md', category: 'incident-response' },
    { title: 'Threat Intelligence', relPath: 'threat-intelligence/README.md', category: 'threat-intel' },
    { title: 'Forensics', relPath: 'forensics/README.md', category: 'forensics' },
    { title: 'Networking', relPath: 'networking/README.md', category: 'networking' },
    { title: 'Cloud', relPath: 'cloud/README.md', category: 'cloud' },
    { title: 'GraphQL', relPath: 'graphql/README.md', category: 'api' },
    { title: 'Selfhosted', relPath: 'selfhosted/README.md', category: 'selfhosted' },
    { title: 'Research', relPath: 'research/README.md', category: 'research' },
    { title: 'Open Source Alternatives', relPath: 'opensource-alternatives/README.md', category: 'oss-alternatives' }
];

/**
 * Ensure a shallow clone of the awesome-list repo exists at CLONE_DIR,
 * pulling the latest changes if it is already checked out.
 *
 * Uses execFileSync with an argument array (instead of a shell command
 * string) so paths containing spaces or shell metacharacters cannot break
 * quoting or be interpreted by the shell.
 *
 * @throws If git is not installed or the git command exits non-zero
 *         (execFileSync throws on non-zero exit).
 */
async function ensureRepo(): Promise<void> {
    await fs.mkdir(path.dirname(CLONE_DIR), { recursive: true });

    // A .git directory is the marker that the clone already exists.
    const repoExists = await fs
        .access(path.join(CLONE_DIR, '.git'))
        .then(() => true)
        .catch(() => false);

    if (repoExists) {
        console.log('🔄 Pulling latest awesome list…');
        execFileSync('git', ['-C', CLONE_DIR, 'pull'], { stdio: 'inherit' });
    } else {
        console.log('📥 Cloning awesome list…');
        // --depth=1 keeps the clone shallow; we only need the working tree.
        execFileSync('git', ['clone', '--depth=1', REPO_URL, CLONE_DIR], { stdio: 'inherit' });
    }
    console.log('✅ Repository ready at', CLONE_DIR);
}

/**
 * Clone/refresh the awesome-list repo, then ingest each curated README
 * through the knowledge acquisition pipeline (chunks, entities, vectors,
 * graph nodes). Missing or oversized files are skipped with a warning.
 *
 * Always terminates the process: exit code 0 only if no fatal error
 * occurred and every attempted ingestion succeeded; 1 otherwise.
 * (The original version exited 0 unconditionally, which masked failures
 * from CI and shell callers.)
 */
async function ingest(): Promise<void> {
    console.log('🚀 Starting curated ingestion of sindresorhus/awesome…');

    let exitCode = 0;
    try {
        await ensureRepo();

        console.log('🔌 Connecting to databases...');
        const vectorStore = getPgVectorStore();
        await vectorStore.initialize();

        for (const entry of CURATED_FILES) {
            const targetFile = path.join(CLONE_DIR, entry.relPath);
            const fileExists = await fs
                .access(targetFile)
                .then(() => true)
                .catch(() => false);

            if (!fileExists) {
                console.warn(`⚠️ Skipping missing file: ${entry.relPath}`);
                continue;
            }

            // Guard against pathological files blowing up the pipeline.
            const stat = await fs.stat(targetFile);
            if (stat.size > MAX_FILE_BYTES) {
                console.warn(`⚠️ Skipping oversized file (${stat.size} bytes): ${entry.relPath}`);
                continue;
            }

            console.log(`📄 Ingesting ${entry.title} (${entry.relPath})…`);
            const result = await knowledgeAcquisition.acquire({
                type: 'file',
                content: targetFile,
                metadata: {
                    title: entry.title,
                    source: REPO_URL,
                    category: entry.category
                }
            });

            if (result.success) {
                console.log('✅ Ingestion Successful!');
                console.log('-----------------------------------');
                console.log(`📄 Source ID: ${result.sourceId}`);
                console.log(`🧩 Chunks: ${result.chunks}`);
                console.log(`🏷️ Entities: ${result.entitiesExtracted}`);
                console.log(`🔢 Vectors: ${result.vectorsStored}`);
                console.log(`🕸️ Graph Nodes: ${result.graphNodesCreated}`);
                console.log(`⏱️ Duration: ${result.duration}ms`);
                console.log('-----------------------------------');
            } else {
                console.error('❌ Ingestion failed', result.errors);
                // A partial run is still a failed run for the caller.
                exitCode = 1;
            }
        }
    } catch (error) {
        console.error('💥 Fatal error during ingestion:', error);
        exitCode = 1;
    } finally {
        // Closing the driver must not replace the real exit status.
        try {
            await neo4jAdapter.close();
        } catch (closeError) {
            console.error('⚠️ Failed to close Neo4j connection:', closeError);
        }
        process.exit(exitCode);
    }
}

ingest();