// Snapshot of revision 34367da (file size: 5,646 bytes), captured from a hosted file viewer.
import * as dotenv from 'dotenv';
import * as fs from 'fs/promises';
import * as path from 'path';
import { execFileSync, execSync } from 'child_process';
// Load environment variables from `apps/backend/.env`, resolved against the
// current working directory (so the script is expected to be started from the
// repository root).
// NOTE(review): the import specifiers below use `.js` extensions, which suggests
// native ESM. There, static imports are hoisted and evaluated before this call
// runs — confirm the imported modules read `process.env` lazily, not at load time.
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });
import { knowledgeAcquisition } from '../services/KnowledgeAcquisitionService.js';
import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
import { getPgVectorStore } from '../platform/vector/PgVectorStoreAdapter.js';
/** Git remote of the upstream awesome list that gets cloned or pulled. */
const REPO_URL = 'https://github.com/sindresorhus/awesome.git';

/** Absolute path of the local checkout: `<cwd>/data/repos/awesome`. */
const CLONE_DIR = path.resolve(process.cwd(), path.join('data', 'repos', 'awesome'));

/** Per-file size ceiling in bytes (2 MiB); larger files are skipped as a safety cap. */
const MAX_FILE_BYTES = 2 * 1024 ** 2;
// Hand-picked sublists that are relevant for WidgeTDC (security, OSINT, AI/agents, data, DevOps).
// Each row reads: [display title, path relative to the clone root, category tag].
const CURATED_FILE_ROWS: ReadonlyArray<readonly [string, string, string]> = [
  ['Awesome OSS Catalog (root)', 'README.md', 'awesome-list'],
  ['Security', 'security/README.md', 'security'],
  ['Big Data', 'big-data/README.md', 'data-pipelines'],
  ['Data Science', 'data-science/README.md', 'data-science'],
  ['Machine Learning', 'machine-learning/README.md', 'machine-learning'],
  ['NLP', 'nlp/README.md', 'nlp'],
  ['DevOps', 'devops/README.md', 'devops'],
  ['Sysadmin', 'sysadmin/README.md', 'sysadmin'],
  ['Databases', 'databases/README.md', 'databases'],
  ['Analytics', 'analytics/README.md', 'analytics'],
  ['Privacy', 'privacy/README.md', 'privacy'],
  ['Cryptography', 'cryptography/README.md', 'cryptography'],
  ['Incident Response', 'incident-response/README.md', 'incident-response'],
  ['Threat Intelligence', 'threat-intelligence/README.md', 'threat-intel'],
  ['Forensics', 'forensics/README.md', 'forensics'],
  ['Networking', 'networking/README.md', 'networking'],
  ['Cloud', 'cloud/README.md', 'cloud'],
  ['GraphQL', 'graphql/README.md', 'api'],
  ['Selfhosted', 'selfhosted/README.md', 'selfhosted'],
  ['Research', 'research/README.md', 'research'],
  ['Open Source Alternatives', 'opensource-alternatives/README.md', 'oss-alternatives'],
];

// Expand the compact rows into the object shape the ingestion loop consumes.
const CURATED_FILES: { title: string; relPath: string; category: string }[] = CURATED_FILE_ROWS.map(
  ([title, relPath, category]) => ({ title, relPath, category }),
);
/**
 * Makes sure a local checkout of the awesome list is present and current.
 *
 * Creates the parent directory of `CLONE_DIR` when needed. If `CLONE_DIR`
 * already contains a `.git` entry the checkout is updated with `git pull`;
 * otherwise `REPO_URL` is shallow-cloned (`--depth=1`) into `CLONE_DIR`.
 * Git's output is streamed to this process's stdio.
 *
 * @throws If the parent directory cannot be created, or if git cannot be
 *   spawned or exits with a non-zero status.
 */
async function ensureRepo(): Promise<void> {
  await fs.mkdir(path.dirname(CLONE_DIR), { recursive: true });

  // Probe for an existing checkout; any access error is treated as "not cloned yet".
  let repoExists = true;
  try {
    await fs.access(path.join(CLONE_DIR, '.git'));
  } catch {
    repoExists = false;
  }

  // Arguments are handed to git as an array, with no shell in between, so a
  // working directory containing spaces, quotes, `$` or backticks cannot break
  // or alter the command.
  const gitArgs = repoExists
    ? ['-C', CLONE_DIR, 'pull']
    : ['clone', '--depth=1', REPO_URL, CLONE_DIR];

  console.log(repoExists ? '🔄 Pulling latest awesome list…' : '📥 Cloning awesome list…');
  execFileSync('git', gitArgs, { stdio: 'inherit' });
  console.log('✅ Repository ready at', CLONE_DIR);
}
/**
 * Runs the curated ingestion of the awesome list end to end.
 *
 * Steps:
 *  1. Ensure the local checkout exists (`ensureRepo`).
 *  2. Initialize the pgvector store.
 *  3. For every entry in `CURATED_FILES`, skip files that are missing or larger
 *     than `MAX_FILE_BYTES`, otherwise pass the file path to
 *     `knowledgeAcquisition.acquire` and log the outcome.
 *  4. Close the Neo4j adapter and terminate the process.
 *
 * This function always ends by calling `process.exit`, so it does not return
 * to its caller. The exit code is 0 only when no fatal error occurred and every
 * attempted ingestion reported success; otherwise it is 1. Skipped files
 * (missing or oversized) are warnings and do not affect the exit code.
 */
async function ingest(): Promise<void> {
  console.log('🚀 Starting curated ingestion of sindresorhus/awesome…');

  // Becomes 1 as soon as anything fails, so callers (CI, shell scripts) can
  // tell a failed run apart from a successful one.
  let exitCode = 0;

  try {
    await ensureRepo();

    console.log('🔌 Connecting to databases...');
    const vectorStore = getPgVectorStore();
    await vectorStore.initialize();

    for (const entry of CURATED_FILES) {
      const targetFile = path.join(CLONE_DIR, entry.relPath);

      // A single stat answers both "does it exist?" and "how big is it?",
      // avoiding a separate existence check that could race with the read.
      const stat = await fs.stat(targetFile).catch(() => undefined);
      if (stat === undefined) {
        console.warn(`⚠️ Skipping missing file: ${entry.relPath}`);
        continue;
      }
      if (stat.size > MAX_FILE_BYTES) {
        console.warn(`⚠️ Skipping oversized file (${stat.size} bytes): ${entry.relPath}`);
        continue;
      }

      console.log(`📄 Ingesting ${entry.title} (${entry.relPath})…`);
      const result = await knowledgeAcquisition.acquire({
        type: 'file',
        content: targetFile, // the path is passed, not the file contents
        metadata: {
          title: entry.title,
          source: REPO_URL,
          category: entry.category
        }
      });

      if (result.success) {
        console.log('✅ Ingestion Successful!');
        console.log('-----------------------------------');
        console.log(`📄 Source ID: ${result.sourceId}`);
        console.log(`🧩 Chunks: ${result.chunks}`);
        console.log(`🏷️ Entities: ${result.entitiesExtracted}`);
        console.log(`🔢 Vectors: ${result.vectorsStored}`);
        console.log(`🕸️ Graph Nodes: ${result.graphNodesCreated}`);
        console.log(`⏱️ Duration: ${result.duration}ms`);
        console.log('-----------------------------------');
      } else {
        // Keep going with the remaining entries, but remember the failure.
        console.error('❌ Ingestion failed', result.errors);
        exitCode = 1;
      }
    }
  } catch (error) {
    console.error('💥 Fatal error during ingestion:', error);
    exitCode = 1;
  } finally {
    // A failing close must not prevent the explicit exit below; it is logged
    // and otherwise does not override the ingestion outcome.
    try {
      await neo4jAdapter.close();
    } catch (closeError) {
      console.error('⚠️ Failed to close Neo4j connection:', closeError);
    }
    process.exit(exitCode);
  }
}
// Script entry point. `ingest` normally terminates the process itself; this
// handler only fires if its promise rejects anyway (for example, when its
// cleanup step throws), so the failure is reported and the exit code is non-zero
// instead of surfacing as an unhandled rejection.
ingest().catch((error: unknown) => {
  console.error('💥 Unhandled error in ingestion script:', error);
  process.exit(1);
});
|