import axios from 'axios';
import * as fs from 'fs/promises';
import * as path from 'path';
import * as dotenv from 'dotenv';

// Load env vars
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });

const GITHUB_API = 'https://api.github.com';
const HF_API = 'https://huggingface.co/api';

/** Pause between consecutive GitHub search requests, in milliseconds. */
const GITHUB_SEARCH_DELAY_MS = 1000;

/** Maximum number of GitHub repositories written to the ingestion snippet. */
const INGEST_SNIPPET_LIMIT = 20;

/** A repository or model found by one of the discovery sources. */
interface DiscoveredRepo {
  name: string;
  url: string;
  description: string | null;
  stars: number;
  source: 'github' | 'huggingface';
  topics?: string[];
}

/** The fields this script reads from a GitHub repository object. */
interface GithubRepoItem {
  full_name: string;
  html_url: string;
  description: string | null;
  stargazers_count: number;
  topics?: string[];
}

/** The fields this script reads from a Hugging Face model list entry. */
interface HfModelItem {
  modelId?: string;
  id?: string;
  likes?: number;
  tags?: string[];
}

/** Returns a printable message for any value caught in a `catch` clause. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Builds the request headers for GitHub API calls.
 * The Authorization header is added only when GITHUB_TOKEN is set.
 */
function githubHeaders(): Record<string, string> {
  const headers: Record<string, string> = {
    'Accept': 'application/vnd.github.v3+json',
  };

  if (process.env.GITHUB_TOKEN) {
    headers['Authorization'] = `token ${process.env.GITHUB_TOKEN}`;
  }

  return headers;
}

/** Converts a GitHub repository object into the common `DiscoveredRepo` shape. */
function fromGithubItem(item: GithubRepoItem): DiscoveredRepo {
  return {
    name: item.full_name,
    url: item.html_url,
    description: item.description,
    stars: item.stargazers_count,
    source: 'github',
    topics: item.topics,
  };
}

/**
 * Searches GitHub repositories, requesting the 20 most-starred matches.
 *
 * @param query GitHub search query string, passed through as the `q` parameter.
 * @returns The matching repositories, or an empty array if the request fails
 *          (the failure is logged, not rethrown).
 */
async function searchGithub(query: string): Promise<DiscoveredRepo[]> {
  try {
    console.log(`🔍 Searching GitHub for: ${query}`);

    const response = await axios.get(`${GITHUB_API}/search/repositories`, {
      params: {
        q: query,
        sort: 'stars',
        order: 'desc',
        per_page: 20,
      },
      headers: githubHeaders(),
    });

    // Treat a response without an `items` array as "no results".
    const items: GithubRepoItem[] = response.data?.items ?? [];
    return items.map(fromGithubItem);
  } catch (error: unknown) {
    console.error(`❌ GitHub search failed: ${errorMessage(error)}`);
    return [];
  }
}

/**
 * Fetches up to 100 repositories of a GitHub account, most recently updated first.
 *
 * @param org Account name used in the `/users/{org}/repos` path.
 * @returns The account's repositories, or an empty array if the request fails
 *          (the failure is logged, not rethrown).
 */
async function getGithubOrgRepos(org: string): Promise<DiscoveredRepo[]> {
  try {
    console.log(`🏢 Fetching repos for org: ${org}`);

    const response = await axios.get(`${GITHUB_API}/users/${org}/repos`, {
      params: {
        sort: 'updated',
        per_page: 100,
      },
      headers: githubHeaders(),
    });

    const items: GithubRepoItem[] = Array.isArray(response.data) ? response.data : [];
    return items.map(fromGithubItem);
  } catch (error: unknown) {
    console.error(`❌ GitHub org fetch failed: ${errorMessage(error)}`);
    return [];
  }
}

/**
 * Lists the 20 most-liked Hugging Face models matching a filter tag.
 *
 * @param tag Value passed as the `filter` query parameter.
 * @returns The matching models, or an empty array if the request fails
 *          (the failure is logged, not rethrown).
 */
async function searchHuggingFace(tag: string): Promise<DiscoveredRepo[]> {
  try {
    console.log(`🤗 Searching Hugging Face for tag: ${tag}`);

    const response = await axios.get(`${HF_API}/models`, {
      params: {
        filter: tag,
        sort: 'likes',
        direction: -1,
        limit: 20,
      },
    });

    const items: HfModelItem[] = Array.isArray(response.data) ? response.data : [];
    const models: DiscoveredRepo[] = [];

    for (const item of items) {
      const modelId = item.modelId ?? item.id;
      if (!modelId) {
        // Without an identifier there is no usable name or URL; skip the entry.
        continue;
      }

      models.push({
        name: modelId,
        url: `https://huggingface.co/${modelId}`,
        description: null, // HF API doesn't always return description in list
        stars: item.likes ?? 0,
        source: 'huggingface',
        topics: item.tags,
      });
    }

    return models;
  } catch (error: unknown) {
    console.error(`❌ Hugging Face search failed: ${errorMessage(error)}`);
    return [];
  }
}

/**
 * Runs the discovery: collects repositories from the Alexandra Institute GitHub
 * account, a fixed set of GitHub searches and Hugging Face tags, deduplicates
 * them by URL, and writes two JSON files under `<cwd>/data`:
 * `danish_osint_discovery.json` (all unique results) and `ingest_snippet.json`
 * (ingestion entries for the most-starred GitHub repositories).
 */
async function main(): Promise<void> {
  console.log('🚀 Starting Danish OSINT Discovery...');

  const allRepos: DiscoveredRepo[] = [];

  // 1. Alexandra Institute Repos
  const alexandraRepos = await getGithubOrgRepos('alexandrainst');
  allRepos.push(...alexandraRepos);

  // 2. GitHub Searches
  const ghQueries = [
    'topic:danish',
    'topic:denmark',
    'language:Danish',
    '"Danish NLP"',
    'dk-bert',
  ];

  for (const q of ghQueries) {
    const results = await searchGithub(q);
    allRepos.push(...results);
    // Sleep briefly to be nice to API
    await new Promise(r => setTimeout(r, GITHUB_SEARCH_DELAY_MS));
  }

  // 3. Hugging Face Searches
  const hfTags = ['da', 'danish'];
  for (const tag of hfTags) {
    const results = await searchHuggingFace(tag);
    allRepos.push(...results);
  }

  // Deduplicate by URL. For a repeated URL the last-seen entry's data is kept,
  // at the position where the URL first appeared.
  const uniqueRepos = Array.from(new Map(allRepos.map(item => [item.url, item])).values());

  console.log(`\n✅ Found ${uniqueRepos.length} unique resources.`);

  // Output to file
  const outputPath = path.resolve(process.cwd(), 'data', 'danish_osint_discovery.json');
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  await fs.writeFile(outputPath, JSON.stringify(uniqueRepos, null, 2));
  console.log(`💾 Saved results to ${outputPath}`);

  // Generate a snippet for ingest_curated_repos.ts
  const ingestSnippet = uniqueRepos
    .filter(r => r.source === 'github') // Only GitHub for now for the ingest script
    // Rank by stars so the slice really is the top repositories; without this the
    // first entries are simply the org repos in "recently updated" order.
    .sort((a, b) => b.stars - a.stars)
    .slice(0, INGEST_SNIPPET_LIMIT)
    .map(r => ({
      key: r.name.replace(/\//g, '-').toLowerCase(),
      url: r.url + '.git',
      files: [{ title: r.name, relPath: 'README.md', category: 'danish-osint' }],
    }));

  const snippetPath = path.resolve(process.cwd(), 'data', 'ingest_snippet.json');
  await fs.writeFile(snippetPath, JSON.stringify(ingestSnippet, null, 2));
  console.log(`📝 Generated ingestion snippet for top 20 GitHub repos at ${snippetPath}`);
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to the calling shell / CI instead of exiting with status 0.
  process.exitCode = 1;
});