// Spaces: Paused (page-status text captured together with the listing; not part of the script)
import * as fs from 'fs/promises';
import * as path from 'path';

import axios from 'axios';
import * as dotenv from 'dotenv';
// Load environment variables from apps/backend/.env, resolved against the
// directory the script is launched from (process.cwd()), not this file's location.
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });

/** Base URL for GitHub REST API requests. */
const GITHUB_API = 'https://api.github.com';

/** Base URL for Hugging Face API requests. */
const HF_API = 'https://huggingface.co/api';
/** A repository or model found by one of the discovery sources. */
interface DiscoveredRepo {
  /** Source-specific identifier (GitHub `full_name` or Hugging Face model id). */
  name: string;
  /** Web URL of the resource. */
  url: string;
  /** Description text, or `null` when the source provides none. */
  description: string | null;
  /** Popularity count: GitHub stargazers or Hugging Face likes. */
  stars: number;
  /** Which service the entry was discovered on. */
  source: 'github' | 'huggingface';
  /** Topics (GitHub) or tags (Hugging Face), when the source returns them. */
  topics?: string[];
}
/**
 * Searches GitHub repositories matching `query`, ordered by star count (descending).
 *
 * Requests a single page of up to 20 results. When `GITHUB_TOKEN` is set in
 * the environment it is sent in the `Authorization` header.
 *
 * @param query - GitHub search expression, e.g. `topic:danish`.
 * @returns The matching repositories, or an empty array if the request or the
 *   response mapping fails (the failure is logged, not thrown).
 */
async function searchGithub(query: string): Promise<DiscoveredRepo[]> {
  // Only the response fields this function reads.
  type GithubRepoItem = {
    full_name: string;
    html_url: string;
    description: string | null;
    stargazers_count: number;
    topics?: string[];
  };

  try {
    const headers: Record<string, string> = {
      'Accept': 'application/vnd.github.v3+json',
    };
    if (process.env.GITHUB_TOKEN) {
      headers['Authorization'] = `token ${process.env.GITHUB_TOKEN}`;
    }

    // NOTE(review): the leading glyph in the log strings looks like a
    // mis-encoded emoji; kept byte-for-byte.
    console.log(`π Searching GitHub for: ${query}`);
    const response = await axios.get(`${GITHUB_API}/search/repositories`, {
      params: { q: query, sort: 'stars', order: 'desc', per_page: 20 },
      headers,
    });

    const items: GithubRepoItem[] = response.data.items;
    return items.map((item): DiscoveredRepo => ({
      name: item.full_name,
      url: item.html_url,
      description: item.description,
      stars: item.stargazers_count,
      source: 'github',
      topics: item.topics,
    }));
  } catch (error: unknown) {
    // Anything can be thrown, so only read `.message` from real Error objects.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`β GitHub search failed: ${reason}`);
    return [];
  }
}
/**
 * Fetches repositories owned by a GitHub account, most recently updated first.
 *
 * Only one page is requested (`per_page: 100`), so accounts with more
 * repositories than that are returned partially. When `GITHUB_TOKEN` is set in
 * the environment it is sent in the `Authorization` header.
 *
 * @param org - GitHub account login, e.g. `alexandrainst`.
 * @returns The account's repositories, or an empty array if the request or the
 *   response mapping fails (the failure is logged, not thrown).
 */
async function getGithubOrgRepos(org: string): Promise<DiscoveredRepo[]> {
  // Only the response fields this function reads.
  type GithubRepoItem = {
    full_name: string;
    html_url: string;
    description: string | null;
    stargazers_count: number;
    topics?: string[];
  };

  try {
    const headers: Record<string, string> = {
      'Accept': 'application/vnd.github.v3+json',
    };
    if (process.env.GITHUB_TOKEN) {
      headers['Authorization'] = `token ${process.env.GITHUB_TOKEN}`;
    }

    // NOTE(review): the leading glyph in the log strings looks like a
    // mis-encoded emoji; kept byte-for-byte.
    console.log(`π’ Fetching repos for org: ${org}`);
    // Encode the login so characters such as '/' or '?' cannot alter the request path.
    const response = await axios.get(`${GITHUB_API}/users/${encodeURIComponent(org)}/repos`, {
      params: { sort: 'updated', per_page: 100 },
      headers,
    });

    const items: GithubRepoItem[] = response.data;
    return items.map((item): DiscoveredRepo => ({
      name: item.full_name,
      url: item.html_url,
      description: item.description,
      stars: item.stargazers_count,
      source: 'github',
      topics: item.topics,
    }));
  } catch (error: unknown) {
    // Anything can be thrown, so only read `.message` from real Error objects.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`β GitHub org fetch failed: ${reason}`);
    return [];
  }
}
/**
 * Lists Hugging Face models matching a tag filter, most liked first.
 *
 * Requests up to 20 models. `description` is always `null` because this
 * function does not read a description from the list response.
 *
 * @param tag - Value passed as the `filter` query parameter, e.g. `da`.
 * @returns The matching models, or an empty array if the request or the
 *   response mapping fails (the failure is logged, not thrown).
 */
async function searchHuggingFace(tag: string): Promise<DiscoveredRepo[]> {
  // Only the response fields this function reads.
  type HfModelItem = {
    id: string;
    modelId?: string;
    likes: number;
    tags?: string[];
  };

  try {
    // NOTE(review): the leading glyph in the log strings looks like a
    // mis-encoded emoji; kept byte-for-byte.
    console.log(`π€ Searching Hugging Face for tag: ${tag}`);
    const response = await axios.get(`${HF_API}/models`, {
      params: { filter: tag, sort: 'likes', direction: -1, limit: 20 },
    });

    const models: HfModelItem[] = response.data;
    return models.map((item): DiscoveredRepo => {
      // Fall back to `id` so a missing `modelId` does not produce ".../undefined".
      const modelId = item.modelId ?? item.id;
      return {
        name: modelId,
        url: `https://huggingface.co/${modelId}`,
        description: null, // HF API doesn't always return description in list
        stars: item.likes,
        source: 'huggingface',
        topics: item.tags,
      };
    });
  } catch (error: unknown) {
    // Anything can be thrown, so only read `.message` from real Error objects.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`β Hugging Face search failed: ${reason}`);
    return [];
  }
}
/**
 * Builds ingest entries for the most-starred GitHub repos in `repos`.
 * Non-GitHub entries are skipped; at most `limit` entries are returned.
 */
function buildIngestSnippet(repos: readonly DiscoveredRepo[], limit = 20) {
  return repos
    .filter(repo => repo.source === 'github') // Only GitHub for now for the ingest script
    // filter() returned a fresh array, so sorting in place does not touch `repos`.
    // Without this sort the "top" entries were just the first ones discovered.
    .sort((a, b) => b.stars - a.stars)
    .slice(0, limit)
    .map(repo => ({
      // Replace every '/', not just the first, so the key never contains a slash.
      key: repo.name.replace(/\//g, '-').toLowerCase(),
      url: repo.url + '.git',
      files: [{ title: repo.name, relPath: 'README.md', category: 'danish-osint' }],
    }));
}

/**
 * Runs the full discovery pass and writes two JSON files under `<cwd>/data`:
 *
 * - `danish_osint_discovery.json`: every unique resource found (GitHub and Hugging Face).
 * - `ingest_snippet.json`: ingest entries for the 20 most-starred GitHub repos.
 *
 * The individual fetchers log and return `[]` on failure, so a failed source
 * only reduces the result set; file-system errors propagate to the caller.
 */
async function main() {
  // NOTE(review): the leading glyphs in the log strings look like mis-encoded
  // emoji; kept byte-for-byte.
  console.log('π Starting Danish OSINT Discovery...');
  const allRepos: DiscoveredRepo[] = [];

  // 1. Alexandra Institute Repos
  const alexandraRepos = await getGithubOrgRepos('alexandrainst');
  allRepos.push(...alexandraRepos);

  // 2. GitHub Searches
  const ghQueries = [
    'topic:danish',
    'topic:denmark',
    'language:Danish',
    '"Danish NLP"',
    'dk-bert',
  ];
  for (const q of ghQueries) {
    const results = await searchGithub(q);
    allRepos.push(...results);
    // Sleep briefly to be nice to API
    await new Promise(r => setTimeout(r, 1000));
  }

  // 3. Hugging Face Searches
  const hfTags = ['da', 'danish'];
  for (const tag of hfTags) {
    const results = await searchHuggingFace(tag);
    allRepos.push(...results);
  }

  // Deduplicate by URL. A Map keeps the position of the first occurrence of a
  // key and the value of the last one.
  const uniqueRepos = Array.from(new Map(allRepos.map(item => [item.url, item])).values());
  console.log(`\nβ Found ${uniqueRepos.length} unique resources.`);

  // Output to file
  const outputPath = path.resolve(process.cwd(), 'data', 'danish_osint_discovery.json');
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  await fs.writeFile(outputPath, JSON.stringify(uniqueRepos, null, 2));
  console.log(`πΎ Saved results to ${outputPath}`);

  // Generate a snippet for ingest_curated_repos.ts
  const ingestSnippet = buildIngestSnippet(uniqueRepos, 20);
  const snippetPath = path.resolve(process.cwd(), 'data', 'ingest_snippet.json');
  await fs.writeFile(snippetPath, JSON.stringify(ingestSnippet, null, 2));
  console.log(`π Generated ingestion snippet for top 20 GitHub repos at ${snippetPath}`);
}
// Script entry point: log any unhandled failure and report it through the
// exit code, so shells and CI do not see a failed run as a success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});