// widgettdc-api/apps/backend/src/scripts/discover_danish_osint.ts
// Last commit: "Update backend source" (34367da, verified) by Kraft102
import axios from 'axios';
import * as fs from 'fs/promises';
import * as path from 'path';
import * as dotenv from 'dotenv';
// Pull environment variables (GITHUB_TOKEN is read further down) from the
// backend's .env file, located relative to the current working directory.
const envFilePath = path.resolve(process.cwd(), 'apps/backend/.env');
dotenv.config({ path: envFilePath });

// Base URLs of the two REST APIs this script queries.
const GITHUB_API = 'https://api.github.com';
const HF_API = 'https://huggingface.co/api';
/**
 * Normalised record for one discovered resource, shared by the GitHub and
 * Hugging Face lookups in this script.
 */
interface DiscoveredRepo {
  /** Which service the record came from. */
  source: 'github' | 'huggingface';
  /** GitHub `full_name` or Hugging Face `modelId`. */
  name: string;
  /** Browser URL of the repository or model page. */
  url: string;
  /** Description text; the Hugging Face lookup always stores `null` here. */
  description: string | null;
  /** GitHub `stargazers_count` or Hugging Face `likes`. */
  stars: number;
  /** GitHub `topics` or Hugging Face `tags`, when the API response has them. */
  topics?: string[];
}
/**
 * Runs one GitHub repository search and converts the hits to `DiscoveredRepo`
 * records.
 *
 * Requests the 20 most-starred matches for `query`. If `GITHUB_TOKEN` is set
 * in the environment it is sent as an `Authorization: token …` header.
 *
 * @param query GitHub search expression, passed as the `q` parameter.
 * @returns The mapped results, or an empty array if anything fails (the
 *          failure message is logged, not rethrown).
 */
async function searchGithub(query: string): Promise<DiscoveredRepo[]> {
  try {
    const token = process.env.GITHUB_TOKEN;
    const requestHeaders: any = token
      ? { 'Accept': 'application/vnd.github.v3+json', 'Authorization': `token ${token}` }
      : { 'Accept': 'application/vnd.github.v3+json' };

    console.log(`πŸ” Searching GitHub for: ${query}`);

    const endpoint = `${GITHUB_API}/search/repositories`;
    const searchParams = { q: query, sort: 'stars', order: 'desc', per_page: 20 };
    const response = await axios.get(endpoint, {
      params: searchParams,
      headers: requestHeaders
    });

    // Project each raw search hit onto the fields this script keeps.
    const toRepo = (hit: any) => ({
      name: hit.full_name,
      url: hit.html_url,
      description: hit.description,
      stars: hit.stargazers_count,
      source: 'github',
      topics: hit.topics
    });
    return response.data.items.map((hit: any) => toRepo(hit));
  } catch (failure: any) {
    // Best effort: report the problem and let the caller carry on.
    console.error(`❌ GitHub search failed: ${failure.message}`);
    return [];
  }
}
/**
 * Lists the repositories of the GitHub account `org` via
 * `GET /users/{org}/repos`, most recently updated first.
 *
 * The listing is fetched page by page (100 per page) until a short page is
 * returned, so accounts with more than 100 repositories are no longer
 * silently truncated. A fixed page cap bounds the number of requests.
 * If `GITHUB_TOKEN` is set in the environment it is sent as an
 * `Authorization: token …` header.
 *
 * @param org GitHub account login; URL-encoded before being placed in the path.
 * @returns The repositories collected so far. On failure the error message is
 *          logged and whatever was gathered before the failure is returned
 *          (an empty array if the first request failed).
 */
async function getGithubOrgRepos(org: string): Promise<DiscoveredRepo[]> {
  // Only the fields this script reads from each repository object.
  type GithubRepoItem = {
    full_name: string;
    html_url: string;
    description: string | null;
    stargazers_count: number;
    topics?: string[];
  };

  const PER_PAGE = 100; // page size requested on every call
  const MAX_PAGES = 10; // safety cap so a misbehaving API cannot loop forever
  const collected: DiscoveredRepo[] = [];

  try {
    const headers: Record<string, string> = {
      'Accept': 'application/vnd.github.v3+json',
    };
    if (process.env.GITHUB_TOKEN) {
      headers['Authorization'] = `token ${process.env.GITHUB_TOKEN}`;
    }

    console.log(`🏒 Fetching repos for org: ${org}`);

    const endpoint = `${GITHUB_API}/users/${encodeURIComponent(org)}/repos`;
    for (let page = 1; page <= MAX_PAGES; page++) {
      const response = await axios.get(endpoint, {
        params: { sort: 'updated', per_page: PER_PAGE, page },
        headers
      });

      if (!Array.isArray(response.data)) {
        throw new Error('unexpected response shape (expected an array of repositories)');
      }
      const items: GithubRepoItem[] = response.data;

      for (const item of items) {
        collected.push({
          name: item.full_name,
          url: item.html_url,
          description: item.description,
          stars: item.stargazers_count,
          source: 'github',
          topics: item.topics
        });
      }

      // A page that is not full is the last one.
      if (items.length < PER_PAGE) {
        break;
      }
    }
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ GitHub org fetch failed: ${message}`);
  }

  return collected;
}
/**
 * Queries the Hugging Face models endpoint for models matching `tag` and
 * converts them to `DiscoveredRepo` records.
 *
 * Asks for up to 20 models, sorted by likes in descending order.
 *
 * @param tag Value sent as the `filter` query parameter.
 * @returns The mapped models, or an empty array if anything fails (the
 *          failure message is logged, not rethrown).
 */
async function searchHuggingFace(tag: string): Promise<DiscoveredRepo[]> {
  try {
    console.log(`πŸ€— Searching Hugging Face for tag: ${tag}`);

    const endpoint = `${HF_API}/models`;
    const listParams = { filter: tag, sort: 'likes', direction: -1, limit: 20 };
    const response = await axios.get(endpoint, { params: listParams });

    // Project each raw model entry onto the fields this script keeps.
    const toRepo = (model: any) => ({
      name: model.modelId,
      url: `https://huggingface.co/${model.modelId}`,
      description: null, // HF API doesn't always return description in list
      stars: model.likes,
      source: 'huggingface',
      topics: model.tags
    });
    return response.data.map((model: any) => toRepo(model));
  } catch (failure: any) {
    // Best effort: report the problem and let the caller carry on.
    console.error(`❌ Hugging Face search failed: ${failure.message}`);
    return [];
  }
}
/**
 * Script entry point: gathers Danish-language resources from GitHub and
 * Hugging Face, de-duplicates them and writes two JSON files under
 * `<cwd>/data`:
 *
 * - `danish_osint_discovery.json` — every unique resource found.
 * - `ingest_snippet.json` — entries for the 20 most-starred GitHub
 *   repositories, shaped for `ingest_curated_repos.ts`.
 *
 * The lookup helpers log and swallow their own failures, so a failed query
 * only shrinks the result set. File-system errors propagate to the caller.
 *
 * NOTE(review): the emoji prefixes in the log strings look mis-encoded in
 * this copy of the file; they are left untouched here.
 */
async function main(): Promise<void> {
  console.log('πŸš€ Starting Danish OSINT Discovery...');
  const allRepos: DiscoveredRepo[] = [];

  // 1. Alexandra Institute Repos
  const alexandraRepos = await getGithubOrgRepos('alexandrainst');
  allRepos.push(...alexandraRepos);

  // 2. GitHub Searches
  const ghQueries = [
    'topic:danish',
    'topic:denmark',
    'language:Danish',
    '"Danish NLP"',
    'dk-bert'
  ];
  for (const q of ghQueries) {
    const results = await searchGithub(q);
    allRepos.push(...results);
    // Sleep briefly to be nice to API
    await new Promise(r => setTimeout(r, 1000));
  }

  // 3. Hugging Face Searches
  const hfTags = ['da', 'danish'];
  for (const tag of hfTags) {
    const results = await searchHuggingFace(tag);
    allRepos.push(...results);
  }

  // Deduplicate by URL. With a Map, a URL keeps the position of its first
  // occurrence but the record of its last occurrence.
  const uniqueRepos = Array.from(new Map(allRepos.map(item => [item.url, item])).values());
  console.log(`\nβœ… Found ${uniqueRepos.length} unique resources.`);

  // Output to file
  const outputPath = path.resolve(process.cwd(), 'data', 'danish_osint_discovery.json');
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  await fs.writeFile(outputPath, JSON.stringify(uniqueRepos, null, 2));
  console.log(`πŸ’Ύ Saved results to ${outputPath}`);

  // Generate a snippet for ingest_curated_repos.ts
  const ingestSnippet = uniqueRepos
    .filter(r => r.source === 'github') // Only GitHub for now for the ingest script
    // Rank by stars so the slice below really is the "top 20". `filter`
    // returned a fresh array, so sorting in place leaves `uniqueRepos` alone;
    // the sort is stable, so ties keep discovery order.
    .sort((a, b) => (b.stars ?? 0) - (a.stars ?? 0))
    .slice(0, 20)
    .map(r => ({
      // Replace every slash, not just the first, so the key is always flat.
      key: r.name.replace(/\//g, '-').toLowerCase(),
      url: r.url + '.git',
      files: [{ title: r.name, relPath: 'README.md', category: 'danish-osint' }]
    }));
  const snippetPath = path.resolve(process.cwd(), 'data', 'ingest_snippet.json');
  await fs.writeFile(snippetPath, JSON.stringify(ingestSnippet, null, 2));
  console.log(`πŸ“ Generated ingestion snippet for top 20 GitHub repos at ${snippetPath}`);
}
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});