Spaces:
Paused
Paused
File size: 4,972 Bytes
34367da | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import axios from 'axios';
import * as fs from 'fs/promises';
import * as path from 'path';
import * as dotenv from 'dotenv';
// Load environment variables from `apps/backend/.env`, resolved against the
// current working directory (so the script has to be started from a directory
// where that relative path exists). GITHUB_TOKEN, if defined there, is read
// later via process.env.
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });
// Base URL that every GitHub request in this file is built from.
const GITHUB_API = 'https://api.github.com';
// Base URL that every Hugging Face request in this file is built from.
const HF_API = 'https://huggingface.co/api';
/**
 * One discovered resource, normalised to a common shape regardless of which
 * service it was found on.
 */
interface DiscoveredRepo {
  /** Service the entry was discovered on. */
  source: 'github' | 'huggingface';
  /** Identifier of the resource on that service (e.g. `owner/repo`). */
  name: string;
  /** Browser URL of the resource. */
  url: string;
  /** Free-text description, or `null` when none is available. */
  description: string | null;
  /** Popularity counter: GitHub stargazers or Hugging Face likes. */
  stars: number;
  /** Topics/tags attached to the resource, when the service reports any. */
  topics?: Array<string>;
}
/**
 * Searches GitHub repositories for `query`, asking for the 20 most-starred
 * matches (`sort=stars`, `order=desc`, `per_page=20`).
 *
 * When the `GITHUB_TOKEN` environment variable is set it is sent as an
 * `Authorization: token …` header; otherwise the request is unauthenticated.
 *
 * @param query - Search expression passed as the `q` parameter.
 * @returns The hits mapped to `DiscoveredRepo`. Any failure (network error,
 *   non-2xx status, unexpected payload) is logged and yields an empty array;
 *   this function never rejects.
 */
async function searchGithub(query: string): Promise<DiscoveredRepo[]> {
  // Only the fields this function actually reads from each search hit.
  interface GithubRepoItem {
    full_name: string;
    html_url: string;
    description: string | null;
    stargazers_count: number;
    topics?: string[];
  }

  try {
    const headers: Record<string, string> = {
      'Accept': 'application/vnd.github.v3+json',
    };
    const token = process.env.GITHUB_TOKEN;
    if (token) {
      headers['Authorization'] = `token ${token}`;
    }

    console.log(`π Searching GitHub for: ${query}`);
    const response = await axios.get<{ items?: GithubRepoItem[] }>(
      `${GITHUB_API}/search/repositories`,
      {
        params: { q: query, sort: 'stars', order: 'desc', per_page: 20 },
        headers,
      },
    );

    // Fail with a readable message instead of an opaque TypeError when the
    // payload does not carry the expected `items` array.
    const { items } = response.data;
    if (!Array.isArray(items)) {
      throw new Error('unexpected response shape (no "items" array)');
    }

    return items.map((item): DiscoveredRepo => ({
      name: item.full_name,
      url: item.html_url,
      description: item.description,
      stars: item.stargazers_count,
      source: 'github',
      topics: item.topics,
    }));
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances have a usable `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`β GitHub search failed: ${message}`);
    return [];
  }
}
/**
 * Lists the repositories GitHub returns for `/users/{org}/repos`, most
 * recently updated first.
 *
 * Results are fetched 100 per page and paging continues until a short page is
 * returned or the page cap is reached, so accounts with more than 100
 * repositories are no longer silently truncated.
 *
 * When the `GITHUB_TOKEN` environment variable is set it is sent as an
 * `Authorization: token …` header; otherwise the requests are unauthenticated.
 *
 * @param org - Account login whose repositories are listed.
 * @returns The repositories mapped to `DiscoveredRepo`. A failure is logged
 *   and never thrown: the repositories collected before the failure are
 *   returned (an empty array if the very first request fails).
 */
async function getGithubOrgRepos(org: string): Promise<DiscoveredRepo[]> {
  // Only the fields this function actually reads from each repository.
  interface GithubRepoItem {
    full_name: string;
    html_url: string;
    description: string | null;
    stargazers_count: number;
    topics?: string[];
  }

  const pageSize = 100;
  // Hard stop so an unexpected API behaviour can never loop forever.
  const maxPages = 10;
  const repos: DiscoveredRepo[] = [];

  try {
    const headers: Record<string, string> = {
      'Accept': 'application/vnd.github.v3+json',
    };
    const token = process.env.GITHUB_TOKEN;
    if (token) {
      headers['Authorization'] = `token ${token}`;
    }

    console.log(`π’ Fetching repos for org: ${org}`);
    // Encode the login so it cannot alter the request path.
    const url = `${GITHUB_API}/users/${encodeURIComponent(org)}/repos`;

    for (let page = 1; page <= maxPages; page += 1) {
      const response = await axios.get<GithubRepoItem[]>(url, {
        params: { sort: 'updated', per_page: pageSize, page },
        headers,
      });

      const items = response.data;
      if (!Array.isArray(items)) {
        throw new Error('unexpected response shape (expected an array)');
      }

      for (const item of items) {
        repos.push({
          name: item.full_name,
          url: item.html_url,
          description: item.description,
          stars: item.stargazers_count,
          source: 'github',
          topics: item.topics,
        });
      }

      // A page that is not full is the last one.
      if (items.length < pageSize) {
        break;
      }
    }
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances have a usable `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`β GitHub org fetch failed: ${message}`);
  }

  return repos;
}
/**
 * Queries the Hugging Face `/models` endpoint for models matching `tag`,
 * asking for the 20 most-liked ones (`sort=likes`, `direction=-1`, `limit=20`).
 *
 * @param tag - Value passed as the `filter` query parameter.
 * @returns The models mapped to `DiscoveredRepo` (`stars` carries the like
 *   count, `topics` the model tags, `description` is always `null`). Entries
 *   without any identifier are skipped. Any failure is logged and yields an
 *   empty array; this function never rejects.
 */
async function searchHuggingFace(tag: string): Promise<DiscoveredRepo[]> {
  // Only the fields this function actually reads from each listed model.
  interface HfModelItem {
    id?: string;
    modelId?: string;
    likes?: number;
    tags?: string[];
  }

  try {
    console.log(`π€ Searching Hugging Face for tag: ${tag}`);
    const response = await axios.get<HfModelItem[]>(`${HF_API}/models`, {
      params: { filter: tag, sort: 'likes', direction: -1, limit: 20 },
    });

    const models = response.data;
    if (!Array.isArray(models)) {
      throw new Error('unexpected response shape (expected an array)');
    }

    const repos: DiscoveredRepo[] = [];
    for (const model of models) {
      // `modelId` is what this script has always read; fall back to `id` so an
      // entry that only carries `id` is not turned into ".../undefined".
      // NOTE(review): confirm which of the two fields the API guarantees.
      const identifier = model.modelId ?? model.id;
      if (!identifier) {
        continue;
      }

      repos.push({
        name: identifier,
        url: `https://huggingface.co/${identifier}`,
        description: null, // HF API doesn't always return description in list
        stars: model.likes ?? 0,
        source: 'huggingface',
        topics: model.tags,
      });
    }
    return repos;
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances have a usable `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`β Hugging Face search failed: ${message}`);
    return [];
  }
}
/**
 * Runs the whole discovery pass and writes its two output files.
 *
 * Steps, in order:
 *  1. fetch the repositories of the `alexandrainst` GitHub account;
 *  2. run a fixed list of GitHub searches, pausing one second after each;
 *  3. run a fixed list of Hugging Face tag searches;
 *  4. de-duplicate everything by URL;
 *  5. write the full list to `data/danish_osint_discovery.json`;
 *  6. write an ingestion snippet for the 20 most-starred GitHub entries to
 *     `data/ingest_snippet.json`.
 *
 * Both paths are resolved against the current working directory.
 *
 * @returns A promise that resolves once both files are written. It rejects if
 *   creating the directory or writing either file fails; the search helpers
 *   themselves log their own failures and contribute no entries.
 */
async function main() {
  // NOTE(review): the leading glyphs in the log strings below look like
  // mis-encoded emoji; they are left untouched here — confirm against the
  // original file encoding before changing them.
  console.log('π Starting Danish OSINT Discovery...');
  const allRepos: DiscoveredRepo[] = [];

  // 1. Alexandra Institute Repos
  const alexandraRepos = await getGithubOrgRepos('alexandrainst');
  allRepos.push(...alexandraRepos);

  // 2. GitHub Searches
  const ghQueries = [
    'topic:danish',
    'topic:denmark',
    'language:Danish',
    '"Danish NLP"',
    'dk-bert'
  ];
  for (const q of ghQueries) {
    const results = await searchGithub(q);
    allRepos.push(...results);
    // Sleep briefly to be nice to API
    await new Promise(r => setTimeout(r, 1000));
  }

  // 3. Hugging Face Searches
  const hfTags = ['da', 'danish'];
  for (const tag of hfTags) {
    const results = await searchHuggingFace(tag);
    allRepos.push(...results);
  }

  // Deduplicate by URL. For a repeated URL the Map keeps the position of the
  // first occurrence and the data of the last one.
  const uniqueRepos = Array.from(new Map(allRepos.map(item => [item.url, item])).values());
  // The line break inside this template literal is part of the logged text,
  // which is why the continuation line starts at column 0.
  console.log(`\nβ
Found ${uniqueRepos.length} unique resources.`);

  // Output to file
  const outputPath = path.resolve(process.cwd(), 'data', 'danish_osint_discovery.json');
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  await fs.writeFile(outputPath, JSON.stringify(uniqueRepos, null, 2));
  console.log(`πΎ Saved results to ${outputPath}`);

  // Generate a snippet for ingest_curated_repos.ts
  const ingestSnippet = uniqueRepos
    .filter(r => r.source === 'github') // Only GitHub for now for the ingest script
    // Rank by stars so the slice below really is the "top 20"; `filter` has
    // already produced a fresh array, so sorting in place does not touch
    // `uniqueRepos`.
    .sort((a, b) => b.stars - a.stars)
    .slice(0, 20) // Top 20 for example
    .map(r => ({
      // Replace every slash, not just the first, so the key never contains one.
      key: r.name.replace(/\//g, '-').toLowerCase(),
      url: r.url + '.git',
      files: [{ title: r.name, relPath: 'README.md', category: 'danish-osint' }]
    }));
  const snippetPath = path.resolve(process.cwd(), 'data', 'ingest_snippet.json');
  await fs.writeFile(snippetPath, JSON.stringify(ingestSnippet, null, 2));
  console.log(`π Generated ingestion snippet for top 20 GitHub repos at ${snippetPath}`);
}
// Run the script. An unhandled failure is printed and also recorded in the
// exit code, so shells and CI jobs can tell that the run did not succeed.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|