devops-metrics-interpreter / scripts /extract-knowledge.ts
ruben de la fuente
feat: initial deployment to HuggingFace Spaces
0e13326
import fs from 'fs'
import path from 'path'
import { PDFParse } from 'pdf-parse'
import OpenAI from 'openai'
// Directory scanned for input PDF files, resolved against the current working directory.
const PDF_DIR = path.join(process.cwd(), 'data/pdfs')
// Destination file for the merged benchmark JSON, also resolved against the working directory.
const OUTPUT = path.join(process.cwd(), 'data/benchmarks.json')
// Maximum length (in string-length units) of each slice of extracted text sent to the model.
const CHUNK_SIZE = 8000
// OpenAI SDK client pointed at OLLAMA_BASE_URL when that variable is set,
// otherwise at http://localhost:11434/v1.
const client = new OpenAI({
baseURL: process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434/v1',
// NOTE(review): fixed placeholder key — presumably the local server ignores it; confirm.
apiKey: 'ollama',
})
// Model name passed to every chat completion request; overridable through LLM_MODEL.
const model = process.env.LLM_MODEL ?? 'llama3.1:8b'
// System prompt sent with every chunk by extractFromChunk. It asks the model to
// reply with JSON only, using the keys that mergeExtractions reads: four metric
// objects keyed by performance tier (elite/high/medium/low) plus the `patterns`
// and `keyInsights` arrays. The prompt text is runtime input to the model, so
// any wording change alters extraction behavior.
const EXTRACTION_SYSTEM_PROMPT = `You are extracting DevOps benchmark data from State of DevOps Reports.
Extract any benchmark data you find about:
- Deployment frequency (elite/high/medium/low performers)
- Lead time for changes
- Change failure rate
- Mean time to restore (MTTR)
- Patterns (combinations of metrics and what they indicate)
- Key insights and statistics
Return ONLY valid JSON with this structure:
{
"deploymentFrequency": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"leadTime": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"changeFailureRate": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"mttr": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"patterns": [{ "id": "...", "signature": "...", "interpretation": "...", "improvements": ["..."] }],
"keyInsights": ["..."]
}
If a section has no data in this chunk, use null for that field. Return empty arrays for patterns/keyInsights if none found.`
/**
 * Splits `text` into consecutive slices of at most `size` string-length units.
 *
 * Slices are taken in order and never overlap, so joining the result
 * reproduces `text`. An empty `text` yields an empty array.
 *
 * @param text - The text to split.
 * @param size - Maximum length of each slice; must be greater than zero.
 * @returns The slices in their original order.
 * @throws RangeError when `size` is zero, negative, or NaN.
 */
function splitIntoChunks(text: string, size: number): string[] {
  // A zero or negative step never advances the cursor (endless loop), and NaN
  // produces a single bogus empty chunk, so reject all of them up front.
  if (!(size > 0)) {
    throw new RangeError(`Chunk size must be a positive number, got ${size}`)
  }

  const chunks: string[] = []
  for (let start = 0; start < text.length; start += size) {
    chunks.push(text.slice(start, start + size))
  }
  return chunks
}
/**
 * Combines per-chunk extraction results into a single benchmark object.
 *
 * - Tier fields (`elite`/`high`/`medium`/`low`) of each metric keep the FIRST
 *   truthy value encountered, in the order of `extractions`; fields that
 *   never receive a value stay as empty strings.
 * - `patterns` are concatenated. A pattern with an `id` is kept once per id
 *   (first occurrence wins). A pattern without a usable `id` (missing, null
 *   or empty string) is deduplicated by its serialized content instead, so
 *   distinct id-less patterns are all preserved.
 * - `keyInsights` are concatenated with exact duplicates removed.
 *
 * Malformed input is skipped rather than thrown on: falsy or non-object
 * extractions, non-object metric sections, and null/non-object pattern entries.
 *
 * @param extractions - Parsed model responses, one per chunk.
 * @returns The merged object with the four metrics, `patterns` and `keyInsights`.
 */
function mergeExtractions(extractions: any[]): any {
  const metrics = ['deploymentFrequency', 'leadTime', 'changeFailureRate', 'mttr'] as const
  const bands = ['elite', 'high', 'medium', 'low'] as const

  const result = {
    deploymentFrequency: { elite: '', high: '', medium: '', low: '' },
    leadTime: { elite: '', high: '', medium: '', low: '' },
    changeFailureRate: { elite: '', high: '', medium: '', low: '' },
    mttr: { elite: '', high: '', medium: '', low: '' },
    patterns: [] as any[],
    keyInsights: [] as string[],
  }

  // Lookup sets keep deduplication linear instead of rescanning the output arrays.
  const seenPatternIds = new Set<unknown>()
  const seenAnonymousPatterns = new Set<string>()
  const seenInsights = new Set<unknown>()

  for (const ext of extractions) {
    if (!ext || typeof ext !== 'object') continue

    for (const metric of metrics) {
      const tiers = ext[metric]
      if (!tiers || typeof tiers !== 'object') continue
      for (const band of bands) {
        // First non-empty value wins; later chunks never overwrite it.
        if (tiers[band] && !result[metric][band]) {
          result[metric][band] = tiers[band]
        }
      }
    }

    if (Array.isArray(ext.patterns)) {
      for (const pattern of ext.patterns) {
        // Model output may contain null or scalar entries; they carry no pattern data.
        if (!pattern || typeof pattern !== 'object') continue

        const id = pattern.id
        if (id !== undefined && id !== null && id !== '') {
          if (seenPatternIds.has(id)) continue
          seenPatternIds.add(id)
        } else {
          // Without an id, comparing ids would collapse every id-less pattern
          // into the first one, so compare by content instead.
          const fingerprint = JSON.stringify(pattern)
          if (seenAnonymousPatterns.has(fingerprint)) continue
          seenAnonymousPatterns.add(fingerprint)
        }
        result.patterns.push(pattern)
      }
    }

    if (Array.isArray(ext.keyInsights)) {
      for (const insight of ext.keyInsights) {
        if (seenInsights.has(insight)) continue
        seenInsights.add(insight)
        result.keyInsights.push(insight)
      }
    }
  }

  return result
}
/**
 * Sends one text chunk to the chat model together with the extraction system
 * prompt and returns the JSON object found in the reply.
 *
 * The reply is searched for the span from its first `{` to its last `}`, so
 * JSON wrapped in markdown or surrounding prose is still picked up. When no
 * such span exists an empty object is returned; when the span is not valid
 * JSON a warning is logged and an empty object is returned. Errors from the
 * request itself are not caught here.
 *
 * @param chunk - Text to extract benchmark data from.
 * @returns The parsed reply, or `{}` when nothing parseable was returned.
 */
async function extractFromChunk(chunk: string): Promise<any> {
  const systemMessage = { role: 'system' as const, content: EXTRACTION_SYSTEM_PROMPT }
  const userMessage = {
    role: 'user' as const,
    content: `Extract benchmark data from this text:\n\n${chunk}`,
  }

  const completion = await client.chat.completions.create({
    model,
    messages: [systemMessage, userMessage],
  })

  // A missing reply is treated as an empty JSON object.
  const reply = completion.choices[0]?.message?.content ?? '{}'

  try {
    const braced = reply.match(/\{[\s\S]*\}/)
    if (!braced) {
      return {}
    }
    return JSON.parse(braced[0])
  } catch {
    console.warn('Failed to parse chunk response, skipping')
    return {}
  }
}
/**
 * Script entry point: reads every PDF in PDF_DIR, extracts its text, sends the
 * combined text to the model chunk by chunk, merges the per-chunk results and
 * writes them to OUTPUT as pretty-printed JSON.
 *
 * Exits the process with status 1 when PDF_DIR contains no PDF files.
 * Errors from file access, PDF parsing or the model request propagate to the
 * caller as a rejected promise.
 */
async function main() {
  // Match the extension case-insensitively so files such as `Report.PDF` are
  // included, and sort because directory listing order is platform-dependent
  // while the merge step keeps the first value it sees.
  const files = fs
    .readdirSync(PDF_DIR)
    .filter(f => f.toLowerCase().endsWith('.pdf'))
    .sort()

  if (files.length === 0) {
    console.log('No PDFs found in data/pdfs/. Please add PDF files first.')
    process.exit(1)
  }
  console.log(`Found ${files.length} PDF(s): ${files.join(', ')}`)

  let allText = ''
  for (const file of files) {
    console.log(`Extracting text from ${file}...`)
    const buffer = fs.readFileSync(path.join(PDF_DIR, file))
    const parser = new PDFParse({ data: new Uint8Array(buffer) })
    try {
      const result = await parser.getText()
      // Prefix each document with a marker line naming its source file.
      allText += `\n\n--- ${file} ---\n${result.text}`
    } finally {
      // Release the parser even when text extraction fails.
      await parser.destroy()
    }
  }

  const chunks = splitIntoChunks(allText, CHUNK_SIZE)
  console.log(`Processing ${chunks.length} chunks...`)

  // Chunks are processed one at a time, in order.
  const extractions: any[] = []
  for (let i = 0; i < chunks.length; i++) {
    console.log(` Chunk ${i + 1}/${chunks.length}...`)
    const result = await extractFromChunk(chunks[i])
    extractions.push(result)
  }

  const merged = mergeExtractions(extractions)
  fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2))

  console.log(`\nSaved benchmarks.json with:`)
  console.log(` - ${merged.patterns.length} patterns`)
  console.log(` - ${merged.keyInsights.length} key insights`)
}
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit status instead of a silent success.
main().catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})