Spaces:
Sleeping
Sleeping
| import fs from 'fs' | |
| import path from 'path' | |
| import { PDFParse } from 'pdf-parse' | |
| import OpenAI from 'openai' | |
// Input directory of source PDFs and the JSON file the merged benchmarks are written to.
const PDF_DIR = path.join(process.cwd(), 'data/pdfs')
const OUTPUT = path.join(process.cwd(), 'data/benchmarks.json')
// Characters per chunk of extracted PDF text sent to the model in one request.
const CHUNK_SIZE = 8000
// OpenAI-compatible client pointed at a local Ollama server by default.
// NOTE(review): apiKey looks like a placeholder — presumably the local server
// does not validate it, but the SDK requires a non-empty value; confirm.
const client = new OpenAI({
  baseURL: process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434/v1',
  apiKey: 'ollama',
})
// Model name served by Ollama; overridable via LLM_MODEL.
const model = process.env.LLM_MODEL ?? 'llama3.1:8b'
// System prompt instructing the model to answer with ONLY the JSON schema below.
// mergeExtractions() depends on these exact field names (deploymentFrequency,
// leadTime, changeFailureRate, mttr, patterns, keyInsights) — keep them in sync.
const EXTRACTION_SYSTEM_PROMPT = `You are extracting DevOps benchmark data from State of DevOps Reports.
Extract any benchmark data you find about:
- Deployment frequency (elite/high/medium/low performers)
- Lead time for changes
- Change failure rate
- Mean time to restore (MTTR)
- Patterns (combinations of metrics and what they indicate)
- Key insights and statistics
Return ONLY valid JSON with this structure:
{
  "deploymentFrequency": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "leadTime": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "changeFailureRate": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "mttr": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "patterns": [{ "id": "...", "signature": "...", "interpretation": "...", "improvements": ["..."] }],
  "keyInsights": ["..."]
}
If a section has no data in this chunk, use null for that field. Return empty arrays for patterns/keyInsights if none found.`
| function splitIntoChunks(text: string, size: number): string[] { | |
| const chunks: string[] = [] | |
| for (let i = 0; i < text.length; i += size) { | |
| chunks.push(text.slice(i, i + size)) | |
| } | |
| return chunks | |
| } | |
| function mergeExtractions(extractions: any[]): any { | |
| // Merge by taking the most specific (non-null) value for each tier field | |
| // Concatenate patterns and keyInsights arrays, deduplicating by id/content | |
| const result = { | |
| deploymentFrequency: { elite: '', high: '', medium: '', low: '' }, | |
| leadTime: { elite: '', high: '', medium: '', low: '' }, | |
| changeFailureRate: { elite: '', high: '', medium: '', low: '' }, | |
| mttr: { elite: '', high: '', medium: '', low: '' }, | |
| patterns: [] as any[], | |
| keyInsights: [] as string[], | |
| } | |
| for (const ext of extractions) { | |
| if (!ext) continue | |
| for (const metric of ['deploymentFrequency', 'leadTime', 'changeFailureRate', 'mttr'] as const) { | |
| if (ext[metric]) { | |
| for (const band of ['elite', 'high', 'medium', 'low'] as const) { | |
| if (ext[metric][band] && !result[metric][band]) { | |
| result[metric][band] = ext[metric][band] | |
| } | |
| } | |
| } | |
| } | |
| if (Array.isArray(ext.patterns)) { | |
| for (const p of ext.patterns) { | |
| if (!result.patterns.find((existing: any) => existing.id === p.id)) { | |
| result.patterns.push(p) | |
| } | |
| } | |
| } | |
| if (Array.isArray(ext.keyInsights)) { | |
| for (const insight of ext.keyInsights) { | |
| if (!result.keyInsights.includes(insight)) { | |
| result.keyInsights.push(insight) | |
| } | |
| } | |
| } | |
| } | |
| return result | |
| } | |
| async function extractFromChunk(chunk: string): Promise<any> { | |
| const response = await client.chat.completions.create({ | |
| model, | |
| messages: [ | |
| { role: 'system', content: EXTRACTION_SYSTEM_PROMPT }, | |
| { role: 'user', content: `Extract benchmark data from this text:\n\n${chunk}` }, | |
| ], | |
| }) | |
| const content = response.choices[0]?.message?.content ?? '{}' | |
| try { | |
| // Extract JSON from response (model may wrap it in markdown) | |
| const jsonMatch = content.match(/\{[\s\S]*\}/) | |
| return jsonMatch ? JSON.parse(jsonMatch[0]) : {} | |
| } catch { | |
| console.warn('Failed to parse chunk response, skipping') | |
| return {} | |
| } | |
| } | |
| async function main() { | |
| const files = fs.readdirSync(PDF_DIR).filter(f => f.endsWith('.pdf')) | |
| if (files.length === 0) { | |
| console.log('No PDFs found in data/pdfs/. Please add PDF files first.') | |
| process.exit(1) | |
| } | |
| console.log(`Found ${files.length} PDF(s): ${files.join(', ')}`) | |
| let allText = '' | |
| for (const file of files) { | |
| console.log(`Extracting text from ${file}...`) | |
| const buffer = fs.readFileSync(path.join(PDF_DIR, file)) | |
| const parser = new PDFParse({ data: new Uint8Array(buffer) }) | |
| const result = await parser.getText() | |
| allText += `\n\n--- ${file} ---\n${result.text}` | |
| await parser.destroy() | |
| } | |
| const chunks = splitIntoChunks(allText, CHUNK_SIZE) | |
| console.log(`Processing ${chunks.length} chunks...`) | |
| const extractions: any[] = [] | |
| for (let i = 0; i < chunks.length; i++) { | |
| console.log(` Chunk ${i + 1}/${chunks.length}...`) | |
| const result = await extractFromChunk(chunks[i]) | |
| extractions.push(result) | |
| } | |
| const merged = mergeExtractions(extractions) | |
| fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2)) | |
| console.log(`\nSaved benchmarks.json with:`) | |
| console.log(` - ${merged.patterns.length} patterns`) | |
| console.log(` - ${merged.keyInsights.length} key insights`) | |
| } | |
// Entry point: log (rather than crash the process on) any unhandled rejection.
main().catch(console.error)