import fs from 'fs'
import path from 'path'
import { PDFParse } from 'pdf-parse'
import OpenAI from 'openai'

/** Directory that is scanned for input PDF files. */
const PDF_DIR = path.join(process.cwd(), 'data/pdfs')
/** Path the merged benchmark JSON is written to. */
const OUTPUT = path.join(process.cwd(), 'data/benchmarks.json')
/** Maximum number of characters of extracted text sent to the model per request. */
const CHUNK_SIZE = 8000

// Chat-completions client; the base URL can be overridden with OLLAMA_BASE_URL
// and otherwise points at http://localhost:11434/v1.
const client = new OpenAI({
  baseURL: process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434/v1',
  apiKey: 'ollama',
})

/** Model name passed to every completion request; overridable with LLM_MODEL. */
const model = process.env.LLM_MODEL ?? 'llama3.1:8b'

/** Metric sections that carry one value per performance band. */
const METRICS = ['deploymentFrequency', 'leadTime', 'changeFailureRate', 'mttr'] as const
/** Performance bands expected inside each metric section. */
const BANDS = ['elite', 'high', 'medium', 'low'] as const

/** System prompt describing the JSON structure the model is asked to return. */
const EXTRACTION_SYSTEM_PROMPT = `You are extracting DevOps benchmark data from State of DevOps Reports. Extract any benchmark data you find about: - Deployment frequency (elite/high/medium/low performers) - Lead time for changes - Change failure rate - Mean time to restore (MTTR) - Patterns (combinations of metrics and what they indicate) - Key insights and statistics Return ONLY valid JSON with this structure: { "deploymentFrequency": { "elite": "...", "high": "...", "medium": "...", "low": "..." }, "leadTime": { "elite": "...", "high": "...", "medium": "...", "low": "..." }, "changeFailureRate": { "elite": "...", "high": "...", "medium": "...", "low": "..." }, "mttr": { "elite": "...", "high": "...", "medium": "...", "low": "..." }, "patterns": [{ "id": "...", "signature": "...", "interpretation": "...", "improvements": ["..."] }], "keyInsights": ["..."] } If a section has no data in this chunk, use null for that field. 
Return empty arrays for patterns/keyInsights if none found.`

/**
 * Splits `text` into consecutive pieces of at most `size` characters.
 *
 * @param text - Text to split; an empty string yields an empty array.
 * @param size - Maximum piece length; must be a positive integer.
 * @returns The pieces in order; joining them reproduces `text`.
 * @throws RangeError if `size` is not a positive integer (a zero or negative
 *   step would never advance the loop).
 */
function splitIntoChunks(text: string, size: number): string[] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`)
  }
  const chunks: string[] = []
  for (let i = 0; i < text.length; i += size) {
    chunks.push(text.slice(i, i + size))
  }
  return chunks
}

/**
 * Builds the deduplication key for a pattern object: its `id` when that is a
 * non-empty string or a number, otherwise its serialized content.
 */
function patternKey(pattern: Record<string, unknown>): string {
  const id = pattern.id
  if ((typeof id === 'string' && id !== '') || typeof id === 'number') {
    // JSON.stringify keeps the string "1" and the number 1 distinct.
    return `id:${JSON.stringify(id)}`
  }
  return `content:${JSON.stringify(pattern)}`
}

/**
 * Merges per-chunk extraction results into a single benchmark object.
 *
 * For every metric/band pair the first truthy value encountered wins.
 * `patterns` and `keyInsights` are concatenated in encounter order with
 * duplicates removed: patterns by `id` (or by content when they have no usable
 * id), insights by exact string.
 *
 * The inputs are parsed model output, so malformed entries are skipped rather
 * than trusted: falsy extractions, non-object patterns and non-string or blank
 * insights are ignored.
 *
 * @param extractions - Parsed chunk results, in processing order.
 * @returns An object with the four metric sections (bands default to `''`),
 *   `patterns` and `keyInsights`.
 */
function mergeExtractions(extractions: any[]): any {
  const emptyBands = () => ({ elite: '', high: '', medium: '', low: '' })
  const result = {
    deploymentFrequency: emptyBands(),
    leadTime: emptyBands(),
    changeFailureRate: emptyBands(),
    mttr: emptyBands(),
    patterns: [] as any[],
    keyInsights: [] as string[],
  }
  const seenPatterns = new Set<string>()
  const seenInsights = new Set<string>()

  for (const ext of extractions) {
    if (!ext) continue

    for (const metric of METRICS) {
      const bands = ext[metric]
      if (!bands) continue
      for (const band of BANDS) {
        // Earlier chunks take precedence; only fill slots that are still empty.
        if (bands[band] && !result[metric][band]) {
          result[metric][band] = bands[band]
        }
      }
    }

    if (Array.isArray(ext.patterns)) {
      for (const p of ext.patterns) {
        if (p === null || typeof p !== 'object' || Array.isArray(p)) continue
        const key = patternKey(p)
        if (seenPatterns.has(key)) continue
        seenPatterns.add(key)
        result.patterns.push(p)
      }
    }

    if (Array.isArray(ext.keyInsights)) {
      for (const insight of ext.keyInsights) {
        if (typeof insight !== 'string' || insight.trim() === '') continue
        if (seenInsights.has(insight)) continue
        seenInsights.add(insight)
        result.keyInsights.push(insight)
      }
    }
  }

  return result
}

/**
 * Asks the model to extract benchmark data from one chunk of report text.
 *
 * The reply of the first choice is searched for the span from its first `{`
 * to its last `}` (the model may wrap the JSON in markdown) and that span is
 * parsed as JSON.
 *
 * @param chunk - Text to send as the user message.
 * @returns The parsed object, or `{}` when the reply contains no braces or the
 *   span is not valid JSON (a warning is logged in the latter case).
 *   Errors from the completion request itself are not caught here.
 */
async function extractFromChunk(chunk: string): Promise<Record<string, unknown>> {
  const response = await client.chat.completions.create({
    model,
    messages: [
      { role: 'system', content: EXTRACTION_SYSTEM_PROMPT },
      { role: 'user', content: `Extract benchmark data from this text:\n\n${chunk}` },
    ],
  })

  const content = response.choices[0]?.message?.content ?? '{}'

  try {
    // Extract JSON from response (model may wrap it in markdown)
    const jsonMatch = content.match(/\{[\s\S]*\}/)
    return jsonMatch ? JSON.parse(jsonMatch[0]) : {}
  } catch {
    console.warn('Failed to parse chunk response, skipping')
    return {}
  }
}

/**
 * Script entry point: reads every PDF in `PDF_DIR`, extracts its text, sends
 * the combined text to the model chunk by chunk, merges the results and writes
 * them to `OUTPUT`.
 *
 * Exits the process with status 1 when there are no PDFs to process.
 */
async function main() {
  // A missing directory is treated like an empty one so the user sees the
  // "add PDF files" hint instead of a raw ENOENT error.
  const files = fs.existsSync(PDF_DIR)
    ? fs.readdirSync(PDF_DIR).filter(f => f.toLowerCase().endsWith('.pdf'))
    : []

  if (files.length === 0) {
    console.log('No PDFs found in data/pdfs/. Please add PDF files first.')
    process.exit(1)
  }

  console.log(`Found ${files.length} PDF(s): ${files.join(', ')}`)

  let allText = ''
  for (const file of files) {
    console.log(`Extracting text from ${file}...`)
    const buffer = fs.readFileSync(path.join(PDF_DIR, file))
    const parser = new PDFParse({ data: new Uint8Array(buffer) })
    try {
      const result = await parser.getText()
      allText += `\n\n--- ${file} ---\n${result.text}`
    } finally {
      // Release the parser even when text extraction fails.
      await parser.destroy()
    }
  }

  const chunks = splitIntoChunks(allText, CHUNK_SIZE)
  console.log(`Processing ${chunks.length} chunks...`)

  // Chunks are sent one request at a time, in document order.
  const extractions: any[] = []
  for (const [i, chunk] of chunks.entries()) {
    console.log(` Chunk ${i + 1}/${chunks.length}...`)
    extractions.push(await extractFromChunk(chunk))
  }

  const merged = mergeExtractions(extractions)
  fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2))

  console.log(`\nSaved benchmarks.json with:`)
  console.log(` - ${merged.patterns.length} patterns`)
  console.log(` - ${merged.keyInsights.length} key insights`)
}

main().catch(err => {
  console.error(err)
  // Report the failure to the caller instead of exiting with status 0.
  process.exitCode = 1
})