Spaces:

rdlf
/

devops-metrics-interpreter

Sleeping

File size: 4,976 Bytes

0e13326

import fs from 'fs'
import path from 'path'
import { PDFParse } from 'pdf-parse'
import OpenAI from 'openai'

// Directory scanned for input PDFs and the JSON file the merged result is
// written to. Both are resolved against the directory the script is started from.
const PDF_DIR = path.join(process.cwd(), 'data/pdfs')
const OUTPUT = path.join(process.cwd(), 'data/benchmarks.json')
// Target size (in UTF-16 code units) of each text chunk sent to the model.
const CHUNK_SIZE = 8000

// OpenAI-compatible client; defaults to an Ollama endpoint on localhost.
// `||` rather than `??` is deliberate here: an environment variable that is set
// but blank (e.g. `OLLAMA_BASE_URL=` in a .env file) must fall back to the
// default instead of producing an empty base URL or model name.
const client = new OpenAI({
  baseURL: process.env.OLLAMA_BASE_URL?.trim() || 'http://localhost:11434/v1',
  // Placeholder value — presumably ignored by the Ollama endpoint; confirm
  // before pointing this at a server that enforces authentication.
  apiKey: 'ollama',
})
const model = process.env.LLM_MODEL?.trim() || 'llama3.1:8b'

/**
 * System prompt sent with every chunk by `extractFromChunk`.
 *
 * It fixes the JSON shape the model is asked to return: four metric objects
 * keyed by performance tier (`elite`/`high`/`medium`/`low`) plus the
 * `patterns` and `keyInsights` arrays. `mergeExtractions` reads exactly these
 * field names, so any rename here must be mirrored there.
 */
const EXTRACTION_SYSTEM_PROMPT = `You are extracting DevOps benchmark data from State of DevOps Reports.
Extract any benchmark data you find about:
- Deployment frequency (elite/high/medium/low performers)
- Lead time for changes
- Change failure rate
- Mean time to restore (MTTR)
- Patterns (combinations of metrics and what they indicate)
- Key insights and statistics

Return ONLY valid JSON with this structure:
{
  "deploymentFrequency": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "leadTime": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "changeFailureRate": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "mttr": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "patterns": [{ "id": "...", "signature": "...", "interpretation": "...", "improvements": ["..."] }],
  "keyInsights": ["..."]
}

If a section has no data in this chunk, use null for that field. Return empty arrays for patterns/keyInsights if none found.`

/**
 * Splits `text` into consecutive chunks of about `size` UTF-16 code units.
 *
 * Concatenating the returned chunks always reproduces `text` exactly. A chunk
 * is extended by one code unit when cutting at `size` would separate a
 * surrogate pair, so characters outside the BMP (emoji, some CJK) are never
 * torn in half; such a chunk is `size + 1` units long.
 *
 * @param text - The string to split. An empty string yields an empty array.
 * @param size - Chunk length; must be at least 1. Fractional values are
 *   rounded down.
 * @returns The chunks in source order.
 * @throws RangeError if `size` is NaN or smaller than 1 (a zero or negative
 *   step would otherwise never advance and loop forever).
 */
function splitIntoChunks(text: string, size: number): string[] {
  // `!(size >= 1)` rejects NaN as well as zero and negative numbers.
  if (!(size >= 1)) {
    throw new RangeError(`Chunk size must be at least 1, got ${size}`)
  }
  const step = Math.floor(size)

  const chunks: string[] = []
  let start = 0
  while (start < text.length) {
    let end = Math.min(start + step, text.length)
    if (end < text.length) {
      const last = text.charCodeAt(end - 1)
      const next = text.charCodeAt(end)
      const splitsSurrogatePair =
        last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff
      // Pull the low surrogate into this chunk rather than orphaning it.
      if (splitsSurrogatePair) end += 1
    }
    chunks.push(text.slice(start, end))
    start = end
  }
  return chunks
}

/**
 * Converts a raw tier value from a model response into the text stored in the
 * merged result. Strings are trimmed, finite numbers are stringified, and
 * anything else (null, objects, booleans, ...) becomes '' meaning "no value".
 */
function normalizeBenchmarkValue(value: unknown): string {
  if (typeof value === 'string') return value.trim()
  if (typeof value === 'number' && Number.isFinite(value)) return String(value)
  return ''
}

/**
 * Merges the per-chunk extraction objects into a single benchmark record.
 *
 * - Metric tiers: for every metric/tier pair the first usable value
 *   encountered (in `extractions` order) wins; later chunks never overwrite it.
 *   Tiers that no chunk supplies stay as the empty string.
 * - `patterns`: concatenated in order. Entries with a usable `id` are
 *   de-duplicated by that id; entries without one are de-duplicated by their
 *   full JSON content, so distinct id-less patterns are all kept.
 * - `keyInsights`: concatenated in order, trimmed, de-duplicated by text.
 *
 * Malformed pieces of model output (null entries, non-object patterns,
 * non-string insights) are skipped instead of throwing.
 *
 * @param extractions - Parsed model responses; null/undefined/non-object
 *   entries are ignored.
 * @returns An object with `deploymentFrequency`, `leadTime`,
 *   `changeFailureRate` and `mttr` (each `{ elite, high, medium, low }`
 *   strings) plus the `patterns` and `keyInsights` arrays.
 */
function mergeExtractions(extractions: any[]): any {
  const metrics = ['deploymentFrequency', 'leadTime', 'changeFailureRate', 'mttr'] as const
  const bands = ['elite', 'high', 'medium', 'low'] as const
  const emptyBands = () => ({ elite: '', high: '', medium: '', low: '' })

  const result = {
    deploymentFrequency: emptyBands(),
    leadTime: emptyBands(),
    changeFailureRate: emptyBands(),
    mttr: emptyBands(),
    patterns: [] as any[],
    keyInsights: [] as string[],
  }

  // Sets give O(1) duplicate checks instead of rescanning the result arrays.
  const seenPatterns = new Set<string>()
  const seenInsights = new Set<string>()

  for (const ext of extractions) {
    if (!ext || typeof ext !== 'object') continue

    for (const metric of metrics) {
      const tiers = ext[metric]
      if (!tiers || typeof tiers !== 'object') continue
      for (const band of bands) {
        if (result[metric][band]) continue // first value wins
        const value = normalizeBenchmarkValue(tiers[band])
        if (value) result[metric][band] = value
      }
    }

    if (Array.isArray(ext.patterns)) {
      for (const pattern of ext.patterns) {
        // The model may emit null or bare strings inside the array.
        if (!pattern || typeof pattern !== 'object') continue
        const id = pattern.id
        const hasId = (typeof id === 'string' && id.trim() !== '') || typeof id === 'number'
        // Without an id, fall back to the content so that unrelated id-less
        // patterns are not mistaken for duplicates of each other.
        const key = JSON.stringify(hasId ? { id } : { content: pattern })
        if (seenPatterns.has(key)) continue
        seenPatterns.add(key)
        result.patterns.push(pattern)
      }
    }

    if (Array.isArray(ext.keyInsights)) {
      for (const insight of ext.keyInsights) {
        if (typeof insight !== 'string') continue
        const text = insight.trim()
        if (text === '' || seenInsights.has(text)) continue
        seenInsights.add(text)
        result.keyInsights.push(text)
      }
    }
  }

  return result
}

/**
 * Sends one chunk of report text to the model and returns the JSON object
 * found in its reply.
 *
 * @param chunk - The slice of extracted PDF text to analyse.
 * @returns The parsed object, or `{}` when the reply has no brace-delimited
 *   span or that span is not valid JSON (a warning is logged for the latter).
 *   Errors raised by the API call itself are not caught here.
 */
async function extractFromChunk(chunk: string): Promise<any> {
  const userPrompt = `Extract benchmark data from this text:\n\n${chunk}`

  const completion = await client.chat.completions.create({
    model,
    messages: [
      { role: 'system', content: EXTRACTION_SYSTEM_PROMPT },
      { role: 'user', content: userPrompt },
    ],
  })

  // A missing choice or message is treated as an empty JSON object.
  const reply = completion.choices[0]?.message?.content ?? '{}'

  try {
    // The model may surround the JSON with prose or a markdown fence, so take
    // everything from the first '{' through the last '}'.
    const braced = reply.match(/\{[\s\S]*\}/)
    if (!braced) {
      return {}
    }
    return JSON.parse(braced[0])
  } catch {
    console.warn('Failed to parse chunk response, skipping')
    return {}
  }
}

/**
 * Runs the whole pipeline: reads every PDF in `PDF_DIR`, extracts its text,
 * splits the combined text into chunks, asks the model to extract benchmark
 * data from each chunk in turn, merges the answers and writes them to `OUTPUT`.
 *
 * Exits the process with status 1 when the directory contains no PDF files.
 * Any file-system, PDF-parsing or API error rejects the returned promise.
 */
async function main() {
  // Compare the extension case-insensitively so files such as `Report.PDF`
  // are not silently ignored.
  const files = fs.readdirSync(PDF_DIR).filter(f => f.toLowerCase().endsWith('.pdf'))

  if (files.length === 0) {
    console.log('No PDFs found in data/pdfs/. Please add PDF files first.')
    process.exit(1)
  }

  console.log(`Found ${files.length} PDF(s): ${files.join(', ')}`)

  let allText = ''
  for (const file of files) {
    console.log(`Extracting text from ${file}...`)
    const buffer = fs.readFileSync(path.join(PDF_DIR, file))
    const parser = new PDFParse({ data: new Uint8Array(buffer) })
    try {
      const result = await parser.getText()
      // The separator line records which file the following text came from.
      allText += `\n\n--- ${file} ---\n${result.text}`
    } finally {
      // Release the parser even when text extraction fails.
      await parser.destroy()
    }
  }

  const chunks = splitIntoChunks(allText, CHUNK_SIZE)
  console.log(`Processing ${chunks.length} chunks...`)

  // Chunks are sent one at a time, in order; merge order follows chunk order.
  const extractions: any[] = []
  for (let i = 0; i < chunks.length; i++) {
    console.log(`  Chunk ${i + 1}/${chunks.length}...`)
    const result = await extractFromChunk(chunks[i])
    extractions.push(result)
  }

  const merged = mergeExtractions(extractions)
  fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2))
  console.log(`\nSaved benchmarks.json with:`)
  console.log(`  - ${merged.patterns.length} patterns`)
  console.log(`  - ${merged.keyInsights.length} key insights`)
}

// Log any failure and mark the run as failed. Logging alone would let the
// process finish with exit status 0, hiding the error from shells and CI.
main().catch((err: unknown) => {
  console.error(err)
  process.exitCode = 1
})