Spaces:
Sleeping
Sleeping
File size: 4,976 Bytes
0e13326 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | import fs from 'fs'
import path from 'path'
import { PDFParse } from 'pdf-parse'
import OpenAI from 'openai'
// Directory scanned for input PDFs, resolved against the current working directory.
const PDF_DIR = path.join(process.cwd(), 'data/pdfs')
// Path of the JSON file the merged extraction result is written to.
const OUTPUT = path.join(process.cwd(), 'data/benchmarks.json')
// Chunk length handed to splitIntoChunks; measured in string length units, not model tokens.
const CHUNK_SIZE = 8000
// OpenAI SDK client aimed at the URL in OLLAMA_BASE_URL, falling back to a localhost endpoint.
const client = new OpenAI({
baseURL: process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434/v1',
// NOTE(review): looks like a placeholder key for a local Ollama server — confirm it is never sent to a hosted API.
apiKey: 'ollama',
})
// Model name sent with every chat completion request; overridable through LLM_MODEL.
const model = process.env.LLM_MODEL ?? 'llama3.1:8b'
/**
 * System prompt sent with every chunk in extractFromChunk.
 *
 * It asks the model to reply with a single JSON object whose top-level keys
 * (deploymentFrequency, leadTime, changeFailureRate, mttr, patterns,
 * keyInsights) are the same keys mergeExtractions reads, so the two must be
 * kept in sync when either changes.
 */
const EXTRACTION_SYSTEM_PROMPT = `You are extracting DevOps benchmark data from State of DevOps Reports.
Extract any benchmark data you find about:
- Deployment frequency (elite/high/medium/low performers)
- Lead time for changes
- Change failure rate
- Mean time to restore (MTTR)
- Patterns (combinations of metrics and what they indicate)
- Key insights and statistics
Return ONLY valid JSON with this structure:
{
"deploymentFrequency": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"leadTime": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"changeFailureRate": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"mttr": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
"patterns": [{ "id": "...", "signature": "...", "interpretation": "...", "improvements": ["..."] }],
"keyInsights": ["..."]
}
If a section has no data in this chunk, use null for that field. Return empty arrays for patterns/keyInsights if none found.`
/**
 * Splits `text` into consecutive slices of at most `size` string units.
 *
 * The slices are contiguous and non-overlapping, so joining them reproduces
 * `text`. The last slice may be shorter than `size`. An empty `text` yields
 * an empty array.
 *
 * @param text - The string to split.
 * @param size - Maximum length of each chunk; must be greater than zero.
 * @returns The chunks in their original order.
 * @throws RangeError if `size` is zero, negative, or NaN.
 */
function splitIntoChunks(text: string, size: number): string[] {
  // A non-positive step never advances the loop index (endless loop with
  // unbounded memory growth) and NaN silently yields [''], so fail fast.
  if (!(size > 0)) {
    throw new RangeError(`Chunk size must be a positive number, got ${size}`)
  }
  const chunks: string[] = []
  for (let i = 0; i < text.length; i += size) {
    chunks.push(text.slice(i, i + size))
  }
  return chunks
}
/**
 * Combines per-chunk extraction objects into a single benchmark object.
 *
 * - Tier fields (elite/high/medium/low of each metric): the first truthy value
 *   seen across the extractions wins; fields nobody filled stay ''.
 * - patterns: object entries only, in first-seen order. Entries with a string
 *   or number `id` are de-duplicated by that id; entries without one are
 *   de-duplicated by their serialized content, so distinct id-less patterns
 *   are all kept.
 * - keyInsights: non-blank strings only, de-duplicated, in first-seen order.
 *
 * The inputs come from parsed model output, so malformed pieces (null
 * extractions, non-object metrics, null pattern entries, non-string insights)
 * are skipped instead of throwing.
 *
 * @param extractions - One parsed extraction per chunk; entries may be empty or malformed.
 * @returns An object with the four metric tier maps, `patterns`, and `keyInsights`.
 */
function mergeExtractions(extractions: any[]): any {
  const metrics = ['deploymentFrequency', 'leadTime', 'changeFailureRate', 'mttr'] as const
  const bands = ['elite', 'high', 'medium', 'low'] as const
  const result = {
    deploymentFrequency: { elite: '', high: '', medium: '', low: '' },
    leadTime: { elite: '', high: '', medium: '', low: '' },
    changeFailureRate: { elite: '', high: '', medium: '', low: '' },
    mttr: { elite: '', high: '', medium: '', low: '' },
    patterns: [] as any[],
    keyInsights: [] as string[],
  }
  // Sets give constant-time duplicate checks instead of rescanning the arrays.
  const seenPatternKeys = new Set<string>()
  const seenInsights = new Set<string>()

  for (const ext of extractions) {
    if (!ext || typeof ext !== 'object') continue

    for (const metric of metrics) {
      const tiers = ext[metric]
      if (!tiers || typeof tiers !== 'object') continue
      for (const band of bands) {
        // First truthy value wins; later chunks never overwrite it.
        if (tiers[band] && !result[metric][band]) {
          result[metric][band] = tiers[band]
        }
      }
    }

    if (Array.isArray(ext.patterns)) {
      for (const pattern of ext.patterns) {
        // Null or primitive entries would crash on property access and carry
        // none of the pattern fields, so they are dropped.
        if (!pattern || typeof pattern !== 'object') continue
        const id = pattern.id
        const hasUsableId = (typeof id === 'string' && id !== '') || typeof id === 'number'
        // The type is part of the key so that 1 and '1' stay distinct ids.
        const key = hasUsableId ? `id:${typeof id}:${id}` : `content:${JSON.stringify(pattern)}`
        if (seenPatternKeys.has(key)) continue
        seenPatternKeys.add(key)
        result.patterns.push(pattern)
      }
    }

    if (Array.isArray(ext.keyInsights)) {
      for (const insight of ext.keyInsights) {
        if (typeof insight !== 'string' || insight.trim() === '') continue
        if (seenInsights.has(insight)) continue
        seenInsights.add(insight)
        result.keyInsights.push(insight)
      }
    }
  }
  return result
}
/**
 * Sends one text chunk to the chat model and returns the JSON object found in
 * its reply.
 *
 * The reply is searched for the span from its first `{` to its last `}`, which
 * tolerates a model that wraps the JSON in markdown or extra prose. When no
 * such span exists, or the span is not valid JSON, an empty object is returned
 * (with a warning in the invalid-JSON case) so one bad chunk does not abort
 * the run. Errors from the request itself are not caught here.
 *
 * @param chunk - The slice of report text to analyse.
 * @returns The parsed extraction, or `{}` when nothing parseable came back.
 */
async function extractFromChunk(chunk: string): Promise<any> {
  const completion = await client.chat.completions.create({
    model,
    messages: [
      { role: 'system', content: EXTRACTION_SYSTEM_PROMPT },
      { role: 'user', content: `Extract benchmark data from this text:\n\n${chunk}` },
    ],
  })
  // A missing reply is treated as an empty JSON object.
  const reply = completion.choices[0]?.message?.content ?? '{}'

  try {
    // Greedy match: everything between the first '{' and the last '}'.
    const braceSpan = reply.match(/\{[\s\S]*\}/)
    if (!braceSpan) {
      return {}
    }
    return JSON.parse(braceSpan[0])
  } catch {
    console.warn('Failed to parse chunk response, skipping')
    return {}
  }
}
/**
 * Script entry point.
 *
 * Reads every PDF in PDF_DIR, concatenates the extracted text (each file
 * preceded by a `--- name ---` marker), splits it into CHUNK_SIZE pieces,
 * runs the model extraction on each piece in sequence, merges the results and
 * writes them to OUTPUT as pretty-printed JSON.
 *
 * Exits the process with code 1 when there are no PDFs to process. Any other
 * failure (unreadable file, PDF parse error, failed model request) rejects
 * the returned promise.
 */
async function main(): Promise<void> {
  // A missing directory is reported the same way as an empty one rather than
  // surfacing a raw ENOENT. The extension check ignores case so that files
  // named like "report.PDF" are picked up too.
  const files = fs.existsSync(PDF_DIR)
    ? fs.readdirSync(PDF_DIR).filter(f => f.toLowerCase().endsWith('.pdf'))
    : []
  if (files.length === 0) {
    console.log('No PDFs found in data/pdfs/. Please add PDF files first.')
    process.exit(1)
  }
  console.log(`Found ${files.length} PDF(s): ${files.join(', ')}`)

  let allText = ''
  for (const file of files) {
    console.log(`Extracting text from ${file}...`)
    const buffer = fs.readFileSync(path.join(PDF_DIR, file))
    const parser = new PDFParse({ data: new Uint8Array(buffer) })
    try {
      const result = await parser.getText()
      allText += `\n\n--- ${file} ---\n${result.text}`
    } finally {
      // Release the parser even when text extraction throws.
      await parser.destroy()
    }
  }

  const chunks = splitIntoChunks(allText, CHUNK_SIZE)
  console.log(`Processing ${chunks.length} chunks...`)

  // Chunks are sent one at a time, in order, so progress logging stays
  // meaningful and the model endpoint only handles a single request at once.
  const extractions: any[] = []
  for (let i = 0; i < chunks.length; i++) {
    console.log(` Chunk ${i + 1}/${chunks.length}...`)
    const result = await extractFromChunk(chunks[i])
    extractions.push(result)
  }

  const merged = mergeExtractions(extractions)
  fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2))
  console.log(`\nSaved benchmarks.json with:`)
  console.log(` - ${merged.patterns.length} patterns`)
  console.log(` - ${merged.keyInsights.length} key insights`)
}
// Run the script. A failure is logged and also reflected in the exit status,
// so shells and CI do not mistake a failed run for a successful one.
main().catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})
|