Spaces:
Sleeping
Sleeping
| import fs from 'fs' | |
| import path from 'path' | |
| import { PDFParse } from 'pdf-parse' | |
| import OpenAI from 'openai' | |
// Input directory of source PDFs and the JSON file the merged benchmarks are written to.
const PDF_DIR = path.join(process.cwd(), 'data/pdfs')
const OUTPUT = path.join(process.cwd(), 'data/benchmarks.json')
// Characters per chunk of extracted PDF text sent to the model in one request.
const CHUNK_SIZE = 8000
// OpenAI-compatible client pointed at a local Ollama server by default.
// NOTE(review): apiKey looks like a placeholder — presumably the local server
// does not validate it, but the SDK requires a non-empty value; confirm.
const client = new OpenAI({
  baseURL: process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434/v1',
  apiKey: 'ollama',
})
// Model name served by Ollama; overridable via LLM_MODEL.
const model = process.env.LLM_MODEL ?? 'llama3.1:8b'
// System prompt instructing the model to answer with ONLY the JSON schema below.
// mergeExtractions() depends on these exact field names (deploymentFrequency,
// leadTime, changeFailureRate, mttr, patterns, keyInsights) — keep them in sync.
const EXTRACTION_SYSTEM_PROMPT = `You are extracting DevOps benchmark data from State of DevOps Reports.
Extract any benchmark data you find about:
- Deployment frequency (elite/high/medium/low performers)
- Lead time for changes
- Change failure rate
- Mean time to restore (MTTR)
- Patterns (combinations of metrics and what they indicate)
- Key insights and statistics
Return ONLY valid JSON with this structure:
{
  "deploymentFrequency": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "leadTime": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "changeFailureRate": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "mttr": { "elite": "...", "high": "...", "medium": "...", "low": "..." },
  "patterns": [{ "id": "...", "signature": "...", "interpretation": "...", "improvements": ["..."] }],
  "keyInsights": ["..."]
}
If a section has no data in this chunk, use null for that field. Return empty arrays for patterns/keyInsights if none found.`
| function splitIntoChunks(text: string, size: number): string[] { | |
| const chunks: string[] = [] | |
| for (let i = 0; i < text.length; i += size) { | |
| chunks.push(text.slice(i, i + size)) | |
| } | |
| return chunks | |
| } | |
| function mergeExtractions(extractions: any[]): any { | |
| // Merge by taking the most specific (non-null) value for each tier field | |
| // Concatenate patterns and keyInsights arrays, deduplicating by id/content | |
| const result = { | |
| deploymentFrequency: { elite: '', high: '', medium: '', low: '' }, | |
| leadTime: { elite: '', high: '', medium: '', low: '' }, | |
| changeFailureRate: { elite: '', high: '', medium: '', low: '' }, | |
| mttr: { elite: '', high: '', medium: '', low: '' }, | |
| patterns: [] as any[], | |
| keyInsights: [] as string[], | |
| } | |
| for (const ext of extractions) { | |
| if (!ext) continue | |
| for (const metric of ['deploymentFrequency', 'leadTime', 'changeFailureRate', 'mttr'] as const) { | |
| if (ext[metric]) { | |
| for (const band of ['elite', 'high', 'medium', 'low'] as const) { | |
| if (ext[metric][band] && !result[metric][band]) { | |
| result[metric][band] = ext[metric][band] | |
| } | |
| } | |
| } | |
| } | |
| if (Array.isArray(ext.patterns)) { | |
| for (const p of ext.patterns) { | |
| if (!result.patterns.find((existing: any) => existing.id === p.id)) { | |
| result.patterns.push(p) | |
| } | |
| } | |
| } | |
| if (Array.isArray(ext.keyInsights)) { | |
| for (const insight of ext.keyInsights) { | |
| if (!result.keyInsights.includes(insight)) { | |
| result.keyInsights.push(insight) | |
| } | |
| } | |
| } | |
| } | |
| return result | |
| } | |
| async function extractFromChunk(chunk: string): Promise<any> { | |
| const response = await client.chat.completions.create({ | |
| model, | |
| messages: [ | |
| { role: 'system', content: EXTRACTION_SYSTEM_PROMPT }, | |
| { role: 'user', content: `Extract benchmark data from this text:\n\n${chunk}` }, | |
| ], | |
| }) | |
| const content = response.choices[0]?.message?.content ?? '{}' | |
| try { | |
| // Extract JSON from response (model may wrap it in markdown) | |
| const jsonMatch = content.match(/\{[\s\S]*\}/) | |
| return jsonMatch ? JSON.parse(jsonMatch[0]) : {} | |
| } catch { | |
| console.warn('Failed to parse chunk response, skipping') | |
| return {} | |
| } | |
| } | |
| async function main() { | |
| const files = fs.readdirSync(PDF_DIR).filter(f => f.endsWith('.pdf')) | |
| if (files.length === 0) { | |
| console.log('No PDFs found in data/pdfs/. Please add PDF files first.') | |
| process.exit(1) | |
| } | |
| console.log(`Found ${files.length} PDF(s): ${files.join(', ')}`) | |
| let allText = '' | |
| for (const file of files) { | |
| console.log(`Extracting text from ${file}...`) | |
| const buffer = fs.readFileSync(path.join(PDF_DIR, file)) | |
| const parser = new PDFParse({ data: new Uint8Array(buffer) }) | |
| const result = await parser.getText() | |
| allText += `\n\n--- ${file} ---\n${result.text}` | |
| await parser.destroy() | |
| } | |
| const chunks = splitIntoChunks(allText, CHUNK_SIZE) | |
| console.log(`Processing ${chunks.length} chunks...`) | |
| const extractions: any[] = [] | |
| for (let i = 0; i < chunks.length; i++) { | |
| console.log(` Chunk ${i + 1}/${chunks.length}...`) | |
| const result = await extractFromChunk(chunks[i]) | |
| extractions.push(result) | |
| } | |
| const merged = mergeExtractions(extractions) | |
| fs.writeFileSync(OUTPUT, JSON.stringify(merged, null, 2)) | |
| console.log(`\nSaved benchmarks.json with:`) | |
| console.log(` - ${merged.patterns.length} patterns`) | |
| console.log(` - ${merged.keyInsights.length} key insights`) | |
| } | |
// Entry point: log (rather than crash the process on) any unhandled rejection.
main().catch(console.error)