// Uploaded by AbdulElahGwaith using huggingface_hub (commit 88df9e4, verified).
/**
* Readability Analysis Script
*
* This script analyzes the readability of rendered documentation content using standard
* readability metrics. It has a dual purpose:
*
* 1. **GitHub Actions Workflow**: Automatically runs on pull requests to analyze changed
* content files and post readability reports as PR comments.
*
* 2. **Local Development Tool**: Can be run locally by writers to test readability
* of specific files during content creation and editing.
*
* WORKFLOW USAGE:
* - Triggered automatically on PRs with content changes
* - Uses CHANGED_FILES environment variable from get-changed-files action
* - Outputs report to console and saves to readability-report.md for PR commenting
* - Requires local server to be running to analyze rendered content
*
* LOCAL USAGE:
* 1. Start local development server: `npm start`
* 2. Run analysis on specific files:
* - Single file: `npm run readability-report -- --paths content/file.md`
* - Multiple files: `npm run readability-report -- --paths content/file1.md content/file2.md`
* - Get help: `npm run readability-report -- --help`
*
* FEATURES:
* - Analyzes rendered content (not raw Markdown) to account for Liquid templating
* - Calculates multiple readability metrics (Flesch Reading Ease, Gunning Fog, FORCAST, etc.)
* - Provides detailed reports with improvement recommendations
* - Filters out code blocks and non-prose elements for accurate analysis
*
* REQUIREMENTS:
* - Local server running on localhost:4000
* - Content files must be in content/ directory
* - Files must be accessible via rendered site URLs
*/
import fs from 'fs'
import path from 'path'
import cheerio from 'cheerio'
import { fetchWithRetry } from '@/frame/lib/fetch-utils'
/**
 * Readability scores computed for one page of prose.
 *
 * All values are produced by `calculateReadability`, which rounds them to two decimals.
 */
interface ReadabilityMetrics {
// Flesch Reading Ease; higher means easier to read (the report targets 60+).
fleschReadingEase: number
// Flesch-Kincaid grade level (the report targets 8 or less).
fleschKincaidGrade: number
// Gunning Fog index, based on sentence length and share of 3+ syllable words.
gunningFog: number
// Coleman-Liau index, based on characters and sentences per 100 words.
colemanLiau: number
// Automated Readability Index, based on characters per word and words per sentence.
automatedReadabilityIndex: number
// SMOG index; computed but not shown in the generated report.
smogIndex: number
// FORCAST grade level, based on the share of single-syllable words; never below 5.
forcastGrade: number
}
/**
 * Analysis result for a single content file, as returned by `analyzeFile`
 * and consumed by `generateReport`.
 */
interface PageReadability {
// Source file path exactly as it was passed to `analyzeFile`.
path: string
// Site-relative URL path the rendered page was fetched from.
url: string
// Text of the first <h1>, else the <title>, else 'Untitled'.
title: string
// Readability scores for the extracted prose.
metrics: ReadabilityMetrics
// Number of words in the extracted prose.
wordCount: number
// Number of sentences in the extracted prose.
sentenceCount: number
// Human-readable estimate produced by `estimateReadingTime`, e.g. '< 1 min' or '12 min'.
estimatedReadingTime: string
}
/**
 * Script entry point.
 *
 * - With `--help` / `-h`: prints usage and returns.
 * - Otherwise: resolves the content files to analyze, waits for the local server,
 *   analyzes each file, prints the Markdown report to the console and, when the
 *   `GITHUB_ACTIONS` environment variable is set, also writes it to
 *   `readability-report.md`.
 *
 * A failure while analyzing one file is logged and does not stop the remaining
 * files from being analyzed. A server that never becomes ready makes
 * `waitForServer` throw, which propagates to the caller.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2)
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
Usage: npm run readability-report [-- --paths <file1> <file2> ...]
Examples:
# Analyze specific files
npm run readability-report -- --paths content/copilot/using-github-copilot.md content/get-started/quickstart.md
# Analyze a single file
npm run readability-report -- --paths content/copilot/using-github-copilot.md
# Use environment variable (for CI)
CHANGED_FILES="content/file1.md content/file2.md" npm run readability-report
Note: Requires a local server running on localhost:4000 (npm start)
`)
    return
  }

  console.log('Starting readability analysis...')

  // Files come from `--paths` when given, otherwise from CHANGED_FILES (CI).
  const changedFiles = getChangedContentFiles()
  if (changedFiles.length === 0) {
    console.log('No content files to analyze. Use --help for usage information.')
    return
  }

  console.log(`Analyzing readability for ${changedFiles.length} changed files:`)
  for (const file of changedFiles) {
    console.log(` - ${file}`)
  }

  // Pages are analyzed from the rendered site, so the server must be up first.
  await waitForServer()

  // Files are processed one at a time so the log lines stay in input order.
  const results: PageReadability[] = []
  for (const filePath of changedFiles) {
    try {
      const result = await analyzeFile(filePath)
      // `null` means the page was skipped (non-200 response or no prose found).
      if (result) {
        results.push(result)
        console.log(`✓ Analyzed: ${result.path}`)
      }
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error)
      console.error(`✗ Failed to analyze ${filePath}:`, reason)
    }
  }

  const report = generateReport(results)

  // Always print the report so local runs can read it directly.
  console.log(`\n${report}`)

  // In CI the report is also saved so a later workflow step can post it on the PR.
  if (process.env.GITHUB_ACTIONS) {
    fs.writeFileSync('readability-report.md', report)
    console.log('\nReport saved to readability-report.md')
  }
}
/**
 * Normalizes one candidate path and checks whether it is an analyzable content file.
 *
 * Backslashes are converted to forward slashes and leading `./` segments are removed,
 * so `content/a.md`, `./content/a.md` and `content\a.md` are all accepted on every
 * platform and come out in the `content/...` form that the URL mapping expects.
 *
 * @returns the normalized path, or `null` when the file is not a Markdown file under
 *   `content/` or is a `README.md`.
 */
function toContentFilePath(candidate: string): string | null {
  const normalized = candidate.replace(/\\/g, '/').replace(/^(?:\.\/)+/, '')
  const isContentMarkdown =
    normalized.endsWith('.md') &&
    normalized.split('/')[0] === 'content' &&
    path.posix.basename(normalized) !== 'README.md'
  return isContentMarkdown ? normalized : null
}

/**
 * Resolves the list of content files to analyze.
 *
 * Sources, in priority order:
 * 1. Command line: every argument after `--paths` up to (not including) the next
 *    argument that starts with `--`.
 * 2. The whitespace-separated `CHANGED_FILES` environment variable (used in CI).
 *    This is also used when `--paths` is the last argument.
 *
 * Only Markdown files under `content/` that are not `README.md` are returned.
 *
 * @returns normalized, forward-slash content file paths, in input order.
 */
function getChangedContentFiles(): string[] {
  const args = process.argv.slice(2)
  const pathsIndex = args.indexOf('--paths')

  let candidates: string[]
  if (pathsIndex !== -1 && pathsIndex + 1 < args.length) {
    const afterFlag = args.slice(pathsIndex + 1)
    const nextFlagAt = afterFlag.findIndex((arg) => arg.startsWith('--'))
    candidates = nextFlagAt === -1 ? afterFlag : afterFlag.slice(0, nextFlagAt)
  } else {
    candidates = (process.env.CHANGED_FILES || '').split(/\s+/)
  }

  return candidates
    .map(toContentFilePath)
    .filter((filePath): filePath is string => filePath !== null)
}
/**
 * Builds the absolute URL of a page on the local server.
 *
 * @param urlPath site-relative path; it is appended to the origin as-is.
 * @returns the origin `http://localhost:4000` followed by `urlPath`.
 */
function makeURL(urlPath: string): string {
  const origin = 'http://localhost:4000'
  return origin + urlPath
}
/**
 * Polls the local server's root page until it answers with an OK response.
 *
 * Makes up to 30 attempts, pausing 2000 ms after each failed one. Both a thrown
 * request error and a non-OK response count as a failed attempt.
 *
 * @throws Error when the last attempt also fails.
 */
async function waitForServer(): Promise<void> {
  const maxAttempts = 30
  const delayMs = 2000
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))

  console.log('Waiting for server to be ready...')

  let attempt = 0
  while (attempt < maxAttempts) {
    attempt += 1
    try {
      const response = await fetchWithRetry(makeURL('/'), undefined, {
        timeout: 5000,
      })
      if (response.ok) {
        console.log('Server is ready!')
        return
      }
      // Route a non-OK status through the same retry path as a request error.
      throw new Error(`HTTP ${response.status}: ${response.statusText}`)
    } catch (error) {
      const reason = (error as Error).message
      const outOfAttempts = attempt === maxAttempts
      if (outOfAttempts) {
        console.error('Server failed to start. Last error:', reason)
        throw new Error(`Server failed to start after ${maxAttempts} attempts`)
      }
      console.log(
        `Attempt ${attempt}/${maxAttempts} failed (${reason}), retrying in ${delayMs}ms...`,
      )
      await pause(delayMs)
    }
  }
}
/**
 * Maps a content file path to the site-relative URL path of its rendered page.
 *
 * Examples:
 * - `content/get-started/foo.md`   -> `/get-started/foo`
 * - `content/get-started/index.md` -> `/get-started`
 * - `content/index.md`             -> `/`
 */
function contentPathToUrlPath(filePath: string): string {
  const withoutExtension = filePath.replace(/^content\//, '').replace(/\.md$/, '')
  // `(^|\/)` also matches a bare `index`, so the root index file maps to `/`
  // rather than `/index`.
  return `/${withoutExtension.replace(/(^|\/)index$/, '')}`
}

/**
 * Fetches the rendered page for a content file and computes its readability.
 *
 * The prose is taken from the first selector in a fixed priority list that matches
 * anything on the page. Code, navigation, breadcrumbs, search results, sidebars and
 * the "edit this page" element are removed before the text is extracted.
 *
 * @param filePath content file path such as `content/get-started/foo.md`.
 * @returns the analysis result, or `null` when the page does not answer with HTTP 200
 *   or no prose text is found.
 * @throws Error prefixed with `Failed to analyze <urlPath>:` when fetching, parsing or
 *   scoring throws.
 */
async function analyzeFile(filePath: string): Promise<PageReadability | null> {
  const urlPath = contentPathToUrlPath(filePath)

  try {
    const response = await fetchWithRetry(makeURL(urlPath), undefined, {
      timeout: 30000,
      throwHttpErrors: false,
    })
    if (response.status !== 200) {
      console.warn(`Skipping ${urlPath}: HTTP ${response.status}`)
      return null
    }

    const $ = cheerio.load(await response.text())

    const title = $('h1').first().text().trim() || $('title').text().trim() || 'Untitled'

    // Ordered from most to least specific; the first selector that matches wins.
    const contentSelectors = [
      'article .markdown-body', // Article content in docs
      '.markdown-body', // GitHub markdown content
      'article', // Generic article
      'main', // Main content area
      '[data-testid="lead"]', // Lead content
      '#article-contents', // Fallback
    ]

    // Elements that are not prose and would distort the scores.
    const nonProseSelectors = [
      'pre, code, .highlight',
      'nav, .breadcrumb, .pagination',
      '[data-testid="breadcrumbs"]',
      '.js-search-results',
      'aside, .sidebar',
      '.edit-this-page',
    ]

    let contentText = ''
    for (const selector of contentSelectors) {
      const element = $(selector)
      if (element.length === 0) continue

      element.find(nonProseSelectors.join(', ')).remove()
      // Collapse whitespace so layout-only line breaks do not affect tokenizing.
      contentText = element.text().replace(/\s+/g, ' ').trim()
      break
    }

    if (!contentText) {
      console.warn(`No content found for ${urlPath}`)
      return null
    }

    const metrics = await calculateReadability(contentText)
    const wordCount = countWords(contentText)

    return {
      path: filePath,
      url: urlPath,
      title,
      metrics,
      wordCount,
      sentenceCount: countSentences(contentText),
      estimatedReadingTime: estimateReadingTime(wordCount, metrics.fleschReadingEase),
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to analyze ${urlPath}: ${reason}`)
  }
}
/**
 * Computes the readability scores for a block of prose.
 *
 * Notation: ASL = average words per sentence, ASW = average syllables per word.
 *
 * - Flesch Reading Ease: 206.835 - 1.015 * ASL - 84.6 * ASW (floored at 0)
 * - Flesch-Kincaid Grade: 0.39 * ASL + 11.8 * ASW - 15.59
 * - Gunning Fog: 0.4 * (ASL + percentage of words with 3+ syllables)
 * - Coleman-Liau: 0.0588 * L - 0.296 * S - 15.8, where L = letters per 100 words and
 *   S = sentences per 100 words
 * - Automated Readability Index: 4.71 * (letters and digits per word) + 0.5 * ASL - 21.43
 * - SMOG: 1.043 * sqrt(complex words * 30 / sentences) + 3.1291
 * - FORCAST: 20 - N / 10, where N = single-syllable words per 150 words (floored at 5)
 *
 * Punctuation is excluded from the character counts used by Coleman-Liau and ARI,
 * because those formulas are defined over letters (and digits for ARI).
 *
 * @param text prose to score.
 * @returns all scores rounded to two decimals; grade-style scores are floored at 0.
 *   Every score is 0 when the text has no words.
 */
async function calculateReadability(text: string): Promise<ReadabilityMetrics> {
  const words = countWords(text)
  const sentences = countSentences(text)

  // Nothing to score; this also protects the divisions below.
  if (sentences === 0 || words === 0) {
    return {
      fleschReadingEase: 0,
      fleschKincaidGrade: 0,
      gunningFog: 0,
      colemanLiau: 0,
      automatedReadabilityIndex: 0,
      smogIndex: 0,
      forcastGrade: 0,
    }
  }

  const syllables = countSyllables(text)
  const complexWords = countComplexWords(text)
  const singleSyllableWords = countSingleSyllableWords(text)
  const letterCount = (text.match(/\p{L}/gu) ?? []).length
  const letterOrDigitCount = (text.match(/[\p{L}\p{N}]/gu) ?? []).length

  const avgSentenceLength = words / sentences
  const avgSyllablesPerWord = syllables / words

  const fleschReadingEase = Math.max(
    0,
    206.835 - 1.015 * avgSentenceLength - 84.6 * avgSyllablesPerWord,
  )

  const fleschKincaidGrade = 0.39 * avgSentenceLength + 11.8 * avgSyllablesPerWord - 15.59

  const complexWordPercentage = (complexWords / words) * 100
  const gunningFog = 0.4 * (avgSentenceLength + complexWordPercentage)

  const lettersPer100Words = (letterCount / words) * 100
  const sentencesPer100Words = (sentences / words) * 100
  const colemanLiau = 0.0588 * lettersPer100Words - 0.296 * sentencesPer100Words - 15.8

  const charsPerWord = letterOrDigitCount / words
  const automatedReadabilityIndex = 4.71 * charsPerWord + 0.5 * avgSentenceLength - 21.43

  const smogIndex = 1.043 * Math.sqrt((complexWords * 30) / sentences) + 3.1291

  // Scale the single-syllable count to the formula's 150-word sample size.
  const singleSyllablePer150Words = (singleSyllableWords / words) * 150
  const forcastGrade = Math.max(5, 20 - singleSyllablePer150Words / 10)

  const round2 = (value: number): number => Math.round(value * 100) / 100

  return {
    fleschReadingEase: round2(fleschReadingEase),
    fleschKincaidGrade: Math.max(0, round2(fleschKincaidGrade)),
    gunningFog: Math.max(0, round2(gunningFog)),
    colemanLiau: Math.max(0, round2(colemanLiau)),
    automatedReadabilityIndex: Math.max(0, round2(automatedReadabilityIndex)),
    smogIndex: Math.max(0, round2(smogIndex)),
    forcastGrade: round2(forcastGrade),
  }
}
/**
 * Approximates the number of syllables in a single word.
 *
 * Heuristic: lower-case the word, keep only `a`-`z`, count runs of consecutive vowels
 * (`a e i o u y`), and drop one for a trailing `e` when more than one run was found.
 *
 * @param word a single token; punctuation and digits are ignored.
 * @returns 0 when the token contains no letters, otherwise at least 1.
 */
function countSyllablesInWord(word: string): number {
  const letters = word.toLowerCase().replace(/[^a-z]/g, '')
  if (letters === '') return 0

  // Count the starts of vowel runs.
  let vowelRuns = 0
  let insideRun = false
  for (const ch of letters) {
    const isVowel = 'aeiouy'.includes(ch)
    if (isVowel && !insideRun) vowelRuns += 1
    insideRun = isVowel
  }

  // A final 'e' is treated as silent unless it is the word's only vowel run.
  const adjusted = vowelRuns > 1 && letters.endsWith('e') ? vowelRuns - 1 : vowelRuns

  return adjusted < 1 ? 1 : adjusted
}
/**
 * Counts the words in a text.
 *
 * A word is a whitespace-delimited token that contains at least one `\w` character
 * (letter, digit or underscore), so tokens made only of punctuation are not counted.
 *
 * @returns the number of words; 0 for empty or whitespace-only text.
 */
function countWords(text: string): number {
  const normalized = text.replace(/\s+/g, ' ').trim()
  if (normalized === '') return 0

  let total = 0
  for (const token of normalized.split(/\s+/)) {
    // Strip everything that is not a word character and see whether anything is left.
    if (token.replace(/[^\w]/g, '').length > 0) total += 1
  }
  return total
}
/**
 * Counts the sentences in a text.
 *
 * A sentence ends at a run of `.`, `!` or `?` that is followed by whitespace or the
 * end of the text, so a dot inside a token such as `1.2` does not end a sentence.
 *
 * @returns the number of non-empty sentences, and never less than 1.
 */
function countSentences(text: string): number {
  const normalized = text.replace(/\s+/g, ' ').trim()

  let nonEmpty = 0
  for (const fragment of normalized.split(/[.!?]+(?=\s|$)/)) {
    if (fragment.trim().length > 0) nonEmpty += 1
  }

  // Text without any terminator (or without any content) still counts as one sentence.
  return nonEmpty > 0 ? nonEmpty : 1
}
/**
 * Approximates the total number of syllables in a text by summing
 * `countSyllablesInWord` over its whitespace-delimited tokens.
 */
function countSyllables(text: string): number {
  return text
    .toLowerCase()
    .split(/\s+/)
    .reduce(
      (total, token) => (token.length === 0 ? total : total + countSyllablesInWord(token)),
      0,
    )
}
/**
 * Counts the whitespace-delimited tokens for which `countSyllablesInWord`
 * reports exactly one syllable.
 */
function countSingleSyllableWords(text: string): number {
  const tokens = text.toLowerCase().split(/\s+/)
  return tokens.filter((token) => token.length > 0 && countSyllablesInWord(token) === 1).length
}
/**
 * Counts "complex" words: whitespace-delimited tokens for which
 * `countSyllablesInWord` reports three or more syllables.
 */
function countComplexWords(text: string): number {
  const tokens = text.toLowerCase().split(/\s+/)
  return tokens.filter((token) => token.length > 0 && countSyllablesInWord(token) >= 3).length
}
/**
 * Estimates how long a page takes to read.
 *
 * The reading speed depends on the Flesch Reading Ease score (easier text is read faster):
 *
 * | Score  | Words per minute |
 * |--------|------------------|
 * | >= 90  | 250              |
 * | >= 80  | 230              |
 * | >= 70  | 210              |
 * | >= 60  | 200              |
 * | >= 50  | 180              |
 * | >= 30  | 160              |
 * | < 30   | 140              |
 *
 * @param wordCount number of words in the text.
 * @param fleschReadingEase Flesch Reading Ease score of the text.
 * @returns `'< 1 min'` below one minute, `'<n> min'` below one hour, otherwise
 *   `'<h>h <m>m'` (or `'<h>h'` when the minutes part is zero).
 */
function estimateReadingTime(wordCount: number, fleschReadingEase: number): string {
  // [minimum score, words per minute], highest tier first.
  const speedTiers: ReadonlyArray<readonly [number, number]> = [
    [90, 250],
    [80, 230],
    [70, 210],
    [60, 200],
    [50, 180],
    [30, 160],
  ]
  const matchedTier = speedTiers.find(([minScore]) => fleschReadingEase >= minScore)
  const wordsPerMinute = matchedTier ? matchedTier[1] : 140

  const exactMinutes = wordCount / wordsPerMinute
  if (exactMinutes < 1) {
    return '< 1 min'
  }

  // Round once, before choosing the format, so values just under a full hour roll
  // over to the next hour (never '60 min' or '1h 60m').
  const totalMinutes = Math.round(exactMinutes)
  if (totalMinutes < 60) {
    return `${totalMinutes} min`
  }

  const hours = Math.floor(totalMinutes / 60)
  const minutes = totalMinutes % 60
  return minutes > 0 ? `${hours}h ${minutes}m` : `${hours}h`
}
/** Collapses every whitespace run (including line breaks) into a single space. */
function toSingleLine(text: string): string {
  return text.replace(/\s+/g, ' ').trim()
}

/** Makes text safe for a Markdown table cell: one line, with `|` escaped. */
function toTableCell(text: string): string {
  return toSingleLine(text).replace(/\|/g, '\\|')
}

/** Rates a Flesch Reading Ease score: 60+ is good, 30+ is fair, below is difficult. */
function rateReadingEase(score: number): { icon: string; label: string } {
  if (score >= 60) return { icon: '🟢', label: 'Good' }
  if (score >= 30) return { icon: '🟡', label: 'Fair' }
  return { icon: '🔴', label: 'Difficult' }
}

/** Rates a grade-level score: up to `goodMax` is good, up to 12 is fair, above is high. */
function rateGradeLevel(score: number, goodMax: number): string {
  if (score <= goodMax) return '🟢 Good'
  if (score <= 12) return '🟡 Fair'
  return '🔴 High'
}

/**
 * Renders the analysis results as a Markdown report.
 *
 * The report contains a summary table (one row per page), a detailed metrics table
 * per page, general readability guidelines and, when the average Flesch Reading Ease
 * over all pages is below 60, a "Priority Focus" note.
 *
 * Page titles are reduced to a single line, and `|` is escaped inside table cells,
 * so an unusual title cannot break the table layout.
 *
 * @param results one entry per successfully analyzed page.
 * @returns the Markdown report; a short "nothing to analyze" report for an empty list.
 */
function generateReport(results: PageReadability[]): string {
  const heading = '## 📊 Readability Report\n\n'
  if (results.length === 0) {
    return `${heading}No content changes found to analyze.`
  }

  let report = heading
  report += `Analyzed ${results.length} changed documentation page${results.length === 1 ? '' : 's'}.\n\n`

  // Summary table
  report += '### Summary\n\n'
  report += '| Page | Flesch Reading Ease | Grade Level | Reading Time | Words |\n'
  report += '|------|:-------------------:|:-----------:|:------------:|:-----:|\n'
  for (const result of results) {
    const ease = result.metrics.fleschReadingEase
    const grade = result.metrics.fleschKincaidGrade
    const pageLink = `[${toTableCell(result.title)}](${result.url})`
    report += `| ${pageLink} | ${ease} ${rateReadingEase(ease).icon} | ${grade} | ${result.estimatedReadingTime} | ${result.wordCount} |\n`
  }

  // Detailed metrics
  report += '\n### Detailed Metrics\n\n'
  for (const result of results) {
    const { metrics } = result
    const easeRating = rateReadingEase(metrics.fleschReadingEase)

    report += `#### ${toSingleLine(result.title)}\n\n`
    // Two trailing spaces make a Markdown hard line break.
    report += `**File:** \`${result.path}\`  \n`
    report += `**URL:** ${result.url}  \n`
    report += `**Words:** ${result.wordCount} | **Sentences:** ${result.sentenceCount} | **Est. Reading Time:** ${result.estimatedReadingTime}\n\n`
    report += '| Metric | Score | Target | Assessment |\n'
    report += '|--------|:-----:|:------:|:----------:|\n'

    const rows = [
      {
        name: 'Flesch Reading Ease',
        score: metrics.fleschReadingEase,
        target: '60+',
        assessment: `${easeRating.icon} ${easeRating.label}`,
      },
      {
        name: 'Flesch-Kincaid Grade',
        score: metrics.fleschKincaidGrade,
        target: '8 or less',
        assessment: rateGradeLevel(metrics.fleschKincaidGrade, 8),
      },
      {
        name: 'Gunning Fog Index',
        score: metrics.gunningFog,
        target: '8 or less',
        assessment: rateGradeLevel(metrics.gunningFog, 8),
      },
      {
        name: 'Coleman-Liau Index',
        score: metrics.colemanLiau,
        target: '8 or less',
        assessment: rateGradeLevel(metrics.colemanLiau, 8),
      },
      {
        name: 'Automated Readability Index',
        score: metrics.automatedReadabilityIndex,
        target: '8 or less',
        assessment: rateGradeLevel(metrics.automatedReadabilityIndex, 8),
      },
      {
        name: 'FORCAST Grade Level',
        score: metrics.forcastGrade,
        target: '9-10',
        assessment: rateGradeLevel(metrics.forcastGrade, 10),
      },
    ]
    for (const row of rows) {
      report += `| ${row.name} | ${row.score} | ${row.target} | ${row.assessment} |\n`
    }
    report += '\n'
  }

  // Guidelines
  report += '### 📖 Readability Guidelines\n\n'
  report += '**Target Audience:** Technical users (developers, administrators)\n\n'
  report += '**Reading Time Estimation:**\n'
  report += '- Based on complexity-adjusted reading speed (140-250 WPM)\n'
  report += '- Easier content (higher Flesch scores) = faster reading\n'
  report += '- More complex content = slower reading pace\n\n'
  report += '**General Tips for Improvement:**\n'
  report += '- **Sentences:** Aim for 15-20 words per sentence on average\n'
  report += '- **Word choice:** Choose simpler alternatives when possible\n'
  report += '- **Paragraphs:** Keep paragraphs under 75-100 words\n'
  report +=
    '- **Voice:** Use active voice (e.g., "Click the button" vs "The button should be clicked")\n'
  report += '- **Technical terms:** Define acronyms and jargon on first use\n'
  report +=
    '- **FORCAST improvement:** Use more single-syllable words (aim for 110+ per 150 words)\n\n'

  // Overall recommendation, driven by the average score across all pages.
  const avgFleschEase =
    results.reduce((sum, r) => sum + r.metrics.fleschReadingEase, 0) / results.length
  if (avgFleschEase < 60) {
    const subject = results.length === 1 ? 'This page' : 'Several pages'
    report += `**Priority Focus:** ${subject} scored below 60 for Flesch Reading Ease. Consider breaking up complex sentences and using simpler vocabulary.\n\n`
  }

  return report
}
// Run the script. Any error that escapes main() is logged and the process exits
// with status 1.
try {
await main()
} catch (error) {
console.error('Readability analysis failed:', error)
process.exit(1)
}