AccessibilityCheckerBackend / api /upload-document.js
accessibilitychecker's picture
Upload folder using huggingface_hub
bbfde3f verified
const Busboy = require('busboy');
const JSZip = require('jszip');
const { applyCorsHeaders, handleCorsPreflight } = require('../lib/cors-middleware');
// PowerPoint analysis entry point. Stays undefined when the analyzer module
// cannot be loaded, so callers must check it before use.
let analyzePowerPoint;
try {
  ({ analyzePowerPoint } = require('../lib/pptx-analyzer'));
} catch (err) {
  // Loading is best-effort: log the failure and keep the module usable for Word documents.
  console.error('Failed to load pptx-analyzer:', err);
}
/**
 * Write a JSON response.
 *
 * Sets the `Content-Type` header to `application/json`, applies the status
 * code, and ends the response with the serialized payload.
 *
 * @param {object} res - Response object exposing `setHeader`, `status` and a chainable `end`.
 * @param {number} status - HTTP status code to send.
 * @param {*} data - Value serialized with `JSON.stringify` as the response body.
 */
function sendJson(res, status, data) {
  res.setHeader('Content-Type', 'application/json');
  const response = res.status(status);
  response.end(JSON.stringify(data));
}
/**
 * Extract the text of a WordprocessingML paragraph.
 *
 * Concatenates the contents of every `<w:t>` element in document order and
 * trims the result. Only real `<w:t>` elements are matched: the tag name must
 * be followed by whitespace or `>`, so sibling elements whose names merely
 * start with `w:t` (`<w:tab/>`, `<w:tbl>`, `<w:tc>`, ...) and self-closing
 * `<w:t/>` elements can no longer start a match and drag markup into the text.
 *
 * XML entities (e.g. `&amp;`) are returned as they appear in the markup.
 *
 * @param {string} paragraphXml - XML of a single paragraph.
 * @returns {string} Trimmed text, or '' when there is no text or the input is not a string.
 */
function extractTextFromParagraph(paragraphXml) {
  if (typeof paragraphXml !== 'string' || paragraphXml === '') return '';

  // `(?<!\/)>` rejects self-closing tags; `[^<]*` keeps the capture inside one
  // element and, unlike `.`, also accepts line breaks in the text.
  const textRunPattern = /<w:t(?:\s[^>]*)?(?<!\/)>([^<]*)<\/w:t>/g;

  return Array.from(paragraphXml.matchAll(textRunPattern), (match) => match[1])
    .join('')
    .trim();
}
module.exports = async (req, res) => {
if (handleCorsPreflight(req, res, { allowedMethods: 'POST, OPTIONS' })) {
return;
}
applyCorsHeaders(req, res, { allowedMethods: 'POST, OPTIONS' });
if (req.method !== 'POST') {
sendJson(res, 405, { error: 'Method not allowed' });
return;
}
try {
const busboy = Busboy({ headers: req.headers });
let fileData = null;
let filename = null;
busboy.on('file', (fieldname, file, info) => {
filename = info.filename;
const chunks = [];
file.on('data', (chunk) => {
chunks.push(chunk);
});
file.on('end', () => {
fileData = Buffer.concat(chunks);
});
});
busboy.on('finish', async () => {
if (!fileData || !filename) {
sendJson(res, 400, { error: 'No file uploaded' });
return;
}
const filenameLower = filename.toLowerCase();
// Support both PowerPoint and Word documents
const isPowerPoint = ['.pptx', '.ppt', '.pps', '.pot', '.potx', '.ppsx'].some(ext => filenameLower.endsWith(ext));
const isWord = filenameLower.endsWith('.docx');
if (!isPowerPoint && !isWord) {
sendJson(res, 400, { error: 'Please upload a PowerPoint or Word document (.docx, .pptx)' });
return;
}
try {
let report;
if (isPowerPoint) {
// Route PowerPoint files to the PowerPoint analyzer
if (!analyzePowerPoint) {
throw new Error('PowerPoint analyzer not available');
}
report = await analyzePowerPoint(fileData, filename);
} else {
// Route Word documents to the Word analyzer
report = await analyzeDocx(fileData, filename);
}
sendJson(res, 200, {
fileName: filename,
suggestedFileName: filename,
report: report
});
} catch (error) {
console.error('Analysis error:', error);
sendJson(res, 500, { error: error.message });
}
});
req.pipe(busboy);
} catch (error) {
console.error('Upload error:', error);
sendJson(res, 500, { error: error.message });
}
};
module.exports.analyzeDocx = analyzeDocx;
/**
 * Analyze a Word (.docx) file for two accessibility requirements:
 * list formatting and image alt text.
 *
 * @param {*} fileData - Document bytes, passed to `JSZip.loadAsync`.
 * @param {string} filename - Name echoed back as `fileName` / `suggestedFileName`.
 * @returns {Promise<object>} Report with `summary` and `details`. When anything
 *   throws, a reduced report `{ fileName, error, summary, details: {} }` is
 *   returned instead of rejecting.
 */
async function analyzeDocx(fileData, filename) {
  const summary = { fixed: 0, flagged: 0 };
  const details = {
    // Requirement 1: Lists are formatted correctly
    hyphenatedParagraphsNeedingLists: [],
    formattedListsCount: 0,
    // Requirement 2: Images have alt text (max 250 chars)
    imagesMissingAltText: [],
    imagesWithAltTextOver250Chars: [],
    imagesWithValidAltText: 0,
  };
  const report = {
    fileName: filename,
    suggestedFileName: filename,
    summary,
    details,
  };

  // Store a non-empty list of findings under `key` and add its size to the flagged total.
  const recordFindings = (key, findings) => {
    if (findings.length > 0) {
      details[key] = findings;
      summary.flagged += findings.length;
    }
  };

  try {
    const archive = await JSZip.loadAsync(fileData);

    // The main document part and its relationships part; either may be absent.
    const documentXml = await archive.file('word/document.xml')?.async('string');
    const relsXml = await archive.file('word/_rels/document.xml.rels')?.async('string');

    // ===== REQUIREMENT 1: lists formatted correctly =====
    if (documentXml) {
      const { hyphenatedParagraphs, properlyFormattedLists } = analyzeListFormatting(documentXml);
      recordFindings('hyphenatedParagraphsNeedingLists', hyphenatedParagraphs);
      details.formattedListsCount = properlyFormattedLists;
    }

    // ===== REQUIREMENT 2: images with alt text =====
    if (relsXml && documentXml) {
      const { missingAltText, altTextOver250Chars, validAltTextCount } =
        analyzeImageAltText(documentXml, relsXml);
      recordFindings('imagesMissingAltText', missingAltText);
      recordFindings('imagesWithAltTextOver250Chars', altTextOver250Chars);
      details.imagesWithValidAltText = validAltTextCount;
    }

    return report;
  } catch (error) {
    console.error('[analyzeDocx] Error analyzing document:', error);
    return {
      fileName: filename,
      error: error.message,
      summary: { fixed: 0, flagged: 0 },
      details: {},
    };
  }
}
// ===== HELPER FUNCTIONS =====
/**
 * Analyze list formatting in the document.
 *
 * Flags paragraphs whose text starts with a hyphen, en dash or em dash
 * followed by whitespace (a hand-typed bullet), and counts paragraphs that
 * carry list markup (a `ListParagraph` style or a `numPr` element).
 *
 * Tag names are matched exactly: `<w:p>` does not match `<w:pPr>`/`<w:pict>`,
 * and `<w:t>` does not match `<w:tab/>`/`<w:tbl>`. Self-closing elements are
 * skipped. This keeps markup from neighbouring runs out of the paragraph text,
 * which previously hid hand-typed bullets that followed a tab-only run.
 *
 * @param {string} documentXml - Contents of `word/document.xml`.
 * @returns {{hyphenatedParagraphs: Array<{index: number, text: string, message: string}>,
 *            properlyFormattedLists: number}}
 *   `index` is the 1-based position of the paragraph among matched paragraphs;
 *   `text` is limited to the first 100 characters.
 */
function analyzeListFormatting(documentXml) {
  const results = {
    hyphenatedParagraphs: [],
    properlyFormattedLists: 0
  };
  if (!documentXml) return results;

  // `(?:\s[^>]*)?` allows attributes; `(?<!\/)>` rejects self-closing tags.
  const paragraphPattern = /<w:p(?:\s[^>]*)?(?<!\/)>[\s\S]*?<\/w:p>/g;
  // `[^<]*` keeps the capture inside a single text element and allows line breaks.
  const textRunPattern = /<w:t(?:\s[^>]*)?(?<!\/)>([^<]*)<\/w:t>/g;
  const manualBulletPattern = /^[-–—]\s+/;

  const paragraphs = documentXml.match(paragraphPattern) ?? [];

  paragraphs.forEach((paragraph, index) => {
    // Extract text content from paragraph
    const text = Array.from(paragraph.matchAll(textRunPattern), (match) => match[1])
      .join('')
      .trim();

    // Check if paragraph starts with hyphen/dash (indicates list formatting issue)
    if (manualBulletPattern.test(text)) {
      results.hyphenatedParagraphs.push({
        index: index + 1,
        text: text.substring(0, 100), // First 100 chars
        message: 'This paragraph appears to be a list item but is formatted as a regular paragraph'
      });
    }

    // Count paragraphs that carry list markup
    if (paragraph.includes('pStyle w:val="ListParagraph"') || paragraph.includes('numPr')) {
      results.properlyFormattedLists++;
    }
  });

  return results;
}
/**
 * Analyze image alt text requirements.
 *
 * Scans every `<wp:inline>` / `<wp:anchor>` drawing that embeds a picture
 * (has an `r:embed` attribute) and sorts it into one of three buckets:
 * missing alt text, alt text longer than 250 characters, or valid alt text.
 *
 * Any non-empty relationship ID is accepted. Relationship IDs are not
 * required to look like `rId<number>`, and insisting on that form silently
 * skipped images written by producers that name them differently.
 *
 * Alt text is taken from the markup as-is (XML entities are not decoded), so
 * `length` counts the characters as stored in the attribute.
 *
 * @param {string} documentXml - Contents of `word/document.xml`.
 * @param {string} relsXml - Contents of `word/_rels/document.xml.rels`. Only
 *   checked for presence; an empty/missing value yields an empty result.
 * @returns {{missingAltText: Array<object>, altTextOver250Chars: Array<object>, validAltTextCount: number}}
 *   `index` in each entry is the 1-based position among all matched drawings,
 *   including drawings without an embedded picture.
 */
function analyzeImageAltText(documentXml, relsXml) {
  const results = {
    missingAltText: [],
    altTextOver250Chars: [],
    validAltTextCount: 0
  };
  if (!documentXml || !relsXml) return results;

  // Find all images/drawings
  const drawingPattern = /<wp:inline[^>]*>[\s\S]*?<\/wp:inline>|<wp:anchor[^>]*>[\s\S]*?<\/wp:anchor>/g;
  const drawings = documentXml.match(drawingPattern) ?? [];

  drawings.forEach((drawing, index) => {
    // Drawings without an embedded picture are skipped.
    const embedMatch = drawing.match(/r:embed="([^"]+)"/);
    if (!embedMatch) return;
    const rId = embedMatch[1];

    // Alt text lives in the `descr` attribute of <wp:docPr>. The second
    // pattern is kept unchanged from the previous implementation as a fallback.
    // NOTE(review): the fallback's element names look unusual - confirm it is still needed.
    const altTextMatch =
      drawing.match(/<wp:docPr[^>]*descr="([^"]*)"/) ||
      drawing.match(/<wp:cNvPicPr[^>]*>[\s\S]*?<a:picLocks[^>]*descr="([^"]*)"/);
    const altText = altTextMatch ? altTextMatch[1] : null;

    // Whitespace-only descriptions count as missing.
    if (!altText || altText.trim() === '') {
      results.missingAltText.push({
        index: index + 1,
        rId: rId,
        message: 'Image is missing alt text description'
      });
    } else if (altText.length > 250) {
      results.altTextOver250Chars.push({
        index: index + 1,
        rId: rId,
        altText: altText.substring(0, 100) + '...',
        length: altText.length,
        message: `Alt text is ${altText.length} characters (max 250)`
      });
    } else {
      // Valid alt text
      results.validAltTextCount++;
    }
  });

  return results;
}