import * as cheerio from 'cheerio'; import fs from 'fs'; export async function extractData(html) { try { const $ = cheerio.load(html); // ============================================================================ // UTILITY FUNCTIONS // ============================================================================ const getCellText = (cell) => { const text = $(cell).text().replace(/\s+/g, ' ').trim(); return text === '' || text === 'Nil' ? 'Nil' : text; }; const extractAmount = (htmlString) => { if (!htmlString) return 'Nil'; let rsMatch = htmlString.match(/Rs\s* \s*([\d,]+)/i); if (!rsMatch) { rsMatch = htmlString.match(/Rs\s+([\d,]+)/i); } if (rsMatch) { return `Rs ${rsMatch[1]}`; } const beforeDescMatch = htmlString.match(/([\d,]+)\s*(?: |~)/); if (beforeDescMatch) { return `Rs ${beforeDescMatch[1]}`; } const numMatch = htmlString.match(/([\d,]+)/); return numMatch ? `Rs ${numMatch[1]}` : 'Nil'; }; const parseMultiEntryCell = (cell) => { if (!cell) return 'Nil'; const $cell = $(cell); const htmlContent = $cell.html() || ''; const textContent = $cell.text().trim(); if (!htmlContent || !textContent || textContent === 'Nil') { return 'Nil'; } const entries = htmlContent.split(/\s*/i); const parsedEntries = []; entries.forEach(entry => { if (!entry || entry.trim() === '') return; const $entry = $('
').html(entry); const description = $entry.find('.desc').text().trim(); if (description) { const amount = extractAmount(entry); parsedEntries.push(`${description}: ${amount}`); } else { const amount = extractAmount(entry); if (amount !== 'Nil') { parsedEntries.push(amount); } } }); return parsedEntries.length > 0 ? parsedEntries.join('\n') : textContent; }; const parsePropertyCell = (cell) => { if (!cell) return 'Nil'; const $cell = $(cell); const htmlContent = $cell.html() || ''; const textContent = $cell.text().trim(); if (!htmlContent || !textContent || textContent === 'Nil') { return 'Nil'; } const properties = htmlContent.split(/\s*/i); const parsedProperties = []; properties.forEach(prop => { if (!prop || prop.trim() === '') return; const $prop = $('
').html(prop); const mainDesc = $prop.find('.desc').first().text().trim(); if (!mainDesc) return; const details = { description: mainDesc, totalArea: '', builtUpArea: '', inherited: '', purchaseDate: '', purchaseCost: '', currentValue: '' }; const fullText = prop; const totalAreaMatch = fullText.match(/Total Area[:\s]*([^<]+)/i); if (totalAreaMatch) details.totalArea = totalAreaMatch[1].trim(); const builtAreaMatch = fullText.match(/Built Up Area[:\s]*([^<]+)/i); if (builtAreaMatch) details.builtUpArea = builtAreaMatch[1].trim(); const inheritedMatch = fullText.match(/Whether Inherited[:\s]*([YN])/i); if (inheritedMatch) details.inherited = inheritedMatch[1]; const purchaseDateMatch = fullText.match(/Purchase Date[:\s]*([^<]+)/i); if (purchaseDateMatch) details.purchaseDate = purchaseDateMatch[1].trim(); const purchaseCostMatch = fullText.match(/Purchase Cost[:\s]*([\d.]+)/i); if (purchaseCostMatch) details.purchaseCost = purchaseCostMatch[1]; const valueMatch = fullText.match(/(\d[\d,]*)\s*(?: |~)/); if (valueMatch) details.currentValue = `Rs ${valueMatch[1]}`; let formatted = `${details.description}`; if (details.totalArea) formatted += ` | Area: ${details.totalArea}`; if (details.builtUpArea) formatted += ` | Built: ${details.builtUpArea}`; if (details.inherited) formatted += ` | Inherited: ${details.inherited === 'Y' ? 'Yes' : 'No'}`; if (details.purchaseDate && details.purchaseDate !== '0000-00-00') formatted += ` | Date: ${details.purchaseDate}`; if (details.currentValue) formatted += ` | Value: ${details.currentValue}`; parsedProperties.push(formatted); }); return parsedProperties.length > 0 ? parsedProperties.join('\n\n') : textContent; }; // ============================================================================ // EXTRACT BASIC INFO (Always works) // ============================================================================ const candidateName = $('h2').first().text().replace(/\(Winner\)/gi, '').trim(); const constituencyText = $('h5').first().text().trim(); const partyDiv = $('div:contains("Party:")').first(); const party = partyDiv.length ? partyDiv.text().match(/Party:\s*(.+)/)?.[1]?.trim() : null; const relationDiv = $('div:contains("S/o|D/o|W/o:")').first(); const relation = relationDiv.length ? relationDiv.text().match(/S\/o\|D\/o\|W\/o:\s*(.+)/)?.[1]?.trim() : null; const ageDiv = $('div:contains("Age:")').first(); const age = ageDiv.length ? parseInt(ageDiv.text().match(/Age:\s*(\d+)/)?.[1]) : null; const voterDiv = $('div:contains("Name Enrolled as Voter in:")').first(); const voterEnrollment = voterDiv.length ? voterDiv.text().match(/Name Enrolled as Voter in:\s*(.+)/)?.[1]?.trim() : null; let education = 'N/A'; const educationDiv = $('div:contains("Educational Details")').first(); if (educationDiv.length) { let fullText = educationDiv.text(); // Find where "Educational Details" starts const startMarker = 'Educational Details'; const startIndex = fullText.indexOf(startMarker); if (startIndex !== -1) { let eduText = fullText.substring(startIndex + startMarker.length); const endMarkers = [ 'Crime-O-Meter', 'Assets & Liabilities', 'google.charts', 'No criminal cases', 'function drawChart' ]; for (const marker of endMarkers) { const markerIndex = eduText.indexOf(marker); if (markerIndex !== -1) { eduText = eduText.substring(0, markerIndex); } } eduText = eduText .replace(/Category:/gi, '') .replace(/\s+/g, ' ') .trim(); if (eduText && eduText.length > 0) { education = eduText; } }} // ============================================================================ // PROFESSION & INCOME SOURCES (Fixed indices) // ============================================================================ const professionTable = $('#profession table.w3-table'); const profession = { self: professionTable.find('tr').eq(0).find('td').eq(1).find('b').text().trim() || 'NA', spouse: professionTable.find('tr').eq(1).find('td').eq(1).find('b').text().trim() || 'NA' }; const incomeTable = $('#incomesource table.w3-table'); const sourcesOfIncome = { self: incomeTable.find('tr').eq(0).find('td').eq(1).find('b').text().trim() || 'Nil', spouse: incomeTable.find('tr').eq(1).find('td').eq(1).find('b').text().trim() || 'NA', dependent: incomeTable.find('tr').eq(2).find('td').eq(1).find('b').text().trim() || 'NA' }; // ============================================================================ // OTHER ELECTIONS // ============================================================================ const otherElections = []; $('table:contains("Other Elections") tr').each((i, row) => { if (i <= 1) return; const cells = $(row).find('td'); if (cells.length >= 3) { const declIn = getCellText(cells[0]); if (declIn && !declIn.includes('Click here')) { otherElections.push({ declarationIn: declIn, declaredAssets: getCellText(cells[1]), declaredCases: parseInt(getCellText(cells[2])) || 0 }); } } }); // ============================================================================ // CRIMINAL CASES // ============================================================================ const crimeCasesText = $('div:contains("Number of Criminal Cases:")').text(); const criminalCases = parseInt(crimeCasesText.match(/Number of Criminal Cases:\s*(\d+)/)?.[1]) || 0; const briefIPC = []; $('ul li').each((i, li) => { const text = $(li).text().trim(); const match = text.match(/(\d+)\s*charges related to\s*(.+)/i); if (match) { briefIPC.push({ count: parseInt(match[1]), section: match[2].trim() }); } }); const pendingCases = []; $('#cases tr').each((i, row) => { if (i === 0) return; const cells = $(row).find('td'); if (cells.length >= 9) { const serialNo = getCellText(cells[0]); if (serialNo && serialNo !== 'Serial No.' && !serialNo.includes('No Cases')) { pendingCases.push({ serialNo, firNo: getCellText(cells[1]), caseNo: getCellText(cells[2]), court: getCellText(cells[3]), ipcSections: getCellText(cells[4]), otherDetails: getCellText(cells[5]), chargesFramed: getCellText(cells[6]), dateChargesFramed: getCellText(cells[7]), appealFiled: getCellText(cells[8]), appealDetails: cells.length > 9 ? getCellText(cells[9]) : 'Nil' }); } } }); // ============================================================================ // INCOME TAX (Delayed loading - robust parsing) // ============================================================================ console.log('[INCOME_TAX] Parsing...'); const incomeTax = []; const incomeTaxTable = $('#income_tax'); if (incomeTaxTable.length) { incomeTaxTable.find('tr').each((i, row) => { if (i === 0) return; // Skip header const cells = $(row).find('td'); if (cells.length >= 4) { const relation = getCellText(cells[0]); const pan = getCellText(cells[1]); const year = getCellText(cells[2]); const incomeHtml = $(cells[3]).html() || ''; let income = incomeHtml .replace(//gi, ' ** ') .replace(/ /g, ' ') .replace(/]*>/gi, '') .replace(/<\/span>/gi, '') .replace(/~/g, '') .trim(); incomeTax.push({ relation, pan, year, income }); } }); console.log(`[INCOME_TAX] ✓ Parsed ${incomeTax.length} entries`); } else { console.warn('[INCOME_TAX] ⚠️ Table not found'); } // ============================================================================ // MOVABLE ASSETS (Delayed loading - robust parsing) // ============================================================================ console.log('[MOVABLE] Parsing...'); const movableAssets = []; const movableTable = $('#movable_assets'); if (movableTable.length) { movableTable.find('tr').each((i, row) => { const cells = $(row).find('td'); if (cells.length < 8) return; const srNo = getCellText(cells[0]); const description = getCellText(cells[1]); if (srNo === 'Sr No' || description === 'Description') return; const descLower = description.toLowerCase(); if (descLower.includes('gross total') || descLower.includes('total value') || descLower.includes('totals') || srNo.toLowerCase().includes('total')) { return; } if (!srNo || srNo === 'Nil') return; movableAssets.push({ srNo, description, self: parseMultiEntryCell(cells[2]), spouse: getCellText(cells[3]), huf: getCellText(cells[4]), dependent1: getCellText(cells[5]), dependent2: getCellText(cells[6]), dependent3: getCellText(cells[7]), total: cells.length > 8 ? extractAmount($(cells[8]).html()) : 'Nil' }); }); console.log(`[MOVABLE] ✓ Parsed ${movableAssets.length} assets`); } else { console.warn('[MOVABLE] ⚠️ Table not found'); } // ============================================================================ // IMMOVABLE ASSETS (Delayed loading - robust parsing) // ============================================================================ console.log('[IMMOVABLE] Parsing...'); const immovableAssets = []; const immovableTable = $('#immovable_assets'); if (immovableTable.length) { immovableTable.find('tr').each((i, row) => { const cells = $(row).find('td'); if (cells.length < 8) return; const srNo = getCellText(cells[0]); const description = getCellText(cells[1]); if (srNo === 'Sr No' || description === 'Description') return; const descLower = description.toLowerCase(); if (descLower.includes('total current market') || descLower.includes('totals calculated') || srNo.toLowerCase().includes('total')) { return; } if (!srNo || srNo === 'Nil') return; immovableAssets.push({ srNo, description, self: parsePropertyCell(cells[2]), spouse: getCellText(cells[3]), huf: getCellText(cells[4]), dependent1: getCellText(cells[5]), dependent2: getCellText(cells[6]), dependent3: getCellText(cells[7]), total: cells.length > 8 ? extractAmount($(cells[8]).html()) : 'Nil' }); }); console.log(`[IMMOVABLE] ✓ Parsed ${immovableAssets.length} properties`); } else { console.warn('[IMMOVABLE] ⚠️ Table not found'); } // ============================================================================ // LIABILITIES (Delayed loading - robust parsing) // ============================================================================ console.log('[LIABILITIES] Parsing...'); const liabilities = []; const liabilitiesTable = $('#liabilities'); if (liabilitiesTable.length) { liabilitiesTable.find('tr').each((i, row) => { const cells = $(row).find('td'); if (cells.length < 8) return; const srNo = getCellText(cells[0]); const description = getCellText(cells[1]); if (srNo === 'Sr No' || description === 'Description') return; const descLower = description.toLowerCase(); if (descLower.includes('grand total') || descLower.includes('totals calculated') || descLower.includes('govt dues') || descLower.includes('dues to departments') || descLower.includes('tax dues') || descLower.includes('whether any other') || srNo === 'ii' || srNo === 'iii' || srNo === 'iv') { return; } if (!srNo || srNo === 'Nil') return; if (srNo !== 'i' && !srNo.match(/^\d+$/)) return; liabilities.push({ srNo, description, self: parseMultiEntryCell(cells[2]), spouse: getCellText(cells[3]), huf: getCellText(cells[4]), dependent1: getCellText(cells[5]), dependent2: getCellText(cells[6]), dependent3: getCellText(cells[7]), total: cells.length > 8 ? extractAmount($(cells[8]).html()) : 'Nil' }); }); console.log(`[LIABILITIES] ✓ Parsed ${liabilities.length} items`); } else { console.warn('[LIABILITIES] ⚠️ Table not found'); } // ============================================================================ // CONTRACTS // ============================================================================ const contracts = { candidate: 'NA', spouse: 'NA', dependents: 'NA', huf: 'NA', partnerships: 'NA', privateCompanies: 'NA' }; $('#contractdetails tr').each((i, row) => { const cells = $(row).find('td'); if (cells.length >= 2) { const desc = getCellText(cells[0]).toLowerCase(); const details = $(cells[1]).find('b').text().trim() || getCellText(cells[1]); if (desc.includes('candidate')) contracts.candidate = details; else if (desc.includes('spouse')) contracts.spouse = details; else if (desc.includes('dependent')) contracts.dependents = details; else if (desc.includes('hindu undivided')) contracts.huf = details; else if (desc.includes('partnership')) contracts.partnerships = details; else if (desc.includes('private companies')) contracts.privateCompanies = details; } }); // ============================================================================ // SUMMARY // ============================================================================ const assetsRow = $('td:contains("Assets:")').first(); const liabilitiesRow = $('td:contains("Liabilities:")').first(); const summary = { totalAssets: assetsRow.length ? assetsRow.next().text().trim() : 'N/A', totalLiabilities: liabilitiesRow.length ? liabilitiesRow.next().text().trim() : 'N/A' }; // ============================================================================ // FINAL ASSEMBLY // ============================================================================ const extractedData = { candidate: { name: candidateName, party, constituency: constituencyText, relation, age, voterEnrollment, education, professions: profession }, otherElections, crimeOMeter: { cases: criminalCases }, incomeTax, criminalCases: { briefIPC, pendingCases, convictedCases: pendingCases.length === 0 ? ['No Cases'] : [] }, movableAssets, immovableAssets, liabilities, profession, sourcesOfIncome, contracts, summary }; console.log('\n✅ Extraction Summary:'); console.log(` Name: ${candidateName}`); console.log(` Criminal Cases: ${criminalCases}`); console.log(` Income Tax Entries: ${incomeTax.length}`); console.log(` Movable Assets: ${movableAssets.length}`); console.log(` Immovable Assets: ${immovableAssets.length}`); console.log(` Liabilities: ${liabilities.length}`); const outputPath = 'extracted_test.txt'; fs.writeFileSync(outputPath, JSON.stringify(extractedData, null, 2), 'utf-8'); console.log(`\n💾 Extracted data saved to ${outputPath}`); return extractedData; } catch (error) { console.error('❌ Extraction error:', error); throw error; } }