Spaces:
Runtime error
Runtime error
| import * as cheerio from 'cheerio'; | |
| import fs from 'fs'; // ✅ Add this import | |
/**
 * Extract structured data from a candidate-profile HTML page.
 *
 * The page is loaded with cheerio and scraped section by section: basic
 * candidate info, education, profession / income sources, other elections,
 * criminal cases, income tax, movable and immovable assets, liabilities,
 * contracts and the assets/liabilities summary.
 *
 * Side effects: progress is logged to the console and the resulting object is
 * written as pretty-printed JSON to `extracted_test.txt` in the current
 * working directory.
 *
 * @param {string} html - Markup handed to `cheerio.load`.
 * @returns {Promise<object>} Object with the keys `candidate`,
 *   `otherElections`, `crimeOMeter`, `incomeTax`, `criminalCases`,
 *   `movableAssets`, `immovableAssets`, `liabilities`, `profession`,
 *   `sourcesOfIncome`, `contracts` and `summary`.
 * @throws Any error raised while parsing or writing the output file is logged
 *   and rethrown unchanged.
 */
export async function extractData(html) {
  try {
    const $ = cheerio.load(html);

    // ============================================================================
    // UTILITY FUNCTIONS
    // ============================================================================

    // The gap between "Rs"/digits and the next token may be ordinary
    // whitespace, a raw U+00A0 (already covered by \s) or the serialized
    // `&nbsp;` entity, so all three are accepted. At least one separator is
    // still required.
    const amountAfterRs = /Rs(?:\s|&nbsp;)+([\d,]+)/i;
    const amountBeforeSpan = /([\d,]+)(?:\s|&nbsp;)+<span/;
    const propertyValue = /([\d,]+)(?:\s|&nbsp;)+<span[^>]*>.*?(?:Lacs?|Crore?)/i;
    // Entries inside a single cell are separated by a double <br>.
    const entrySeparator = /<br\s*\/?>\s*<br\s*\/?>/i;

    /** Whitespace-collapsed text of a cell; empty or missing cells become 'Nil'. */
    const getCellText = (cell) => {
      const text = $(cell).text().replace(/\s+/g, ' ').trim();
      return text || 'Nil';
    };

    /** Pull the first rupee amount out of an HTML fragment as `Rs <digits>`, else 'Nil'. */
    const extractAmount = (htmlString) => {
      if (!htmlString) return 'Nil';
      const plainText = $('<div>').html(htmlString).text().trim();
      if (plainText === 'Nil' || plainText === '') return 'Nil';

      const rsMatch = htmlString.match(amountAfterRs);
      if (rsMatch) return `Rs ${rsMatch[1]}`;

      const beforeSpanMatch = htmlString.match(amountBeforeSpan);
      if (beforeSpanMatch) return `Rs ${beforeSpanMatch[1]}`;

      const numberMatch = plainText.match(/^(\d[\d,]*)/);
      if (numberMatch) return `Rs ${numberMatch[1]}`;

      return 'Nil';
    };

    /** Amount of an optional cell; a missing cell yields 'Nil'. */
    const amountOf = (cell) => (cell ? extractAmount($(cell).html()) : 'Nil');

    /** Split a cell's HTML into its non-blank entries; empty/'Nil' cells give []. */
    const splitCellEntries = (cell) => {
      if (!cell) return [];
      const $cell = $(cell);
      const htmlContent = $cell.html() || '';
      const textContent = $cell.text().trim();
      if (!htmlContent || !textContent || textContent === 'Nil') return [];
      return htmlContent
        .split(entrySeparator)
        .filter((fragment) => fragment.trim() !== '');
    };

    /** Render each entry of a cell as `<description>: <amount>`, one per line. */
    const parseMultiEntryCell = (cell) => {
      const lines = [];
      for (const fragment of splitCellEntries(cell)) {
        const description = $('<div>').html(fragment).find('.desc').text().trim();
        if (description) {
          lines.push(`${description}: ${extractAmount(fragment)}`);
        }
      }
      return lines.length > 0 ? lines.join('\n') : 'Nil';
    };

    /** Render each property of a cell as `<description>[ - Rs <value>]`, one per line. */
    const parsePropertyCell = (cell) => {
      const lines = [];
      for (const fragment of splitCellEntries(cell)) {
        const mainDesc = $('<div>').html(fragment).find('.desc').first().text().trim();
        if (!mainDesc) continue;
        const valueMatch = fragment.match(propertyValue);
        lines.push(valueMatch ? `${mainDesc} - Rs ${valueMatch[1]}` : mainDesc);
      }
      return lines.length > 0 ? lines.join('\n') : 'Nil';
    };

    /**
     * Six consecutive owner columns starting at `firstIdx`. Only the "self"
     * column gets the structured parser; the others are read as plain text.
     */
    const ownerColumns = (cells, firstIdx, parseSelf) => ({
      self: parseSelf(cells[firstIdx]),
      spouse: getCellText(cells[firstIdx + 1]),
      huf: getCellText(cells[firstIdx + 2]),
      dependent1: getCellText(cells[firstIdx + 3]),
      dependent2: getCellText(cells[firstIdx + 4]),
      dependent3: getCellText(cells[firstIdx + 5]),
    });

    /**
     * Capture group 1 of `pattern` in the first div containing `label`.
     * Returns null when no such div exists and undefined when the div exists
     * but the pattern does not match.
     */
    const matchAfterLabel = (label, pattern) => {
      const div = $(`div:contains("${label}")`).first();
      return div.length ? div.text().match(pattern)?.[1]?.trim() : null;
    };

    /** Bold text in the second cell of row `rowIdx` ('' when absent). */
    const boldText = (table, rowIdx) =>
      table.find('tr').eq(rowIdx).find('td').eq(1).find('b').text().trim();

    // ============================================================================
    // EXTRACT BASIC INFO
    // ============================================================================
    const candidateName = $('h2').first().text().replace(/\(Winner\)/gi, '').trim();
    const constituencyText = $('h5').first().text().trim();
    const party = matchAfterLabel('Party:', /Party:\s*(.+)/);
    const relation = matchAfterLabel('S/o|D/o|W/o:', /S\/o\|D\/o\|W\/o:\s*(.+)/);
    const voterEnrollment = matchAfterLabel(
      'Name Enrolled as Voter in:',
      /Name Enrolled as Voter in:\s*(.+)/
    );
    // Missing div or missing digits both give null (never NaN).
    const ageDigits = matchAfterLabel('Age:', /Age:\s*(\d+)/);
    const age = ageDigits ? Number.parseInt(ageDigits, 10) : null;

    // ============================================================================
    // EDUCATION - Find the correct div (not the Crime-O-Meter one)
    // ============================================================================
    let education = 'N/A';
    $('h3').each((i, h3) => {
      const $h3 = $(h3);
      if ($h3.text().trim() !== 'Educational Details') return undefined;

      const cleanText = $h3
        .closest('div.w3-panel')
        .text()
        .replace('Educational Details', '')
        .replace(/\s+/g, ' ')
        .trim();
      // Keep only what follows the "Category:" label when it is present.
      const parts = cleanText.split(/Category:/);
      education = parts.length > 1 ? 'Category: ' + parts[1].trim() : cleanText;
      return false; // stop after the first matching heading
    });

    // ============================================================================
    // PROFESSION & INCOME
    // ============================================================================
    const professionTable = $('#profession table.w3-table');
    const profession = {
      self: boldText(professionTable, 0) || 'NA',
      spouse: boldText(professionTable, 1) || 'NA',
    };
    const incomeTable = $('#incomesource table.w3-table');
    const sourcesOfIncome = {
      self: boldText(incomeTable, 0) || 'Nil',
      spouse: boldText(incomeTable, 1) || 'NA',
      dependent: boldText(incomeTable, 2) || 'NA',
    };

    // ============================================================================
    // OTHER ELECTIONS
    // ============================================================================
    const otherElections = [];
    $('table:contains("Other Elections") tr').each((i, row) => {
      if (i <= 1) return; // first two rows are skipped as headers
      const cells = $(row).find('td');
      if (cells.length < 3) return;
      const declIn = getCellText(cells[0]);
      if (declIn && !declIn.includes('Click here')) {
        otherElections.push({
          declarationIn: declIn,
          declaredAssets: getCellText(cells[1]),
          declaredCases: Number.parseInt(getCellText(cells[2]), 10) || 0,
        });
      }
    });

    // ============================================================================
    // CRIMINAL CASES
    // ============================================================================
    const crimeCasesText = $('div:contains("Number of Criminal Cases:")').text();
    const criminalCases =
      Number.parseInt(crimeCasesText.match(/Number of Criminal Cases:\s*(\d+)/)?.[1], 10) || 0;

    const briefIPC = [];
    $('ul li').each((i, li) => {
      const match = $(li).text().trim().match(/(\d+)\s*charges related to\s*(.+)/i);
      if (match) {
        briefIPC.push({
          count: Number.parseInt(match[1], 10),
          section: match[2].trim(),
        });
      }
    });

    const pendingCases = [];
    $('#cases tr').each((i, row) => {
      if (i === 0) return;
      const cells = $(row).find('td');
      if (cells.length < 9) return;
      const serialNo = getCellText(cells[0]);
      if (serialNo && serialNo !== 'Serial No.' && !serialNo.includes('No Cases')) {
        pendingCases.push({
          serialNo,
          firNo: getCellText(cells[1]),
          caseNo: getCellText(cells[2]),
          court: getCellText(cells[3]),
          ipcSections: getCellText(cells[4]),
          otherDetails: getCellText(cells[5]),
          chargesFramed: getCellText(cells[6]),
          dateChargesFramed: getCellText(cells[7]),
          appealFiled: getCellText(cells[8]),
          appealDetails: cells.length > 9 ? getCellText(cells[9]) : 'Nil',
        });
      }
    });

    // ============================================================================
    // INCOME TAX
    // ============================================================================
    console.log('[INCOME_TAX] Parsing...');
    const incomeTax = [];
    const incomeTaxTable = $('#income_tax');
    if (incomeTaxTable.length) {
      incomeTaxTable.find('tr').each((i, row) => {
        if (i === 0) return;
        const cells = $(row).find('td');
        if (cells.length < 4) return;
        const incomeHtml = $(cells[3]).html() || '';
        const income = incomeHtml
          .replace(/<br\s*\/?>/gi, ' ** ')
          // Normalise non-breaking spaces, whether serialized as an entity
          // or present as the raw character.
          .replace(/&nbsp;|\u00a0/g, ' ')
          .replace(/<span[^>]*>/gi, '')
          .replace(/<\/span>/gi, '')
          .replace(/~/g, '')
          .trim();
        incomeTax.push({
          relation: getCellText(cells[0]),
          pan: getCellText(cells[1]),
          year: getCellText(cells[2]),
          income,
        });
      });
      console.log(`[INCOME_TAX] ✓ Parsed ${incomeTax.length} entries`);
    } else {
      console.warn('[INCOME_TAX] ⚠️ Table not found');
    }

    // ============================================================================
    // MOVABLE ASSETS
    // ============================================================================
    console.log('[MOVABLE] Parsing...');
    const movableAssets = [];
    const movableTable = $('#movable_assets');
    if (movableTable.length) {
      // Serial number of the most recent full row; continuation rows inherit it.
      let currentSrNo = null;
      movableTable.find('tr').each((i, row) => {
        if (i === 0) return;
        const cells = $(row).find('td');
        if (cells.length < 8) return;

        const firstCellText = getCellText(cells[0]);
        const secondCellText = getCellText(cells[1]);
        if (firstCellText === 'Sr No' || secondCellText === 'Description') return;

        // Skip total rows
        const secondLower = secondCellText.toLowerCase();
        if (secondLower.includes('gross total') || secondLower.includes('totals')) return;

        const firstCellRowspan = $(cells[0]).attr('rowspan');
        const isSerialNoCell = /^[ivxlcdm]+$/i.test(firstCellText);

        let srNo;
        let description;
        let firstOwnerIdx;
        if (cells.length >= 9 && (isSerialNoCell || firstCellRowspan)) {
          // Full row: serial number, description, then owner columns.
          srNo = firstCellText;
          description = secondCellText;
          currentSrNo = srNo;
          firstOwnerIdx = 2;
        } else if (cells.length === 8) {
          // Continuation row under a rowspan: the serial-number cell is absent.
          srNo = currentSrNo;
          description = firstCellText;
          firstOwnerIdx = 1;
        } else {
          return;
        }
        if (!description || description === 'Nil') return;

        movableAssets.push({
          srNo,
          description,
          ...ownerColumns(cells, firstOwnerIdx, parseMultiEntryCell),
          total: amountOf(cells[firstOwnerIdx + 6]),
        });
      });
      console.log(`[MOVABLE] ✓ Parsed ${movableAssets.length} assets`);
    } else {
      console.warn('[MOVABLE] ⚠️ Table not found');
    }

    // ============================================================================
    // IMMOVABLE ASSETS
    // ============================================================================
    console.log('[IMMOVABLE] Parsing...');
    const immovableAssets = [];
    const immovableTable = $('#immovable_assets');
    if (immovableTable.length) {
      immovableTable.find('tr').each((i, row) => {
        const cells = $(row).find('td');
        if (cells.length < 8) return;

        const srNo = getCellText(cells[0]);
        const description = getCellText(cells[1]);
        if (srNo === 'Sr No' || description === 'Description') return;

        const descLower = description.toLowerCase();
        if (descLower.includes('total current market') || descLower.includes('totals')) return;
        if (!srNo || srNo === 'Nil') return;

        immovableAssets.push({
          srNo,
          description,
          ...ownerColumns(cells, 2, parsePropertyCell),
          total: amountOf(cells[8]), // 'Nil' when the row has no ninth cell
        });
      });
      console.log(`[IMMOVABLE] ✓ Parsed ${immovableAssets.length} properties`);
    } else {
      console.warn('[IMMOVABLE] ⚠️ Table not found');
    }

    // ============================================================================
    // LIABILITIES - only section "i" is collected; its length comes from rowspan
    // ============================================================================
    console.log('[LIABILITIES] Parsing...');
    const liabilities = [];
    const liabilitiesTable = $('#liabilities');
    if (liabilitiesTable.length) {
      const defaultSectionIRows = 4; // used when rowspan is missing or unparsable
      let inSectionI = false;
      let sectionIRowsProcessed = 0;
      let maxSectionIRows = defaultSectionIRows;

      liabilitiesTable.find('tr').each((i, row) => {
        if (i === 0) return; // Skip header
        const cells = $(row).find('td');
        if (cells.length < 8) return;

        const firstCellText = getCellText(cells[0]);
        const firstCellRowspan = $(cells[0]).attr('rowspan');

        // First row of section i: carries the serial-number cell with rowspan.
        if (firstCellText === 'i' && firstCellRowspan) {
          inSectionI = true;
          maxSectionIRows = Number.parseInt(firstCellRowspan, 10) || defaultSectionIRows;
          liabilities.push({
            srNo: 'i',
            description: getCellText(cells[1]),
            ...ownerColumns(cells, 2, parseMultiEntryCell),
            total: amountOf(cells[8]),
          });
          sectionIRowsProcessed = 1;
          return;
        }

        // Sections ii, iii and iv are not collected.
        if (firstCellText === 'ii' || firstCellText === 'iii' || firstCellText === 'iv') {
          inSectionI = false;
          return;
        }

        // Continuation rows of section i have no serial-number cell (8 cells).
        if (inSectionI && sectionIRowsProcessed < maxSectionIRows && cells.length === 8) {
          const description = firstCellText;
          // The "Grand Total" row closes the section and is not recorded.
          if (description.toLowerCase().includes('grand total')) {
            inSectionI = false;
            return;
          }
          liabilities.push({
            srNo: 'i',
            description,
            ...ownerColumns(cells, 1, parseMultiEntryCell),
            total: amountOf(cells[7]),
          });
          sectionIRowsProcessed++;
        }
      });
      console.log(`[LIABILITIES] ✓ Parsed ${liabilities.length} items`);
    } else {
      console.warn('[LIABILITIES] ⚠️ Table not found');
    }

    // ============================================================================
    // CONTRACTS
    // ============================================================================
    const contracts = {
      candidate: 'NA',
      spouse: 'NA',
      dependents: 'NA',
      huf: 'NA',
      partnerships: 'NA',
      privateCompanies: 'NA',
    };
    $('#contractdetails tr').each((i, row) => {
      const cells = $(row).find('td');
      if (cells.length < 2) return;
      const desc = getCellText(cells[0]).toLowerCase();
      const details = $(cells[1]).find('b').text().trim() || getCellText(cells[1]);
      // Order matters: the first matching keyword wins.
      if (desc.includes('candidate')) contracts.candidate = details;
      else if (desc.includes('spouse')) contracts.spouse = details;
      else if (desc.includes('dependent')) contracts.dependents = details;
      else if (desc.includes('hindu undivided')) contracts.huf = details;
      else if (desc.includes('partnership')) contracts.partnerships = details;
      else if (desc.includes('private companies')) contracts.privateCompanies = details;
    });

    // ============================================================================
    // SUMMARY
    // ============================================================================
    const assetsRow = $('td:contains("Assets:")').first();
    const liabilitiesRow = $('td:contains("Liabilities:")').first();
    const summary = {
      totalAssets: assetsRow.length ? assetsRow.next().text().trim() : 'N/A',
      totalLiabilities: liabilitiesRow.length ? liabilitiesRow.next().text().trim() : 'N/A',
    };

    // ============================================================================
    // FINAL ASSEMBLY
    // ============================================================================
    const extractedData = {
      candidate: {
        name: candidateName,
        party,
        constituency: constituencyText,
        relation,
        age,
        voterEnrollment,
        education,
        professions: profession,
      },
      otherElections,
      crimeOMeter: {
        cases: criminalCases,
      },
      incomeTax,
      criminalCases: {
        briefIPC,
        pendingCases,
        convictedCases: pendingCases.length === 0 ? ['No Cases'] : [],
      },
      movableAssets,
      immovableAssets,
      liabilities,
      profession,
      sourcesOfIncome,
      contracts,
      summary,
    };

    console.log('\n✅ Extraction Summary:');
    console.log(` Name: ${candidateName}`);
    console.log(` Education: ${education}`);
    console.log(` Criminal Cases: ${criminalCases}`);
    console.log(` Income Tax Entries: ${incomeTax.length}`);
    console.log(` Movable Assets: ${movableAssets.length}`);
    console.log(` Immovable Assets: ${immovableAssets.length}`);
    console.log(` Liabilities: ${liabilities.length}`);

    // NOTE(review): this dump looks like a debugging aid. A write failure
    // (e.g. read-only working directory) rejects the whole extraction —
    // confirm that is intended.
    const outputPath = 'extracted_test.txt';
    fs.writeFileSync(outputPath, JSON.stringify(extractedData, null, 2), 'utf-8');
    console.log(`\n💾 Extracted data saved to ${outputPath}`);

    return extractedData;
  } catch (error) {
    console.error('❌ Extraction error:', error);
    throw error;
  }
}