Spaces:
Running
Running
| // data_extraction/extractor.js | |
| const fs = require("fs"); | |
| const cheerio = require("cheerio"); | |
/**
 * Load an HTML file and convert its text content into a list of chat turns.
 *
 * Text is collected from every `<p>` and `<h1>`–`<h6>` element; each element's
 * text is trimmed and empty results are discarded. When that yields nothing,
 * the text of `<body>` is split on newlines instead, with each line trimmed
 * and empty lines discarded.
 *
 * @param {string} filePath - Path of the HTML file; read synchronously as UTF-8.
 * @returns {{role: string, content: string}[]} One turn per text segment. The
 *   first segment gets role "user"; every later segment gets role "assistant".
 *   Empty array when no text is found.
 */
function extractFromHTML(filePath) {
  const html = fs.readFileSync(filePath, "utf8");
  const $ = cheerio.load(html);

  // Shared predicate: keeps only strings that still have content after trimming.
  const hasText = (s) => s.length > 0;

  // Primary source: paragraph and heading elements.
  const matched = $("p, h1, h2, h3, h4, h5, h6");
  const blockTexts = matched.map((_, node) => $(node).text().trim()).get();
  let segments = blockTexts.filter(hasText);

  // Fallback for documents without any usable paragraph/heading text:
  // treat each non-empty line of the body text as its own segment.
  if (segments.length === 0) {
    const bodyText = $("body").text();
    segments = bodyText
      .split("\n")
      .map((piece) => piece.trim())
      .filter(hasText);
  }

  // Only the very first segment is attributed to the user.
  const turns = [];
  segments.forEach((content, position) => {
    const role = position === 0 ? "user" : "assistant";
    turns.push({ role, content });
  });
  return turns;
}
| module.exports = { extractFromHTML }; | |