Spaces:
Running
Running
File size: 830 Bytes
a67789e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
// data_extraction/extractor.js
const fs = require("fs");
const cheerio = require("cheerio");
/**
 * Read an HTML file and convert its text into a flat list of chat turns.
 *
 * The text of every paragraph and heading (`p`, `h1`-`h6`) is collected,
 * trimmed, and kept only if non-empty. When that yields nothing, the text of
 * `<body>` is split on newlines instead, applying the same trim and
 * drop-empty rule. The first resulting entry is labelled "user"; every later
 * entry is labelled "assistant".
 *
 * The file is read synchronously and no errors are caught here, so anything
 * thrown by `fs.readFileSync` propagates to the caller.
 *
 * @param {string} filePath - Path passed straight to `fs.readFileSync`; the
 *   file is decoded as UTF-8.
 * @returns {Array<{role: string, content: string}>} One turn per text block;
 *   an empty array when neither pass finds any non-blank text.
 */
function extractFromHTML(filePath) {
  const html = fs.readFileSync(filePath, "utf8");
  const $ = cheerio.load(html);

  const hasText = (value) => value.length > 0;

  // First pass: structured text taken from paragraphs and headings.
  let texts = $("p, h1, h2, h3, h4, h5, h6")
    .toArray()
    .map((node) => $(node).text().trim())
    .filter(hasText);

  // Second pass, only when the first found nothing: body text, line by line.
  if (texts.length === 0) {
    const bodyText = $("body").text();
    texts = bodyText
      .split("\n")
      .map((line) => line.trim())
      .filter(hasText);
  }

  // Only the very first block is attributed to the user.
  const turns = [];
  texts.forEach((content, position) => {
    const role = position === 0 ? "user" : "assistant";
    turns.push({ role, content });
  });

  return turns;
}
// Public surface of this module: extractFromHTML is its only export.
module.exports = { extractFromHTML };
|