htaf's picture
added data extractor
a67789e
raw
history blame contribute delete
830 Bytes
// data_extraction/extractor.js
const fs = require("fs");
const cheerio = require("cheerio");
/**
 * Read an HTML file from disk and convert its text into a list of turns.
 *
 * The text of every paragraph and heading (`p`, `h1`–`h6`) is collected in
 * document order, trimmed, and kept only when non-empty. When the document
 * yields no such text, the text of `<body>` is used instead, split on
 * newlines with the same trim / drop-empty treatment.
 *
 * The first collected piece of text becomes a `"user"` turn; every piece
 * after it becomes an `"assistant"` turn.
 *
 * @param {string} filePath - Path of the HTML file, read synchronously as UTF-8.
 * @returns {Array<{role: string, content: string}>} One turn per piece of
 *   text; an empty array when no text was found.
 * @throws Propagates whatever `fs.readFileSync` throws for an unreadable path.
 */
function extractFromHTML(filePath) {
  const $ = cheerio.load(fs.readFileSync(filePath, "utf8"));

  // Gather the text of structured blocks (paragraphs and headings).
  const segments = [];
  $("p, h1, h2, h3, h4, h5, h6").each((_position, element) => {
    const cleaned = $(element).text().trim();
    if (cleaned !== "") {
      segments.push(cleaned);
    }
  });

  // Nothing structured was found: fall back to the body text, line by line.
  if (segments.length === 0) {
    for (const rawLine of $("body").text().split("\n")) {
      const cleaned = rawLine.trim();
      if (cleaned !== "") {
        segments.push(cleaned);
      }
    }
  }

  // Only the very first segment is attributed to the user.
  const turns = [];
  for (const [position, content] of segments.entries()) {
    const role = position === 0 ? "user" : "assistant";
    turns.push({ role, content });
  }
  return turns;
}
// Public API of this module: the HTML-to-turns extractor defined above.
module.exports = { extractFromHTML };