File size: 830 Bytes
a67789e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// data_extraction/extractor.js
const fs = require("fs");
const cheerio = require("cheerio");

/**
 * Read an HTML file from disk and convert its text into a list of chat turns.
 *
 * Text is taken from every `<p>` and `<h1>`–`<h6>` element. Each piece is
 * trimmed and empty pieces are dropped. When that yields nothing, the text of
 * `<body>` is split on newlines instead, with the same trimming and dropping.
 *
 * The file is read synchronously; nothing is caught here, so any error thrown
 * while reading or parsing propagates to the caller.
 *
 * @param {string} filePath - Path of the HTML file to read (decoded as UTF-8).
 * @returns {{role: string, content: string}[]} One turn per text piece, in the
 *   order the pieces were collected. The first turn has role `"user"`; every
 *   later turn has role `"assistant"`. Empty when no text was found.
 */
function extractFromHTML(filePath) {
  const html = fs.readFileSync(filePath, "utf8");
  const $ = cheerio.load(html);
  const isNonEmpty = (piece) => piece.length > 0;

  // Preferred source: paragraph and heading elements.
  let pieces = $("p, h1, h2, h3, h4, h5, h6")
    .map((_, node) => $(node).text().trim())
    .get()
    .filter(isNonEmpty);

  // Fallback: no paragraph/heading text at all, so use the raw body text,
  // one candidate piece per line.
  if (pieces.length === 0) {
    const bodyLines = $("body").text().split("\n");
    pieces = bodyLines.map((line) => line.trim()).filter(isNonEmpty);
  }

  // Only the very first piece is attributed to the user.
  const turns = [];
  for (const [position, content] of pieces.entries()) {
    const role = position === 0 ? "user" : "assistant";
    turns.push({ role, content });
  }

  return turns;
}

// Public API of this module: expose the HTML-to-turns extractor as a named export.
module.exports = { extractFromHTML };