Spaces:

romizone
/

open-chatbot

Sleeping

File size: 7,834 Bytes

c730f0b

/* eslint-disable @typescript-eslint/no-require-imports */
import { IMAGE_EXTENSIONS, TEXT_EXTENSIONS, SUPPORTED_EXTENSIONS } from "./constants";
import type { FileContext } from "./types";
import { v4 as uuidv4 } from "uuid";

/**
 * Returns the lower-cased text that follows the last "." in `filename`.
 *
 * The result is an empty string when the name has no dot or ends with one.
 * A name that starts with a dot, such as ".env", yields "env".
 */
function getExtension(filename: string): string {
  const lastDot = filename.lastIndexOf(".");
  if (lastDot === -1) {
    return "";
  }
  return filename.slice(lastDot + 1).toLowerCase();
}

/**
 * Extracts text from an uploaded file and wraps it in a `FileContext`.
 *
 * The extractor is chosen from the extension of `filename`. An unsupported
 * extension, or an error thrown by the chosen extractor, is reported in the
 * `error` field of the returned object.
 *
 * @param buffer - Raw file contents.
 * @param filename - File name; stored on the result and used for its extension.
 * @returns The populated context (`text` on success, `error` on failure).
 */
export async function processFile(
  buffer: Buffer,
  filename: string
): Promise<FileContext> {
  const extension = getExtension(filename);
  const context: FileContext = {
    id: uuidv4(),
    filename,
    extension,
    text: "",
    error: null,
    size: buffer.length,
  };

  // Reject early so no extractor runs for formats outside the allow-list.
  if (!SUPPORTED_EXTENSIONS.includes(extension)) {
    context.error = `Format '.${extension}' belum didukung.`;
    return context;
  }

  try {
    switch (extension) {
      case "pdf":
        context.text = await processPdf(buffer);
        break;
      case "doc":
        context.text = await processDoc(buffer);
        break;
      case "docx":
        context.text = await processDocx(buffer);
        break;
      case "xlsx":
      case "xls":
        context.text = processExcel(buffer);
        break;
      case "csv":
        context.text = processCsv(buffer);
        break;
      default:
        if (IMAGE_EXTENSIONS.includes(extension)) {
          context.text = await processImage(buffer);
        } else if (TEXT_EXTENSIONS.includes(extension)) {
          context.text = processText(buffer);
        } else {
          // Any other supported extension is also read as plain text.
          context.text = processText(buffer);
        }
    }
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    context.error = `Error memproses '${filename}': ${reason}`;
  }

  return context;
}

/**
 * Extracts text from a PDF, trying the `pdftotext` CLI before OCR.
 *
 * The buffer is written into a fresh temporary directory and converted with
 * `pdftotext -layout`. If the trimmed output is longer than 50 characters it
 * is returned. Otherwise, or when the `pdftotext` step throws, the buffer is
 * handed to `ocrPdf`. The temporary directory is removed before the fallback
 * starts.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The extracted text, or whatever `ocrPdf` returns.
 */
async function processPdf(buffer: Buffer): Promise<string> {
  const { writeFile, readFile, mkdtemp, rm } = require("fs/promises");
  const { tmpdir } = require("os");
  const path = require("path");
  const { execFile } = require("child_process");

  const workDir = await mkdtemp(path.join(tmpdir(), "pdf-txt-"));
  const inputPath = path.join(workDir, "input.pdf");
  const outputPath = path.join(workDir, "output.txt");

  // Promise wrapper around the callback-style execFile call (15 s timeout).
  const runPdftotext = (): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      execFile(
        "pdftotext",
        ["-layout", inputPath, outputPath],
        { timeout: 15000 },
        (error: Error | null) => {
          if (error) {
            reject(error);
            return;
          }
          resolve();
        }
      );
    });

  // Stage 1: text extraction with pdftotext.
  try {
    await writeFile(inputPath, buffer);
    await runPdftotext();

    const extracted = (await readFile(outputPath, "utf-8")).trim();
    console.log(`[pdf] pdftotext extracted ${extracted.length} chars`);

    // 50 characters or fewer falls through to the OCR stage below.
    if (extracted.length > 50) {
      return extracted;
    }
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.log(`[pdf] pdftotext failed: ${reason}`);
  } finally {
    await rm(workDir, { recursive: true, force: true }).catch(() => {});
  }

  // Stage 2: OCR fallback.
  console.log("[pdf] Text extraction empty, starting OCR fallback...");
  return await ocrPdf(buffer);
}

/**
 * OCRs a PDF by rasterising its pages with `pdftoppm` and running the
 * `tesseract` CLI on each resulting PNG.
 *
 * At most the first 20 pages are rasterised (300 DPI). Each page whose OCR
 * output is non-empty contributes a section headed `--- Halaman N ---`, where
 * N is the page number taken from the image file name, so the label stays
 * correct when earlier pages are blank or fail OCR. Pages that fail OCR are
 * logged and skipped. All work happens in a temporary directory that is
 * removed before returning.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The page sections joined by blank lines, or a fixed placeholder
 *   message when no page produced any text.
 * @throws If writing the temporary PDF, running `pdftoppm`, or listing the
 *   directory fails.
 */
async function ocrPdf(buffer: Buffer): Promise<string> {
  const { writeFile, readFile, readdir, mkdtemp, rm } = require("fs/promises");
  const { tmpdir } = require("os");
  const path = require("path");
  const { execFile } = require("child_process");

  const noTextMessage = "(PDF berisi gambar tapi tidak dapat di-OCR)";
  // pdftoppm names its outputs "<root>-<page number>.png".
  const pageNumberPattern = /-(\d+)\.png$/;

  // Runs a CLI tool; rejects when execFile reports an error.
  const run = (command: string, args: string[], timeout: number): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      execFile(command, args, { timeout }, (error: Error | null) => {
        if (error) reject(error);
        else resolve();
      });
    });

  const tmpDir = await mkdtemp(path.join(tmpdir(), "pdf-ocr-"));
  const pdfPath = path.join(tmpDir, "input.pdf");

  try {
    await writeFile(pdfPath, buffer);

    // Convert PDF to PNG images using pdftoppm (poppler).
    // Always limit to 20 pages max to avoid excessive processing.
    await run(
      "pdftoppm",
      ["-png", "-r", "300", "-l", "20", pdfPath, path.join(tmpDir, "page")],
      120000
    );

    // Find all generated page images.
    const files: string[] = await readdir(tmpDir);
    const pageFiles = files
      .filter((f) => f.startsWith("page") && f.endsWith(".png"))
      .sort();

    if (pageFiles.length === 0) {
      return noTextMessage;
    }

    // OCR each page.
    const results: string[] = [];
    let position = 0;
    for (const pageFile of pageFiles) {
      position += 1;
      // Prefer the real page number from the file name; fall back to the
      // position in the sorted list if the name has an unexpected shape.
      const match = pageNumberPattern.exec(pageFile);
      const pageNumber = match ? Number(match[1]) : position;

      const imgPath = path.join(tmpDir, pageFile);
      const ocrBase = path.join(tmpDir, `ocr-${pageFile}`);

      try {
        await run("tesseract", [imgPath, ocrBase, "-l", "eng+ind"], 60000);

        // The OCR result is read from "<ocrBase>.txt".
        const pageText: string = (await readFile(`${ocrBase}.txt`, "utf-8")).trim();
        if (pageText) {
          results.push(`--- Halaman ${pageNumber} ---\n${pageText}`);
        }
      } catch (e: unknown) {
        // Best effort: skip this page, but leave a trace for diagnosis.
        const reason = e instanceof Error ? e.message : String(e);
        console.log(`[pdf] OCR failed for page ${pageNumber}: ${reason}`);
      }
    }

    return results.length > 0 ? results.join("\n\n") : noTextMessage;
  } finally {
    // Clean up temp directory.
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}

/**
 * Extracts the body text of a legacy `.doc` file with `word-extractor`.
 *
 * The buffer is first written to a uniquely named file in the OS temp
 * directory, and the extractor is given that path. The file is deleted
 * afterwards whether or not extraction succeeds.
 *
 * @param buffer - Raw `.doc` bytes.
 * @returns The trimmed document body.
 */
async function processDoc(buffer: Buffer): Promise<string> {
  const fs = require("fs/promises");
  const os = require("os");
  const path = require("path");

  const scratchFile = path.join(os.tmpdir(), `doc-${uuidv4()}.doc`);
  try {
    await fs.writeFile(scratchFile, buffer);

    const WordExtractor = require("word-extractor");
    const parsed = await new WordExtractor().extract(scratchFile);
    const body: string = parsed.getBody();
    return body.trim();
  } finally {
    // Best-effort cleanup; a failed delete must not mask the real outcome.
    await fs.unlink(scratchFile).catch(() => {});
  }
}

/**
 * Extracts the raw text of a `.docx` file using `mammoth`, which is loaded
 * through a dynamic import.
 *
 * @param buffer - Raw `.docx` bytes.
 * @returns The trimmed raw text reported by `mammoth.extractRawText`.
 */
async function processDocx(buffer: Buffer): Promise<string> {
  const mammothLib = await import("mammoth");
  const { value } = await mammothLib.extractRawText({ buffer });
  return value.trim();
}

/**
 * Renders every sheet of an Excel workbook as CSV text.
 *
 * Each sheet becomes a section headed `--- Sheet: <name> ---`; sections are
 * separated by a blank line and follow the order of `workbook.SheetNames`.
 *
 * @param buffer - Raw workbook bytes.
 * @returns The concatenated per-sheet CSV sections.
 */
function processExcel(buffer: Buffer): string {
  const XLSX = require("xlsx");
  const workbook = XLSX.read(buffer, { type: "buffer" });

  const sections: string[] = workbook.SheetNames.map((sheetName: string) => {
    const csv = XLSX.utils.sheet_to_csv(workbook.Sheets[sheetName]);
    return `--- Sheet: ${sheetName} ---\n${csv}`;
  });

  return sections.join("\n\n");
}

/**
 * Parses a CSV buffer with the `xlsx` library and serialises its first sheet
 * back to CSV text.
 *
 * NOTE(review): the content passes through the spreadsheet parser rather than
 * being returned verbatim, so the output may not match the input exactly;
 * confirm this is intended.
 *
 * @param buffer - Raw CSV bytes.
 * @returns CSV text of the first parsed sheet.
 */
function processCsv(buffer: Buffer): string {
  const xlsx = require("xlsx");
  const parsed = xlsx.read(buffer, { type: "buffer" });
  const firstSheetName = parsed.SheetNames[0];
  const firstSheet = parsed.Sheets[firstSheetName];
  return xlsx.utils.sheet_to_csv(firstSheet);
}

/**
 * OCRs a single image with the system `tesseract` CLI (`eng+ind` languages).
 *
 * The image is written to a uniquely named file in the OS temp directory and
 * Tesseract is pointed at a second unique output base there. The text is read
 * from `<output base>.txt`. Both files are deleted afterwards regardless of
 * the outcome.
 *
 * @param buffer - Raw image bytes.
 * @returns The trimmed OCR text.
 * @throws If writing the image, running Tesseract, or reading its output fails.
 */
async function processImage(buffer: Buffer): Promise<string> {
  const { writeFile, readFile, unlink } = require("fs/promises");
  const { tmpdir } = require("os");
  const path = require("path");
  const { execFile } = require("child_process");

  const imageFile = path.join(tmpdir(), `ocr-${uuidv4()}`);
  const resultBase = path.join(tmpdir(), `ocr-out-${uuidv4()}`);
  const resultFile = `${resultBase}.txt`;

  // Use system tesseract CLI — avoids Turbopack module resolution issues.
  const runTesseract = (): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      execFile(
        "tesseract",
        [imageFile, resultBase, "-l", "eng+ind"],
        { timeout: 60000 },
        (error: Error | null) => {
          if (error) {
            reject(error);
            return;
          }
          resolve();
        }
      );
    });

  try {
    await writeFile(imageFile, buffer);
    await runTesseract();

    const recognised: string = await readFile(resultFile, "utf-8");
    return recognised.trim();
  } finally {
    // Best-effort cleanup of both temp files.
    await unlink(imageFile).catch(() => {});
    await unlink(resultFile).catch(() => {});
  }
}

/**
 * Decodes a buffer as text, preferring UTF-8 and falling back to Latin-1.
 *
 * `Buffer#toString("utf-8")` never throws on malformed input (it substitutes
 * U+FFFD), so a plain try/catch around it can never reach the fallback. A
 * fatal `TextDecoder` is used instead so invalid UTF-8 is actually detected.
 *
 * @param buffer - Raw file bytes.
 * @returns The UTF-8 decoding when the bytes are valid UTF-8, otherwise the
 *   Latin-1 decoding (one character per byte).
 */
function processText(buffer: Buffer): string {
  try {
    // fatal: throw on invalid byte sequences instead of inserting U+FFFD.
    // ignoreBOM: keep a leading BOM in the output, matching Buffer#toString.
    const strictUtf8 = new TextDecoder("utf-8", { fatal: true, ignoreBOM: true });
    return strictUtf8.decode(buffer);
  } catch {
    return buffer.toString("latin1");
  }
}