CarouselForge Developer
fix: resolve HF build - ESLint rule and img tags
ef4c2aa
/**
* PDF Parser
* Extracts text from PDF files
*/
import pdfParse from 'pdf-parse';
/**
 * Result object returned by the PDF extraction helpers in this module.
 * `success` tells the caller which of the optional fields to expect.
 */
export interface PDFContent {
/** True when text was extracted; false when extraction failed. */
success: boolean;
/** Cleaned text, as populated by extractPDFText (capped at 10000 characters there). */
text?: string;
/** Title guessed from the extracted text by extractTitle. */
title?: string;
/** Page count as reported by the PDF parser (`numpages`). */
pageCount?: number;
/** Human-readable failure reason; set on the failure paths. */
error?: string;
}
/**
 * Extract text from a PDF buffer.
 *
 * The raw text from the parser is normalised: every line is trimmed, blank
 * lines are dropped, and the remaining lines are joined with a blank line
 * between them. The result is capped at 10000 characters.
 *
 * Failures are reported through the returned object instead of being thrown:
 * parser errors are logged and surfaced in `error`.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @returns On success: `text`, `pageCount` and a guessed `title`.
 *          On failure: `success: false` with an `error` message.
 */
export async function extractPDFText(buffer: Buffer): Promise<PDFContent> {
  // Upper bound on the returned text length (UTF-16 code units).
  const maxTextLength = 10000;

  try {
    const data = await pdfParse(buffer);
    const rawText: string = data.text ?? '';

    // Clean up text: remove extra whitespace, drop blank lines, join lines.
    const cleanText = rawText
      .split('\n')
      .map((line: string) => line.trim())
      .filter((line: string) => line.length > 0)
      .join('\n\n');

    // Check emptiness AFTER cleaning: a raw value made only of whitespace or
    // newlines is truthy, and would otherwise be reported as a success with
    // an empty `text`.
    if (cleanText.length === 0) {
      return {
        success: false,
        error: 'No text content found in PDF',
      };
    }

    // If text is too long, truncate to the first `maxTextLength` characters.
    const finalText =
      cleanText.length > maxTextLength
        ? cleanText.substring(0, maxTextLength)
        : cleanText;

    return {
      success: true,
      text: finalText,
      pageCount: data.numpages,
      // The title is derived from the full cleaned text, not the truncated one.
      title: extractTitle(cleanText),
    };
  } catch (error) {
    console.error('[pdf parser] extraction failed:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to parse PDF',
    };
  }
}
/**
 * Read a PDF from disk and extract its text (Node.js only).
 *
 * The `fs` module is loaded lazily with a dynamic import. Read failures are
 * logged and returned as a failed result rather than thrown.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The result of extractPDFText for the file's bytes, or a failed
 *          result carrying the read error's message.
 */
export async function extractPDFFromFile(filePath: string): Promise<PDFContent> {
  try {
    const { promises: fsPromises } = await import('fs');
    const pdfBytes = await fsPromises.readFile(filePath);
    return extractPDFText(pdfBytes);
  } catch (error) {
    console.error('[pdf parser] file read failed:', error);
    const message = error instanceof Error ? error.message : 'Failed to read PDF file';
    return { success: false, error: message };
  }
}
/**
 * Guess a document title from extracted PDF text.
 *
 * Returns the first line that, once trimmed, is longer than 5 and shorter
 * than 200 characters and does not start with "http". Falls back to
 * 'PDF Document' when no line qualifies.
 */
function extractTitle(text: string): string {
  const candidate = text
    .split('\n')
    .map((rawLine) => rawLine.trim())
    .find(
      (entry) =>
        entry.length > 5 && entry.length < 200 && !entry.startsWith('http')
    );

  // A match always has more than 5 characters, so only `undefined` falls through.
  return candidate ?? 'PDF Document';
}
/**
 * Split text into chunks of whitespace-separated words.
 *
 * Words are re-joined with single spaces, so the original whitespace is not
 * preserved. A chunk is closed as soon as its word count reaches `maxWords`;
 * any leftover words form a final, shorter chunk.
 *
 * @param text - Text to split; empty or whitespace-only input yields no chunks.
 * @param maxWords - Word count at which a chunk is closed (default 500).
 * @returns The chunks and how many there are.
 */
export function chunkPDFText(
  text: string,
  maxWords: number = 500
): { chunks: string[]; count: number } {
  if (!text || text.trim() === '') {
    return { chunks: [], count: 0 };
  }

  const tokens = text.split(/\s+/).filter(Boolean);
  const chunks: string[] = [];

  // `windowStart` marks the first token of the chunk currently being filled.
  let windowStart = 0;
  tokens.forEach((_token, position) => {
    const wordsInWindow = position - windowStart + 1;
    if (wordsInWindow >= maxWords) {
      chunks.push(tokens.slice(windowStart, position + 1).join(' '));
      windowStart = position + 1;
    }
  });

  // Flush whatever is left after the last full chunk.
  if (windowStart < tokens.length) {
    chunks.push(tokens.slice(windowStart).join(' '));
  }

  return { chunks, count: chunks.length };
}