Spaces:
No application file
No application file
| /** | |
| * PDF Parsing Provider Implementation | |
| * | |
| * Factory pattern for routing PDF parsing requests to appropriate provider implementations. | |
| * Follows the same architecture as lib/ai/providers.ts for consistency. | |
| * | |
| * Currently Supported Providers: | |
| * - unpdf: Built-in Node.js PDF parser with text and image extraction | |
| * - MinerU: Advanced commercial service with OCR, formula, and table extraction | |
| * (https://mineru.ai or self-hosted; the implementation in this file requires config.baseUrl to point at a mineru-api server) | |
| * | |
| * HOW TO ADD A NEW PROVIDER: | |
| * | |
| * 1. Add provider ID to PDFProviderId in lib/pdf/types.ts | |
| * Example: | 'tesseract-ocr' | |
| * | |
| * 2. Add provider configuration to lib/pdf/constants.ts | |
| * Example: | |
| * 'tesseract-ocr': { | |
| * id: 'tesseract-ocr', | |
| * name: 'Tesseract OCR', | |
| * requiresApiKey: false, | |
| * icon: '/tesseract.svg', | |
| * features: ['text', 'images', 'ocr'] | |
| * } | |
| * | |
| * 3. Implement provider function in this file | |
| * Pattern: async function parseWithXxx(config, pdfBuffer): Promise<ParsedPdfContent> | |
| * - Accept PDF as Buffer | |
| * - Extract text, images, tables, formulas as needed | |
| * - Return unified format: | |
| * { | |
| * text: string, // Markdown or plain text | |
| * images: string[], // Base64 data URLs | |
| * metadata: { | |
| * pageCount: number, | |
| * parser: string, | |
| * ... // Provider-specific metadata | |
| * } | |
| * } | |
| * | |
| * Example: | |
| * async function parseWithTesseractOCR( | |
| * config: PDFParserConfig, | |
| * pdfBuffer: Buffer | |
| * ): Promise<ParsedPdfContent> { | |
| * const { createWorker } = await import('tesseract.js'); | |
| * | |
| * // Convert PDF pages to images | |
| * const pdf = await getDocumentProxy(new Uint8Array(pdfBuffer)); | |
| * const numPages = pdf.numPages; | |
| * | |
| * const texts: string[] = []; | |
| * const images: string[] = []; | |
| * | |
| * for (let pageNum = 1; pageNum <= numPages; pageNum++) { | |
| * // Render page to canvas/image | |
| * const page = await pdf.getPage(pageNum); | |
| * const viewport = page.getViewport({ scale: 2.0 }); | |
| * const canvas = createCanvas(viewport.width, viewport.height); | |
| * const context = canvas.getContext('2d'); | |
| * await page.render({ canvasContext: context, viewport }).promise; | |
| * | |
| * // OCR the image | |
| * const worker = await createWorker('eng+chi_sim'); | |
| * const { data: { text } } = await worker.recognize(canvas.toBuffer()); | |
| * texts.push(text); | |
| * await worker.terminate(); | |
| * | |
| * // Save image | |
| * images.push(canvas.toDataURL()); | |
| * } | |
| * | |
| * return { | |
| * text: texts.join('\n\n'), | |
| * images, | |
| * metadata: { | |
| * pageCount: numPages, | |
| * parser: 'tesseract-ocr', | |
| * }, | |
| * }; | |
| * } | |
| * | |
| * 4. Add case to parsePDF() switch statement | |
| * case 'tesseract-ocr': | |
| * result = await parseWithTesseractOCR(config, pdfBuffer); | |
| * break; | |
| * | |
| * 5. Add i18n translations in lib/i18n.ts | |
| * providerTesseractOCR: { zh: 'Tesseract OCR', en: 'Tesseract OCR' } | |
| * | |
| * 6. Update features in constants.ts to reflect parser capabilities | |
| * features: ['text', 'images', 'ocr'] // OCR-capable | |
| * | |
| * Provider Implementation Patterns: | |
| * | |
| * Pattern 1: Local Node.js Parser (like unpdf) | |
| * - Import parsing library | |
| * - Process Buffer directly | |
| * - Extract text and images synchronously or asynchronously | |
| * - Convert images to base64 data URLs | |
| * - Return immediately | |
| * | |
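| * Pattern 2: Remote API, task-based (note: the MinerU provider in this file does not poll; it sends a single POST to /file_parse and reads the result from that response) | |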
| * - Upload PDF or provide URL | |
| * - Create task and get task ID | |
| * - Poll for completion (with timeout) | |
| * - Download results (text, images, metadata) | |
| * - Parse and convert to unified format | |
| * | |
| * Pattern 3: OCR-based Parser (Tesseract, Google Vision) | |
| * - Render PDF pages to images | |
| * - Send images to OCR service | |
| * - Collect text from all pages | |
| * - Combine with layout analysis if available | |
| * - Return combined text and original images | |
| * | |
| * Image Extraction Best Practices: | |
| * - Always convert to base64 data URLs (data:image/png;base64,...) | |
| * - Use PNG for lossless quality | |
| * - Use sharp for efficient image processing | |
| * - Handle errors per image (don't fail entire parsing) | |
| * - Log extraction failures but continue processing | |
| * | |
| * Metadata Recommendations: | |
| * - pageCount: Number of pages in PDF | |
| * - parser: Provider ID for debugging | |
| * - processingTime: Elapsed time in milliseconds (added automatically by parsePDF when the result has metadata) | |
| * - taskId/jobId: For async providers (useful for troubleshooting) | |
| * - Custom fields: imageMapping, pdfImages, tables, formulas, etc. | |
| * | |
| * Error Handling: | |
| * - Validate API key if requiresApiKey is true | |
| * - Throw descriptive errors for missing configuration | |
| * - For async providers, handle timeout and polling errors | |
| * - Log warnings for non-critical failures (e.g., single page errors) | |
| * - Always include provider name in error messages | |
| */ | |
| import { extractText, getDocumentProxy, extractImages } from 'unpdf'; | |
| import sharp from 'sharp'; | |
| import type { PDFParserConfig } from './types'; | |
| import type { ParsedPdfContent } from '@/lib/types/pdf'; | |
| import { PDF_PROVIDERS } from './constants'; | |
| import { createLogger } from '@/lib/logger'; | |
| const log = createLogger('PDFProviders'); | |
/**
 * Parse a PDF with the provider selected in `config`.
 *
 * Looks the provider up in `PDF_PROVIDERS`, verifies that an API key is present
 * when the provider declares `requiresApiKey`, dispatches to the matching
 * provider implementation and, when the result carries a `metadata` object,
 * records the elapsed wall-clock time on it as `processingTime` (milliseconds).
 *
 * @param config - Provider selection plus optional credentials / base URL.
 * @param pdfBuffer - Raw bytes of the PDF document.
 * @returns The provider's result in the unified `ParsedPdfContent` shape.
 * @throws Error when the provider ID is not registered, when a required API
 *   key is missing, when `pdfBuffer` is empty, or when the provider is
 *   registered but has no implementation in this file. Errors raised by the
 *   provider implementation itself propagate unchanged.
 */
export async function parsePDF(
  config: PDFParserConfig,
  pdfBuffer: Buffer,
): Promise<ParsedPdfContent> {
  const provider = PDF_PROVIDERS[config.providerId];
  if (!provider) {
    throw new Error(`Unknown PDF provider: ${config.providerId}`);
  }

  // Validate API key if required
  if (provider.requiresApiKey && !config.apiKey) {
    throw new Error(`API key required for PDF provider: ${config.providerId}`);
  }

  // Fail early with a descriptive message instead of letting the provider
  // choke on zero bytes with a parser- or server-specific error.
  if (!pdfBuffer || pdfBuffer.length === 0) {
    throw new Error(`PDF buffer is empty (provider: ${config.providerId})`);
  }

  const startTime = Date.now();
  let result: ParsedPdfContent;

  switch (config.providerId) {
    case 'unpdf':
      result = await parseWithUnpdf(pdfBuffer);
      break;
    case 'mineru':
      result = await parseWithMinerU(config, pdfBuffer);
      break;
    default:
      // Registered in PDF_PROVIDERS but not wired up in this switch.
      throw new Error(`Unsupported PDF provider: ${config.providerId}`);
  }

  // Add processing time to metadata (only when the provider returned metadata)
  if (result.metadata) {
    result.metadata.processingTime = Date.now() - startTime;
  }

  return result;
}
/**
 * Parse a PDF locally with unpdf.
 *
 * Opens one document proxy and reuses it for both text extraction (all pages
 * merged into a single string) and per-page image extraction. Each extracted
 * image is raw pixel data, which is re-encoded to PNG with sharp and exposed
 * as a base64 data URL.
 *
 * Image failures are best-effort: a page whose images cannot be extracted, or
 * a single image that cannot be converted, is logged and skipped so the rest
 * of the document is still returned. Text extraction failures propagate.
 *
 * @param pdfBuffer - Raw bytes of the PDF document.
 * @returns Text, image data URLs, and metadata (`pageCount`, `parser`,
 *   `imageMapping` from generated image ID to data URL, and `pdfImages` with
 *   per-image page number and pixel dimensions).
 */
async function parseWithUnpdf(pdfBuffer: Buffer): Promise<ParsedPdfContent> {
  const pdf = await getDocumentProxy(new Uint8Array(pdfBuffer));

  try {
    const numPages = pdf.numPages;

    // Extract text using the document proxy
    const { text: pdfText } = await extractText(pdf, {
      mergePages: true,
    });

    // Extract images using the same document proxy
    const images: string[] = [];
    const pdfImagesMeta: Array<{
      id: string;
      src: string;
      pageNumber: number;
      width: number;
      height: number;
    }> = [];

    for (let pageNum = 1; pageNum <= numPages; pageNum++) {
      try {
        const pageImages = await extractImages(pdf, pageNum);

        for (let i = 0; i < pageImages.length; i++) {
          const imgData = pageImages[i];
          try {
            // Use sharp to convert raw image data to PNG
            const pngBuffer = await sharp(Buffer.from(imgData.data), {
              raw: {
                width: imgData.width,
                height: imgData.height,
                channels: imgData.channels,
              },
            })
              .png()
              .toBuffer();

            const base64 = `data:image/png;base64,${pngBuffer.toString('base64')}`;

            // IDs are numbered by successful conversions only, so they stay
            // contiguous (img_1, img_2, ...) even when some images fail.
            const imgId = `img_${images.length + 1}`;
            images.push(base64);
            pdfImagesMeta.push({
              id: imgId,
              src: base64,
              pageNumber: pageNum,
              width: imgData.width,
              height: imgData.height,
            });
          } catch (sharpError) {
            log.error(`Failed to convert image ${i + 1} from page ${pageNum}:`, sharpError);
          }
        }
      } catch (pageError) {
        log.error(`Failed to extract images from page ${pageNum}:`, pageError);
      }
    }

    return {
      text: pdfText,
      images,
      metadata: {
        pageCount: numPages,
        parser: 'unpdf',
        imageMapping: Object.fromEntries(pdfImagesMeta.map((m) => [m.id, m.src])),
        pdfImages: pdfImagesMeta,
      },
    };
  } finally {
    // Release the document's resources; without this every parsed PDF stays
    // alive in a long-running server process. A cleanup failure must not mask
    // the parse result or the original error, so it is only logged.
    try {
      await pdf.destroy();
    } catch (destroyError) {
      log.warn('Failed to release PDF document resources:', destroyError);
    }
  }
}
/**
 * Parse PDF using self-hosted MinerU service (mineru-api)
 *
 * Sends the document in a single multipart request:
 *   POST {baseUrl}/file_parse (multipart/form-data)
 *
 * Expected response format:
 *   { results: { "document.pdf": { md_content, images, content_list, ... } } }
 *
 * If the response has no entry under the uploaded file name, the first entry
 * in `results` is used instead and a warning is logged.
 *
 * @param config - Must carry `baseUrl`; `apiKey`, when set, is sent as a
 *   Bearer token in the Authorization header.
 * @param pdfBuffer - Raw bytes of the PDF document.
 * @returns The converted result produced by `extractMinerUResult`.
 * @throws Error when `baseUrl` is missing or blank, when the server answers
 *   with a non-2xx status, when the body is not JSON, or when the response
 *   contains no usable result entry.
 *
 * @see https://github.com/opendatalab/MinerU
 */
async function parseWithMinerU(
  config: PDFParserConfig,
  pdfBuffer: Buffer,
): Promise<ParsedPdfContent> {
  // Trailing slashes are stripped so "http://host:8000/" does not turn into
  // "http://host:8000//file_parse".
  const baseUrl = config.baseUrl?.trim().replace(/\/+$/, '');
  if (!baseUrl) {
    throw new Error(
      'MinerU base URL is required. ' +
        'Please deploy MinerU locally or specify the server URL. ' +
        'See: https://github.com/opendatalab/MinerU',
    );
  }

  log.info('[MinerU] Parsing PDF with MinerU server:', config.baseUrl);

  const fileName = 'document.pdf';

  // Convert Buffer to Blob. A Buffer may be a view into a larger pooled
  // ArrayBuffer, so copy out exactly the bytes that belong to it.
  const arrayBuffer = pdfBuffer.buffer.slice(
    pdfBuffer.byteOffset,
    pdfBuffer.byteOffset + pdfBuffer.byteLength,
  );
  const blob = new Blob([arrayBuffer as ArrayBuffer], {
    type: 'application/pdf',
  });

  // Create FormData for file upload
  const formData = new FormData();
  formData.append('files', blob, fileName);

  // MinerU API form fields
  // Defaults already: return_md=true, formula_enable=true, table_enable=true
  formData.append('parse_method', 'auto');
  // hybrid-auto-engine: best accuracy, uses VLM for layout understanding (requires GPU)
  // pipeline: basic mode, no VLM, faster but lower quality image extraction
  formData.append('backend', 'hybrid-auto-engine');
  formData.append('return_content_list', 'true');
  formData.append('return_images', 'true');

  // API key (if required by deployment)
  const headers: Record<string, string> = {};
  if (config.apiKey) {
    headers['Authorization'] = `Bearer ${config.apiKey}`;
  }

  // POST /file_parse
  const response = await fetch(`${baseUrl}/file_parse`, {
    method: 'POST',
    headers,
    body: formData,
  });

  if (!response.ok) {
    const errorText = await response.text().catch(() => response.statusText);
    throw new Error(`MinerU API error (${response.status}): ${errorText}`);
  }

  let payload: unknown;
  try {
    payload = await response.json();
  } catch (parseError) {
    const reason = parseError instanceof Error ? parseError.message : String(parseError);
    throw new Error(`MinerU API returned a non-JSON response: ${reason}`);
  }

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null;

  // Response: { results: { "<fileName>": { md_content, images, content_list, ... } } }
  const results: Record<string, unknown> =
    isRecord(payload) && isRecord(payload.results) ? payload.results : {};

  let fileResult = results[fileName];
  if (!isRecord(fileResult)) {
    // Try first available key in case filename doesn't match exactly
    const keys = Object.keys(results);
    const fallbackKey = keys[0];
    const fallback = fallbackKey !== undefined ? results[fallbackKey] : undefined;
    if (!isRecord(fallback)) {
      throw new Error(`MinerU returned no results. Response keys: ${JSON.stringify(keys)}`);
    }
    log.warn(`[MinerU] Filename mismatch, using key "${fallbackKey}" instead of "${fileName}"`);
    fileResult = fallback;
  }

  return extractMinerUResult(fileResult as Record<string, unknown>);
}
/**
 * Extract ParsedPdfContent from a single MinerU file result.
 *
 * Reads three fields of the result entry:
 * - `md_content`: used as the document text (empty string when absent or not a string).
 * - `images`: object mapping image key → base64 payload or ready-made data URL.
 * - `content_list`: array (or JSON-encoded array) of content items; used to
 *   derive the page count and, for items of type `image`, the page index,
 *   bounding box and first caption of each image.
 *
 * Malformed optional data is tolerated rather than fatal: non-string image
 * values are skipped, an unparsable `content_list` only costs the image
 * metadata, and non-object content items are ignored.
 *
 * @param fileResult - One entry of the `results` object of a MinerU response.
 * @returns Unified content with `metadata.pageCount`, `metadata.parser`,
 *   `metadata.imageMapping` (image ID → data URL) and `metadata.pdfImages`.
 *   `pageNumber` is 1-based, or 0 when no metadata was found for an image.
 */
function extractMinerUResult(fileResult: Record<string, unknown>): ParsedPdfContent {
  const markdown = typeof fileResult.md_content === 'string' ? fileResult.md_content : '';

  const basenameOf = (path: string): string => path.split('/').pop() ?? path;

  // Raw base64 payloads carry no type information, so the MIME type is taken
  // from the key's file extension; PNG stays the default for unknown extensions.
  const mimeByExtension = new Map<string, string>([
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['png', 'image/png'],
    ['gif', 'image/gif'],
    ['webp', 'image/webp'],
    ['bmp', 'image/bmp'],
  ]);
  const toDataUrl = (key: string, value: string): string => {
    if (value.startsWith('data:')) {
      return value;
    }
    const dot = key.lastIndexOf('.');
    const extension = dot >= 0 ? key.slice(dot + 1).toLowerCase() : '';
    return `data:${mimeByExtension.get(extension) ?? 'image/png'};base64,${value}`;
  };

  // Extract images from the images object (key → base64 string)
  const imageData: Record<string, string> = {};
  if (fileResult.images && typeof fileResult.images === 'object') {
    for (const [key, value] of Object.entries(fileResult.images as Record<string, unknown>)) {
      if (typeof value !== 'string') {
        log.warn(`[MinerU] Skipping image "${key}": expected a base64 string`);
        continue;
      }
      imageData[key] = toDataUrl(key, value);
    }
  }

  // content_list may arrive as an array or as a JSON-encoded string
  let contentList: unknown = fileResult.content_list;
  if (typeof contentList === 'string') {
    try {
      contentList = JSON.parse(contentList);
    } catch (parseError) {
      // The markdown and images are still usable; only per-image metadata is lost.
      log.warn('[MinerU] Failed to parse content_list; image metadata unavailable:', parseError);
      contentList = undefined;
    }
  }

  // Parse content_list to build image metadata lookup (img_path → metadata)
  const imageMetaLookup = new Map<string, { pageIdx: number; bbox: number[]; caption?: string }>();
  let pageCount = 0;

  if (Array.isArray(contentList)) {
    let maxPageIdx = -1;

    for (const entry of contentList) {
      if (typeof entry !== 'object' || entry === null) {
        continue;
      }
      const item = entry as Record<string, unknown>;

      const rawPageIdx = item.page_idx;
      const pageIdx =
        typeof rawPageIdx === 'number' && Number.isInteger(rawPageIdx) && rawPageIdx >= 0
          ? rawPageIdx
          : undefined;
      if (pageIdx !== undefined && pageIdx > maxPageIdx) {
        maxPageIdx = pageIdx;
      }

      const imgPath = item.img_path;
      if (item.type !== 'image' || typeof imgPath !== 'string' || imgPath === '') {
        continue;
      }

      const rawBbox = item.bbox;
      const bbox =
        Array.isArray(rawBbox) &&
        rawBbox.length >= 4 &&
        rawBbox.every((n) => typeof n === 'number')
          ? (rawBbox as number[])
          : [0, 0, 1000, 1000];

      const firstCaption = Array.isArray(item.image_caption) ? item.image_caption[0] : undefined;

      const metaEntry = {
        pageIdx: pageIdx ?? 0,
        bbox,
        caption: typeof firstCaption === 'string' ? firstCaption : undefined,
      };

      // Store under both the full path and basename so lookup works
      // regardless of whether images dict uses "abc.jpg" or "images/abc.jpg"
      imageMetaLookup.set(imgPath, metaEntry);
      const basename = basenameOf(imgPath);
      if (basename && basename !== imgPath) {
        imageMetaLookup.set(basename, metaEntry);
      }
    }

    // page_idx is zero-based, so the highest index seen + 1 is the page count.
    // Counting distinct indices instead would undercount documents that
    // contain pages without any content items.
    pageCount = maxPageIdx + 1;
  }

  // Build image mapping and pdfImages array
  const imageMapping: Record<string, string> = {};
  const pdfImages: Array<{
    id: string;
    src: string;
    pageNumber: number;
    description?: string;
    width?: number;
    height?: number;
  }> = [];

  Object.entries(imageData).forEach(([key, base64Url], index) => {
    const imageId = key.startsWith('img_') ? key : `img_${index + 1}`;
    imageMapping[imageId] = base64Url;

    // Try exact key first, then with 'images/' prefix (MinerU content_list uses
    // prefixed paths), then the key's own basename for the reverse mismatch.
    const meta =
      imageMetaLookup.get(key) ??
      imageMetaLookup.get(`images/${key}`) ??
      imageMetaLookup.get(basenameOf(key));

    pdfImages.push({
      id: imageId,
      src: base64Url,
      pageNumber: meta ? meta.pageIdx + 1 : 0,
      description: meta?.caption,
      width: meta ? meta.bbox[2] - meta.bbox[0] : undefined,
      height: meta ? meta.bbox[3] - meta.bbox[1] : undefined,
    });
  });

  const images = Object.values(imageMapping);

  log.info(
    `[MinerU] Parsed successfully: ${images.length} images, ` +
      `${markdown.length} chars of markdown`,
  );

  return {
    text: markdown,
    images,
    metadata: {
      pageCount,
      parser: 'mineru',
      imageMapping,
      pdfImages,
    },
  };
}
/**
 * Build the active PDF parser configuration from the settings store.
 *
 * Reads the currently selected provider ID and that provider's stored
 * `apiKey` / `baseUrl` (either may be undefined when nothing is configured).
 *
 * Browser-only: rejects when `window` is not defined.
 *
 * @returns The provider ID together with its optional credentials.
 * @throws Error when called outside a browser context.
 */
export async function getCurrentPDFConfig(): Promise<PDFParserConfig> {
  const inBrowser = typeof window !== 'undefined';
  if (!inBrowser) {
    throw new Error('getCurrentPDFConfig() can only be called in browser context');
  }

  // Loaded lazily to avoid a circular dependency with the settings store
  const settingsModule = await import('@/lib/store/settings');
  const settings = settingsModule.useSettingsStore.getState();

  const activeProviderId = settings.pdfProviderId;
  const storedConfig = settings.pdfProvidersConfig?.[activeProviderId];

  return {
    providerId: activeProviderId,
    apiKey: storedConfig?.apiKey,
    baseUrl: storedConfig?.baseUrl,
  };
}
| // Re-export from constants for convenience | |
| export { getAllPDFProviders, getPDFProvider } from './constants'; | |