// arena-learning / studyArena / lib / pdf / pdf-providers.ts
// Author: Nitish kumar
// Uploaded via huggingface_hub (commit c20f20c, verified)
/**
* PDF Parsing Provider Implementation
*
* Factory pattern for routing PDF parsing requests to appropriate provider implementations.
* Follows the same architecture as lib/ai/providers.ts for consistency.
*
* Currently Supported Providers:
* - unpdf: Built-in Node.js PDF parser with text and image extraction
* - MinerU: Advanced commercial service with OCR, formula, and table extraction
* (https://mineru.ai or self-hosted)
*
* HOW TO ADD A NEW PROVIDER:
*
* 1. Add provider ID to PDFProviderId in lib/pdf/types.ts
* Example: | 'tesseract-ocr'
*
* 2. Add provider configuration to lib/pdf/constants.ts
* Example:
* 'tesseract-ocr': {
* id: 'tesseract-ocr',
* name: 'Tesseract OCR',
* requiresApiKey: false,
* icon: '/tesseract.svg',
* features: ['text', 'images', 'ocr']
* }
*
* 3. Implement provider function in this file
* Pattern: async function parseWithXxx(config, pdfBuffer): Promise<ParsedPdfContent>
* - Accept PDF as Buffer
* - Extract text, images, tables, formulas as needed
* - Return unified format:
* {
* text: string, // Markdown or plain text
* images: string[], // Base64 data URLs
* metadata: {
* pageCount: number,
* parser: string,
* ... // Provider-specific metadata
* }
* }
*
* Example:
* async function parseWithTesseractOCR(
* config: PDFParserConfig,
* pdfBuffer: Buffer
* ): Promise<ParsedPdfContent> {
* const { createWorker } = await import('tesseract.js');
*
* // Convert PDF pages to images
* const pdf = await getDocumentProxy(new Uint8Array(pdfBuffer));
* const numPages = pdf.numPages;
*
* const texts: string[] = [];
* const images: string[] = [];
*
* for (let pageNum = 1; pageNum <= numPages; pageNum++) {
* // Render page to canvas/image
* const page = await pdf.getPage(pageNum);
* const viewport = page.getViewport({ scale: 2.0 });
* const canvas = createCanvas(viewport.width, viewport.height);
* const context = canvas.getContext('2d');
* await page.render({ canvasContext: context, viewport }).promise;
*
* // OCR the image
* const worker = await createWorker('eng+chi_sim');
* const { data: { text } } = await worker.recognize(canvas.toBuffer());
* texts.push(text);
* await worker.terminate();
*
* // Save image
* images.push(canvas.toDataURL());
* }
*
* return {
* text: texts.join('\n\n'),
* images,
* metadata: {
* pageCount: numPages,
* parser: 'tesseract-ocr',
* },
* };
* }
*
* 4. Add case to parsePDF() switch statement
* case 'tesseract-ocr':
* result = await parseWithTesseractOCR(config, pdfBuffer);
* break;
*
* 5. Add i18n translations in lib/i18n.ts
* providerTesseractOCR: { zh: 'Tesseract OCR', en: 'Tesseract OCR' }
*
* 6. Update features in constants.ts to reflect parser capabilities
* features: ['text', 'images', 'ocr'] // OCR-capable
*
* Provider Implementation Patterns:
*
* Pattern 1: Local Node.js Parser (like unpdf)
* - Import parsing library
* - Process Buffer directly
* - Extract text and images synchronously or asynchronously
* - Convert images to base64 data URLs
* - Return immediately
*
* Pattern 2: Remote API (like MinerU)
* - Upload PDF or provide URL
* - Create task and get task ID
* - Poll for completion (with timeout)
* - Download results (text, images, metadata)
* - Parse and convert to unified format
*
* Pattern 3: OCR-based Parser (Tesseract, Google Vision)
* - Render PDF pages to images
* - Send images to OCR service
* - Collect text from all pages
* - Combine with layout analysis if available
* - Return combined text and original images
*
* Image Extraction Best Practices:
* - Always convert to base64 data URLs (data:image/png;base64,...)
* - Use PNG for lossless quality
* - Use sharp for efficient image processing
* - Handle errors per image (don't fail entire parsing)
* - Log extraction failures but continue processing
*
* Metadata Recommendations:
* - pageCount: Number of pages in PDF
* - parser: Provider ID for debugging
* - processingTime: Time taken (auto-added)
* - taskId/jobId: For async providers (useful for troubleshooting)
* - Custom fields: imageMapping, pdfImages, tables, formulas, etc.
*
* Error Handling:
* - Validate API key if requiresApiKey is true
* - Throw descriptive errors for missing configuration
* - For async providers, handle timeout and polling errors
* - Log warnings for non-critical failures (e.g., single page errors)
* - Always include provider name in error messages
*/
import { extractText, getDocumentProxy, extractImages } from 'unpdf';
import sharp from 'sharp';
import type { PDFParserConfig } from './types';
import type { ParsedPdfContent } from '@/lib/types/pdf';
import { PDF_PROVIDERS } from './constants';
import { createLogger } from '@/lib/logger';
const log = createLogger('PDFProviders');
/**
* Parse PDF using specified provider
*/
/**
 * Parse a PDF with the provider selected in `config`.
 *
 * Validates the provider id and its API-key requirement, dispatches to the
 * matching implementation, then stamps the elapsed wall-clock time onto the
 * result metadata.
 *
 * @param config - Provider selection plus optional apiKey/baseUrl.
 * @param pdfBuffer - Raw PDF bytes.
 * @returns Unified parsed content (text, images, metadata).
 * @throws Error if the provider is unknown or a required API key is missing.
 */
export async function parsePDF(
  config: PDFParserConfig,
  pdfBuffer: Buffer,
): Promise<ParsedPdfContent> {
  const providerInfo = PDF_PROVIDERS[config.providerId];
  if (!providerInfo) {
    throw new Error(`Unknown PDF provider: ${config.providerId}`);
  }

  // Fail fast before doing any work if the provider needs credentials.
  if (providerInfo.requiresApiKey && !config.apiKey) {
    throw new Error(`API key required for PDF provider: ${config.providerId}`);
  }

  const startedAt = Date.now();
  let parsed: ParsedPdfContent;

  switch (config.providerId) {
    case 'unpdf': {
      parsed = await parseWithUnpdf(pdfBuffer);
      break;
    }
    case 'mineru': {
      parsed = await parseWithMinerU(config, pdfBuffer);
      break;
    }
    default:
      throw new Error(`Unsupported PDF provider: ${config.providerId}`);
  }

  // Record how long the provider took (useful when debugging slow parses).
  if (parsed.metadata) {
    parsed.metadata.processingTime = Date.now() - startedAt;
  }
  return parsed;
}
/**
* Parse PDF using unpdf (existing implementation)
*/
async function parseWithUnpdf(pdfBuffer: Buffer): Promise<ParsedPdfContent> {
const uint8Array = new Uint8Array(pdfBuffer);
const pdf = await getDocumentProxy(uint8Array);
const numPages = pdf.numPages;
// Extract text using the document proxy
const { text: pdfText } = await extractText(pdf, {
mergePages: true,
});
// Extract images using the same document proxy
const images: string[] = [];
const pdfImagesMeta: Array<{
id: string;
src: string;
pageNumber: number;
width: number;
height: number;
}> = [];
let imageCounter = 0;
for (let pageNum = 1; pageNum <= numPages; pageNum++) {
try {
const pageImages = await extractImages(pdf, pageNum);
for (let i = 0; i < pageImages.length; i++) {
const imgData = pageImages[i];
try {
// Use sharp to convert raw image data to PNG base64
const pngBuffer = await sharp(Buffer.from(imgData.data), {
raw: {
width: imgData.width,
height: imgData.height,
channels: imgData.channels,
},
})
.png()
.toBuffer();
// Convert to base64
const base64 = `data:image/png;base64,${pngBuffer.toString('base64')}`;
imageCounter++;
const imgId = `img_${imageCounter}`;
images.push(base64);
pdfImagesMeta.push({
id: imgId,
src: base64,
pageNumber: pageNum,
width: imgData.width,
height: imgData.height,
});
} catch (sharpError) {
log.error(`Failed to convert image ${i + 1} from page ${pageNum}:`, sharpError);
}
}
} catch (pageError) {
log.error(`Failed to extract images from page ${pageNum}:`, pageError);
}
}
return {
text: pdfText,
images,
metadata: {
pageCount: numPages,
parser: 'unpdf',
imageMapping: Object.fromEntries(pdfImagesMeta.map((m) => [m.id, m.src])),
pdfImages: pdfImagesMeta,
},
};
}
/**
* Parse PDF using self-hosted MinerU service (mineru-api)
*
* Official MinerU API endpoint:
* POST /file_parse (multipart/form-data)
*
* Response format:
* { results: { "document.pdf": { md_content, images, content_list, ... } } }
*
* @see https://github.com/opendatalab/MinerU
*/
/**
 * Parse PDF using self-hosted MinerU service (mineru-api)
 *
 * Official MinerU API endpoint:
 *   POST /file_parse  (multipart/form-data)
 *
 * Response format:
 *   { results: { "document.pdf": { md_content, images, content_list, ... } } }
 *
 * @param config - Must contain `baseUrl`; `apiKey` (if set) is sent as a Bearer token.
 * @param pdfBuffer - Raw PDF bytes to upload.
 * @returns Unified parsed content built from the server's file result.
 * @throws Error if baseUrl is missing, the HTTP call fails, or no results come back.
 * @see https://github.com/opendatalab/MinerU
 */
async function parseWithMinerU(
  config: PDFParserConfig,
  pdfBuffer: Buffer,
): Promise<ParsedPdfContent> {
  if (!config.baseUrl) {
    throw new Error(
      'MinerU base URL is required. ' +
        'Please deploy MinerU locally or specify the server URL. ' +
        'See: https://github.com/opendatalab/MinerU',
    );
  }
  // Normalize trailing slashes so "http://host/" and "http://host" both
  // produce ".../file_parse" rather than a double-slash URL.
  const baseUrl = config.baseUrl.replace(/\/+$/, '');
  log.info('[MinerU] Parsing PDF with MinerU server:', config.baseUrl);

  const fileName = 'document.pdf';

  // Create FormData for file upload.
  const formData = new FormData();
  // Slice the backing ArrayBuffer down to this Buffer's own view: Node
  // Buffers may share a larger pooled ArrayBuffer, so using
  // `pdfBuffer.buffer` directly could upload unrelated bytes.
  const arrayBuffer = pdfBuffer.buffer.slice(
    pdfBuffer.byteOffset,
    pdfBuffer.byteOffset + pdfBuffer.byteLength,
  );
  const blob = new Blob([arrayBuffer as ArrayBuffer], {
    type: 'application/pdf',
  });
  formData.append('files', blob, fileName);

  // MinerU API form fields.
  // Server defaults already include: return_md=true, formula_enable=true, table_enable=true
  formData.append('parse_method', 'auto');
  // hybrid-auto-engine: best accuracy, uses VLM for layout understanding (requires GPU)
  // pipeline: basic mode, no VLM, faster but lower quality image extraction
  formData.append('backend', 'hybrid-auto-engine');
  formData.append('return_content_list', 'true');
  formData.append('return_images', 'true');

  // Optional auth — only some deployments require it.
  const headers: Record<string, string> = {};
  if (config.apiKey) {
    headers['Authorization'] = `Bearer ${config.apiKey}`;
  }

  // POST /file_parse
  const response = await fetch(`${baseUrl}/file_parse`, {
    method: 'POST',
    headers,
    body: formData,
  });
  if (!response.ok) {
    const errorText = await response.text().catch(() => response.statusText);
    throw new Error(`MinerU API error (${response.status}): ${errorText}`);
  }

  const json = await response.json();
  // Response: { results: { "<fileName>": { md_content, images, content_list, ... } } }
  const fileResult = json.results?.[fileName];
  if (!fileResult) {
    const keys = json.results ? Object.keys(json.results) : [];
    // Try first available key in case filename doesn't match exactly.
    const fallback = keys.length > 0 ? json.results[keys[0]] : null;
    if (!fallback) {
      throw new Error(`MinerU returned no results. Response keys: ${JSON.stringify(keys)}`);
    }
    log.warn(`[MinerU] Filename mismatch, using key "${keys[0]}" instead of "${fileName}"`);
    return extractMinerUResult(fallback);
  }
  return extractMinerUResult(fileResult);
}
/** Extract ParsedPdfContent from a single MinerU file result */
function extractMinerUResult(fileResult: Record<string, unknown>): ParsedPdfContent {
const markdown: string = (fileResult.md_content as string) || '';
const imageData: Record<string, string> = {};
let pageCount = 0;
// Extract images from the images object (key → base64 string)
if (fileResult.images && typeof fileResult.images === 'object') {
Object.entries(fileResult.images as Record<string, string>).forEach(([key, value]) => {
imageData[key] = value.startsWith('data:') ? value : `data:image/png;base64,${value}`;
});
}
// Parse content_list to build image metadata lookup (img_path → metadata)
const imageMetaLookup = new Map<string, { pageIdx: number; bbox: number[]; caption?: string }>();
const contentList =
typeof fileResult.content_list === 'string'
? JSON.parse(fileResult.content_list as string)
: fileResult.content_list;
if (Array.isArray(contentList)) {
const pages = new Set(
contentList
.map((item: Record<string, unknown>) => item.page_idx)
.filter((v: unknown) => v != null),
);
pageCount = pages.size;
for (const item of contentList) {
if (item.type === 'image' && item.img_path) {
const metaEntry = {
pageIdx: item.page_idx ?? 0,
bbox: item.bbox || [0, 0, 1000, 1000],
caption: Array.isArray(item.image_caption) ? item.image_caption[0] : undefined,
};
// Store under both the full path and basename so lookup works
// regardless of whether images dict uses "abc.jpg" or "images/abc.jpg"
imageMetaLookup.set(item.img_path, metaEntry);
const basename = item.img_path.split('/').pop();
if (basename && basename !== item.img_path) {
imageMetaLookup.set(basename, metaEntry);
}
}
}
}
// Build image mapping and pdfImages array
const imageMapping: Record<string, string> = {};
const pdfImages: Array<{
id: string;
src: string;
pageNumber: number;
description?: string;
width?: number;
height?: number;
}> = [];
Object.entries(imageData).forEach(([key, base64Url], index) => {
const imageId = key.startsWith('img_') ? key : `img_${index + 1}`;
imageMapping[imageId] = base64Url;
// Try exact key first, then with 'images/' prefix (MinerU content_list uses prefixed paths)
const meta = imageMetaLookup.get(key) || imageMetaLookup.get(`images/${key}`);
pdfImages.push({
id: imageId,
src: base64Url,
pageNumber: meta ? meta.pageIdx + 1 : 0,
description: meta?.caption,
width: meta ? meta.bbox[2] - meta.bbox[0] : undefined,
height: meta ? meta.bbox[3] - meta.bbox[1] : undefined,
});
});
const images = Object.values(imageMapping);
log.info(
`[MinerU] Parsed successfully: ${images.length} images, ` +
`${markdown.length} chars of markdown`,
);
return {
text: markdown,
images,
metadata: {
pageCount,
parser: 'mineru',
imageMapping,
pdfImages,
},
};
}
/**
* Get current PDF parser configuration from settings store
* Note: This function should only be called in browser context
*/
/**
 * Read the active PDF parser configuration from the settings store.
 *
 * Browser-only: the settings store lives in client state, so calling this
 * on the server throws immediately.
 *
 * @returns The selected provider id plus any stored apiKey/baseUrl for it.
 */
export async function getCurrentPDFConfig(): Promise<PDFParserConfig> {
  if (typeof window === 'undefined') {
    throw new Error('getCurrentPDFConfig() can only be called in browser context');
  }
  // Imported lazily to avoid a circular dependency with the settings store.
  const { useSettingsStore } = await import('@/lib/store/settings');
  const state = useSettingsStore.getState();
  const perProvider = state.pdfProvidersConfig?.[state.pdfProviderId];
  return {
    providerId: state.pdfProviderId,
    apiKey: perProvider?.apiKey,
    baseUrl: perProvider?.baseUrl,
  };
}
// Re-export from constants for convenience
export { getAllPDFProviders, getPDFProvider } from './constants';