Spaces:

Raj718
/

mcp-client

Runtime error

App Files Files Community

mcp-client / Projects /LeaseGuard /src /lib /document-processor.ts

Raj718

feat: Complete Task 3 - Document Processing Pipeline with comprehensive TDD implementation

ea8da24 6 months ago

raw

history blame contribute delete

11.6 kB

	import Tesseract from 'tesseract.js';
	import geminiClient from './gemini';
	import redisClient from './redis';
	import { ViolationPattern, getAllViolationPatterns, findViolationPatternById } from './housing-law-database';

	// Module-level cache for the lazily loaded PDF.js library.
	// `pdfjsLib` stays null until DocumentProcessor.extractTextFromPDF performs
	// the dynamic `import('pdfjs-dist')`; it is typed `any` because the module is
	// not statically imported in this file.
	let pdfjsLib: any = null;
	// Set to true only after the import succeeded AND the worker source was
	// configured, so a failed attempt is retried on the next PDF.
	let pdfjsLoaded = false;

	/**
	 * A single lease clause after embedding and violation screening.
	 */
	export interface ProcessedClause {
	  /** Clause identifier; DocumentProcessor builds it as `${leaseId}_${index}`. */
	  id: string;
	  /** Clause text as returned by the clause extractor. */
	  text: string;
	  /** Section label as returned by the clause extractor. */
	  section: string;
	  /** Embedding vector generated for `text`. */
	  vector: number[];
	  metadata: {
	    /** Identifier of the lease this clause belongs to. */
	    leaseId: string;
	    /** True when a violation pattern was matched for this clause. */
	    flagged: boolean;
	    /** Severity of the matched violation pattern; absent when not flagged. */
	    severity?: 'Critical' | 'High' | 'Medium' | 'Low';
	    /** Violation type of the matched pattern; absent when not flagged. */
	    violationType?: string;
	    /** Legal reference of the matched pattern; absent when not flagged. */
	    legalReference?: string;
	    /** Detection confidence; DocumentProcessor writes 0.85 when flagged, otherwise 0.0. */
	    confidence: number;
	  };
	}

	/**
	 * Result of analysing one lease document.
	 */
	export interface LeaseAnalysis {
	  /** Identifier of the analysed lease. */
	  leaseId: string;
	  /** Every clause that was processed successfully. */
	  clauses: ProcessedClause[];
	  /** One entry per flagged clause that carries a violation type. */
	  violations: Array<{
	    /** `id` of the offending ProcessedClause. */
	    clauseId: string;
	    /** Violation type taken from the clause metadata. */
	    type: string;
	    /** DocumentProcessor fills this with the clause text. */
	    description: string;
	    /** Legal reference, or 'Unknown' when the clause metadata has none. */
	    legalReference: string;
	    /** Severity, defaulting to 'Low' when the clause metadata has none. */
	    severity: 'Critical' | 'High' | 'Medium' | 'Low';
	  }>;
	  /** Aggregate counts derived from `clauses` and `violations`. */
	  summary: {
	    totalClauses: number;
	    flaggedClauses: number;
	    criticalViolations: number;
	    highViolations: number;
	    mediumViolations: number;
	    lowViolations: number;
	  };
	}

	/**
	 * Document processing pipeline for LeaseGuard.
	 *
	 * Turns an uploaded PDF or image into a LeaseAnalysis: extracts the raw text
	 * (PDF.js for PDFs, Tesseract.js OCR for images), asks the Gemini client to
	 * split it into clauses, embeds and screens every clause for violations,
	 * stores the clauses in Redis on a best-effort basis and builds a summary.
	 */
	class DocumentProcessor {
	  /** Lifetime, in seconds, of every key written by storeInRedis (30 days). */
	  private static readonly REDIS_TTL_SECONDS = 30 * 24 * 60 * 60;

	  /**
	   * Process an uploaded document (PDF or image).
	   *
	   * @param file - Uploaded file
	   * @param leaseId - Unique lease identifier
	   * @returns Processed lease analysis
	   * @throws Error prefixed with "Failed to process document:" when text
	   *   extraction, clause extraction or violation collection fails. Redis
	   *   storage failures are logged by storeInRedis and do not reject.
	   */
	  async processDocument(file: File, leaseId: string): Promise<LeaseAnalysis> {
	    try {
	      console.log(`Processing document: ${file.name} (${file.size} bytes)`);

	      // Extract text from document
	      const extractedText = await this.extractText(file);

	      // Extract clauses using AI
	      const extractedClauses = await geminiClient.extractClauses(extractedText);

	      // Generate embeddings and analyze clauses
	      const processedClauses = await this.processClauses(extractedClauses, leaseId);

	      // Detect violations
	      const violations = await this.detectViolations(processedClauses);

	      // Store in Redis (best effort; never throws)
	      await this.storeInRedis(processedClauses, leaseId);

	      // Generate summary
	      const summary = this.generateSummary(processedClauses, violations);

	      return {
	        leaseId,
	        clauses: processedClauses,
	        violations,
	        summary
	      };
	    } catch (error) {
	      console.error('Error processing document:', error);
	      throw new Error(`Failed to process document: ${error instanceof Error ? error.message : 'Unknown error'}`);
	    }
	  }

	  /**
	   * Extract text from a PDF or image file, dispatching on the file's MIME type.
	   *
	   * @throws Error when the MIME type is neither `application/pdf` nor `image/*`.
	   */
	  private async extractText(file: File): Promise<string> {
	    const fileType = file.type;

	    if (fileType === 'application/pdf') {
	      return await this.extractTextFromPDF(file);
	    } else if (fileType.startsWith('image/')) {
	      return await this.extractTextFromImage(file);
	    } else {
	      throw new Error('Unsupported file type. Please upload a PDF or image file.');
	    }
	  }

	  /**
	   * Load PDF.js on first use and configure its worker source.
	   *
	   * The module-level `pdfjsLoaded` flag is only set after both steps succeed,
	   * so a failed attempt is retried on the next call.
	   *
	   * @throws Error 'PDF processing is not available in this environment' when
	   *   the dynamic import or the worker configuration fails.
	   */
	  private async loadPdfJs(): Promise<void> {
	    if (pdfjsLoaded) {
	      return;
	    }

	    try {
	      pdfjsLib = await import('pdfjs-dist');
	      // Configure PDF.js worker
	      pdfjsLib.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.js`;
	      pdfjsLoaded = true;
	    } catch (importError) {
	      console.error('Failed to import PDF.js:', importError);
	      throw new Error('PDF processing is not available in this environment');
	    }
	  }

	  /**
	   * Extract text from a PDF using PDF.js.
	   *
	   * Text items of each page are joined with single spaces and pages are
	   * separated by newlines; the result is trimmed.
	   *
	   * @throws Error 'PDF processing is not available in this environment' when
	   *   PDF.js cannot be loaded, or a "Failed to extract text from PDF" error
	   *   when the document itself cannot be read.
	   */
	  private async extractTextFromPDF(file: File): Promise<string> {
	    // Load the library outside the try below: otherwise the specific
	    // "not available" error would be replaced by the generic
	    // "corrupted or password-protected" message.
	    await this.loadPdfJs();

	    try {
	      const arrayBuffer = await file.arrayBuffer();
	      const pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise;

	      const pageTexts: string[] = [];

	      for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
	        const page = await pdf.getPage(pageNum);
	        const textContent = await page.getTextContent();

	        // Items without a `str` contribute an empty string, exactly as
	        // Array.prototype.join does for undefined entries.
	        const pageText = textContent.items
	          .map((item: { str?: string }) => item.str ?? '')
	          .join(' ');

	        pageTexts.push(pageText);
	      }

	      return pageTexts.join('\n').trim();
	    } catch (error) {
	      console.error('Error extracting text from PDF:', error);
	      throw new Error('Failed to extract text from PDF. The file may be corrupted or password-protected.');
	    }
	  }

	  /**
	   * Extract text from an image using Tesseract.js OCR (English model).
	   *
	   * @throws Error with a user-facing message when recognition fails.
	   */
	  private async extractTextFromImage(file: File): Promise<string> {
	    try {
	      const result = await Tesseract.recognize(file, 'eng', {
	        logger: m => console.log(m)
	      });

	      return result.data.text.trim();
	    } catch (error) {
	      console.error('Error extracting text from image:', error);
	      throw new Error('Failed to extract text from image. Please ensure the image is clear and readable.');
	    }
	  }

	  /**
	   * Embed every extracted clause and screen it for violations.
	   *
	   * A clause whose processing throws is logged and skipped, so the returned
	   * array may be shorter than the input. Clause ids are
	   * `${leaseId}_${index}` where index counts the successfully processed
	   * clauses so far.
	   */
	  private async processClauses(
	    extractedClauses: Array<{ text: string; section: string }>,
	    leaseId: string
	  ): Promise<ProcessedClause[]> {
	    const processedClauses: ProcessedClause[] = [];

	    for (const clause of extractedClauses) {
	      try {
	        // Generate embedding
	        const vector = await geminiClient.generateEmbedding(clause.text);

	        // Detect violations, reusing the embedding so it is not computed twice
	        const violation = await this.detectClauseViolation(clause.text, vector);

	        const processedClause: ProcessedClause = {
	          id: `${leaseId}_${processedClauses.length}`,
	          text: clause.text,
	          section: clause.section,
	          vector,
	          metadata: {
	            leaseId,
	            flagged: !!violation,
	            severity: violation?.severity,
	            violationType: violation?.violation_type,
	            legalReference: violation?.legal_violation,
	            confidence: violation ? 0.85 : 0.0
	          }
	        };

	        processedClauses.push(processedClause);
	      } catch (error) {
	        console.error('Error processing clause:', error);
	        // Continue with other clauses
	      }
	    }

	    return processedClauses;
	  }

	  /**
	   * Detect a violation in a single clause.
	   *
	   * Tries every known pattern's `detection_regex` (case-insensitive) first and
	   * returns the first match. If none matches, falls back to a KNN search on
	   * the `clause_idx` Redis index.
	   *
	   * @param clauseText - Clause text to screen
	   * @param clauseEmbedding - Optional precomputed embedding of `clauseText`;
	   *   generated on demand when omitted
	   * @returns The matched pattern, or null when nothing matched or the
	   *   detection itself failed (failures are logged, never thrown)
	   */
	  private async detectClauseViolation(
	    clauseText: string,
	    clauseEmbedding?: number[]
	  ): Promise<ViolationPattern | null> {
	    try {
	      // First, try regex-based detection for speed
	      const violationPatterns = getAllViolationPatterns();

	      for (const pattern of violationPatterns) {
	        let matched = false;
	        try {
	          matched = new RegExp(pattern.detection_regex, 'i').test(clauseText);
	        } catch (regexError) {
	          // One malformed pattern must not disable the remaining patterns
	          // or the vector fallback for this clause.
	          console.error('Invalid detection regex, skipping pattern:', regexError);
	          continue;
	        }
	        if (matched) {
	          return pattern;
	        }
	      }

	      // If no regex match, try vector similarity search
	      const queryVector = clauseEmbedding ?? await geminiClient.generateEmbedding(clauseText);
	      const redis = redisClient.getClient();

	      // Search for similar violation patterns in Redis.
	      // NOTE(review): left as-is pending confirmation against the Redis
	      // wrapper and the `clause_idx` definition: (1) node-redis appears to
	      // return documents as `{ id, value }`, in which case `result.score` and
	      // `result.metadata` below would be undefined; (2) KNN queries with
	      // PARAMS normally require `DIALECT: 2`; (3) the KNN `score` is presumably
	      // a distance (lower = closer), which would make `>= 0.85` select
	      // dissimilar hits.
	      const searchResults = await redis.ft.search('clause_idx',
	        `*=>[KNN 5 @vector $vector AS score]`,
	        {
	          PARAMS: {
	            vector: Buffer.from(Float32Array.from(queryVector).buffer)
	          },
	          RETURN: ['text', 'metadata', 'score'],
	          SORTBY: 'score'
	        }
	      );

	      // Check if any violation patterns have high similarity
	      for (const result of searchResults.documents) {
	        const score = parseFloat(result.score as string);
	        if (score >= 0.85) {
	          const metadata = result.metadata as any;
	          if (metadata?.violationType) {
	            return findViolationPatternById(metadata.violationType);
	          }
	        }
	      }

	      return null;
	    } catch (error) {
	      console.error('Error detecting clause violation:', error);
	      return null;
	    }
	  }

	  /**
	   * Collect a violation entry for every flagged clause that has a violation type.
	   *
	   * Missing legal references default to 'Unknown' and missing severities to 'Low'.
	   */
	  private async detectViolations(clauses: ProcessedClause[]): Promise<LeaseAnalysis['violations']> {
	    const violations: LeaseAnalysis['violations'] = [];

	    for (const clause of clauses) {
	      if (clause.metadata.flagged && clause.metadata.violationType) {
	        violations.push({
	          clauseId: clause.id,
	          type: clause.metadata.violationType,
	          description: clause.text,
	          legalReference: clause.metadata.legalReference || 'Unknown',
	          severity: clause.metadata.severity || 'Low'
	        });
	      }
	    }

	    return violations;
	  }

	  /**
	   * Store processed clauses and the lease metadata in Redis.
	   *
	   * Each clause is written as JSON under `clause:<id>` and the lease metadata
	   * under `lease:<leaseId>`; every key expires after 30 days. Any failure is
	   * logged and swallowed on purpose so storage problems never block the
	   * analysis result.
	   */
	  private async storeInRedis(clauses: ProcessedClause[], leaseId: string): Promise<void> {
	    try {
	      const redis = redisClient.getClient();
	      const ttlSeconds = DocumentProcessor.REDIS_TTL_SECONDS;

	      for (const clause of clauses) {
	        const key = `clause:${clause.id}`;

	        await redis.json.set(key, '$', {
	          text: clause.text,
	          vector: clause.vector,
	          metadata: clause.metadata
	        });

	        // Set expiration for 30 days
	        await redis.expire(key, ttlSeconds);
	      }

	      // Store lease metadata
	      const leaseKey = `lease:${leaseId}`;
	      await redis.json.set(leaseKey, '$', {
	        id: leaseId,
	        processedAt: new Date().toISOString(),
	        clauseCount: clauses.length,
	        flaggedCount: clauses.filter(c => c.metadata.flagged).length
	      });

	      // Expire the lease record together with its clauses so it does not
	      // outlive them indefinitely.
	      await redis.expire(leaseKey, ttlSeconds);

	      console.log(`Stored ${clauses.length} clauses in Redis for lease ${leaseId}`);
	    } catch (error) {
	      console.error('Error storing in Redis:', error);
	      // Don't throw error - Redis storage failure shouldn't block document processing
	      // The analysis results are still valid and can be returned to the user
	      console.warn('Redis storage failed, but document processing completed successfully');
	    }
	  }

	  /**
	   * Generate the analysis summary: clause totals plus a violation count per severity.
	   */
	  private generateSummary(
	    clauses: ProcessedClause[],
	    violations: LeaseAnalysis['violations']
	  ): LeaseAnalysis['summary'] {
	    const flaggedClauses = clauses.filter(c => c.metadata.flagged);

	    return {
	      totalClauses: clauses.length,
	      flaggedClauses: flaggedClauses.length,
	      criticalViolations: violations.filter(v => v.severity === 'Critical').length,
	      highViolations: violations.filter(v => v.severity === 'High').length,
	      mediumViolations: violations.filter(v => v.severity === 'Medium').length,
	      lowViolations: violations.filter(v => v.severity === 'Low').length
	    };
	  }

	  /**
	   * Validate a file before processing.
	   *
	   * @returns `{ valid: true }` when the file is at most 10MB and has one of
	   *   the allowed MIME types; otherwise `{ valid: false, error }` with a
	   *   user-facing message. The size check runs before the type check.
	   */
	  validateFile(file: File): { valid: boolean; error?: string } {
	    const maxSize = 10 * 1024 * 1024; // 10MB
	    const allowedTypes = [
	      'application/pdf',
	      'image/jpeg',
	      'image/jpg',
	      'image/png',
	      'image/tiff',
	      'image/bmp'
	    ];

	    if (file.size > maxSize) {
	      return { valid: false, error: 'File size must be less than 10MB' };
	    }

	    if (!allowedTypes.includes(file.type)) {
	      return { valid: false, error: 'File type not supported. Please upload a PDF or image file.' };
	    }

	    return { valid: true };
	  }

	  /**
	   * Health check for document processing.
	   *
	   * Only reports whether the Tesseract binding is defined; PDF.js is loaded
	   * lazily and is therefore not checked here.
	   */
	  async healthCheck(): Promise<boolean> {
	    try {
	      // Check if Tesseract is available
	      const tesseractAvailable = typeof Tesseract !== 'undefined';

	      // PDF.js will be loaded dynamically when needed
	      return tesseractAvailable;
	    } catch (error) {
	      console.error('Document processor health check failed:', error);
	      return false;
	    }
	  }
	}

	// Singleton instance: the class itself is not exported, so every importer
	// of this module's default export shares this one DocumentProcessor.
	const documentProcessor = new DocumentProcessor();

	export default documentProcessor;