// File: mcp-client/Projects/LeaseGuard/src/lib/document-processor.ts
// Last commit by: Raj718
// Commit message: feat: Complete Task 3 - Document Processing Pipeline with comprehensive TDD implementation
// Revision: ea8da24
import Tesseract from 'tesseract.js';
import geminiClient from './gemini';
import redisClient from './redis';
import { ViolationPattern, getAllViolationPatterns, findViolationPatternById } from './housing-law-database';
// PDF.js is not imported at module load; it is pulled in with a dynamic
// import the first time a PDF has to be read.
// `pdfjsLoaded` becomes true only after that import and the worker
// configuration have both succeeded.
let pdfjsLoaded = false;
// Handle to the dynamically imported `pdfjs-dist` module (null until loaded).
let pdfjsLib: any = null;
/**
 * One lease clause after it has been embedded and screened for violations.
 */
export interface ProcessedClause {
  /** Clause identifier; the processor in this file builds it as `<leaseId>_<index>`. */
  id: string;
  /** Clause text as returned by clause extraction. */
  text: string;
  /** Section label returned alongside the clause text. */
  section: string;
  /** Embedding of `text`. */
  vector: number[];
  /** Screening outcome for this clause. */
  metadata: {
    /** Lease the clause belongs to. */
    leaseId: string;
    /** True when a violation pattern was matched for the clause. */
    flagged: boolean;
    /** Severity of the matched pattern; absent when nothing matched. */
    severity?: 'Critical' | 'High' | 'Medium' | 'Low';
    /** Violation type of the matched pattern; absent when nothing matched. */
    violationType?: string;
    /** Legal reference of the matched pattern; absent when nothing matched. */
    legalReference?: string;
    /** Detection confidence; the processor in this file writes 0.85 for a match and 0 otherwise. */
    confidence: number;
  };
}
/**
 * Complete result of analysing one lease document.
 */
export interface LeaseAnalysis {
  /** Identifier of the analysed lease. */
  leaseId: string;
  /** Every clause that was processed successfully. */
  clauses: ProcessedClause[];
  /** One entry per flagged clause that carries a violation type. */
  violations: {
    /** `id` of the offending clause. */
    clauseId: string;
    /** Violation type taken from the clause metadata. */
    type: string;
    /** Description of the violation; the processor in this file uses the clause text. */
    description: string;
    /** Legal reference for the violation. */
    legalReference: string;
    /** Severity bucket of the violation. */
    severity: 'Critical' | 'High' | 'Medium' | 'Low';
  }[];
  /** Aggregate counts over `clauses` and `violations`. */
  summary: {
    /** Number of entries in `clauses`. */
    totalClauses: number;
    /** Number of clauses whose metadata is flagged. */
    flaggedClauses: number;
    /** Number of violations with severity `Critical`. */
    criticalViolations: number;
    /** Number of violations with severity `High`. */
    highViolations: number;
    /** Number of violations with severity `Medium`. */
    mediumViolations: number;
    /** Number of violations with severity `Low`. */
    lowViolations: number;
  };
}
/**
 * Document processing pipeline for LeaseGuard.
 *
 * Takes an uploaded lease (PDF or image), extracts its text (PDF.js or
 * Tesseract OCR), asks Gemini to split the text into clauses, embeds and
 * screens each clause for violations, stores the result in Redis on a
 * best-effort basis, and returns the analysis.
 */
class DocumentProcessor {
  /**
   * Process an uploaded document (PDF or image) end to end.
   *
   * @param file - Uploaded file; its MIME type selects PDF parsing or OCR.
   * @param leaseId - Unique lease identifier, used for clause ids and Redis keys.
   * @returns The processed clauses, the detected violations and a summary.
   * @throws Error with the message prefix `Failed to process document: ` when
   *   text or clause extraction fails. Per-clause failures and Redis storage
   *   failures are logged and do not abort the run.
   */
  async processDocument(file: File, leaseId: string): Promise<LeaseAnalysis> {
    try {
      console.log(`Processing document: ${file.name} (${file.size} bytes)`);

      // Raw text -> clauses -> embedded + screened clauses.
      const extractedText = await this.extractText(file);
      const extractedClauses = await geminiClient.extractClauses(extractedText);
      const processedClauses = await this.processClauses(extractedClauses, leaseId);

      const violations = await this.detectViolations(processedClauses);

      // Best-effort persistence: storeInRedis never rejects.
      await this.storeInRedis(processedClauses, leaseId);

      const summary = this.generateSummary(processedClauses, violations);

      return {
        leaseId,
        clauses: processedClauses,
        violations,
        summary
      };
    } catch (error) {
      console.error('Error processing document:', error);
      throw new Error(`Failed to process document: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Extract text from a PDF or image file, dispatching on the MIME type.
   *
   * @param file - Uploaded file.
   * @returns The extracted text.
   * @throws Error when the MIME type is neither `application/pdf` nor `image/*`.
   */
  private async extractText(file: File): Promise<string> {
    const fileType = file.type;

    if (fileType === 'application/pdf') {
      return await this.extractTextFromPDF(file);
    }
    if (fileType.startsWith('image/')) {
      return await this.extractTextFromImage(file);
    }
    throw new Error('Unsupported file type. Please upload a PDF or image file.');
  }

  /**
   * Load PDF.js on first use and configure its worker.
   *
   * The module handle and the "loaded" flag live at module scope, so the
   * dynamic import is attempted again on a later call if it failed before.
   *
   * @returns The loaded `pdfjs-dist` module.
   * @throws Error('PDF processing is not available in this environment') when
   *   the import or the worker configuration fails.
   */
  private async loadPdfJs(): Promise<any> {
    if (!pdfjsLoaded) {
      try {
        pdfjsLib = await import('pdfjs-dist');
        // Configure PDF.js worker
        pdfjsLib.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.js`;
        pdfjsLoaded = true;
      } catch (importError) {
        console.error('Failed to import PDF.js:', importError);
        throw new Error('PDF processing is not available in this environment');
      }
    }
    return pdfjsLib;
  }

  /**
   * Extract text from a PDF using PDF.js.
   *
   * Text items of a page are joined with single spaces, pages are joined with
   * newlines, and the result is trimmed.
   *
   * @param file - PDF file.
   * @returns The text of all pages.
   * @throws Error when PDF.js cannot be loaded, or when the document cannot be
   *   parsed (for example a corrupted or password-protected file).
   */
  private async extractTextFromPDF(file: File): Promise<string> {
    // Loaded before the try block on purpose: a missing PDF.js must surface its
    // own "not available" message instead of being rewritten by the catch below
    // into "corrupted or password-protected".
    const pdfjs = await this.loadPdfJs();

    let loadingTask: any = null;
    try {
      const arrayBuffer = await file.arrayBuffer();
      loadingTask = pdfjs.getDocument({ data: arrayBuffer });
      const pdf = await loadingTask.promise;

      const pageTexts: string[] = [];
      for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
        const page = await pdf.getPage(pageNum);
        const textContent = await page.getTextContent();
        pageTexts.push(textContent.items.map((item: any) => item.str).join(' '));
      }

      return pageTexts.join('\n').trim();
    } catch (error) {
      console.error('Error extracting text from PDF:', error);
      throw new Error('Failed to extract text from PDF. The file may be corrupted or password-protected.');
    } finally {
      // Release the document/worker held by the loading task on success and on
      // failure alike. Cleanup problems are logged, never thrown, so they
      // cannot mask the extraction result or error.
      if (loadingTask && typeof loadingTask.destroy === 'function') {
        try {
          await loadingTask.destroy();
        } catch (cleanupError) {
          console.warn('Failed to release PDF.js resources:', cleanupError);
        }
      }
    }
  }

  /**
   * Extract text from an image using Tesseract.js OCR (English model).
   *
   * @param file - Image file.
   * @returns The recognised text, trimmed.
   * @throws Error when recognition fails.
   */
  private async extractTextFromImage(file: File): Promise<string> {
    try {
      const result = await Tesseract.recognize(file, 'eng', {
        // Forward Tesseract progress messages to the console.
        logger: m => console.log(m)
      });
      return result.data.text.trim();
    } catch (error) {
      console.error('Error extracting text from image:', error);
      throw new Error('Failed to extract text from image. Please ensure the image is clear and readable.');
    }
  }

  /**
   * Embed each extracted clause and screen it for violations.
   *
   * Clauses are handled one at a time. A clause whose processing throws is
   * logged and skipped, so ids (`<leaseId>_<n>`) count successfully processed
   * clauses only and stay contiguous.
   *
   * @param extractedClauses - Clause text and section pairs.
   * @param leaseId - Lease the clauses belong to.
   * @returns The successfully processed clauses, in input order.
   */
  private async processClauses(
    extractedClauses: Array<{ text: string; section: string }>,
    leaseId: string
  ): Promise<ProcessedClause[]> {
    const processedClauses: ProcessedClause[] = [];

    for (const clause of extractedClauses) {
      try {
        const vector = await geminiClient.generateEmbedding(clause.text);

        // Hand the embedding over so violation detection does not request the
        // same embedding a second time for clauses without a regex match.
        const violation = await this.detectClauseViolation(clause.text, vector);

        const processedClause: ProcessedClause = {
          id: `${leaseId}_${processedClauses.length}`,
          text: clause.text,
          section: clause.section,
          vector,
          metadata: {
            leaseId,
            flagged: !!violation,
            severity: violation?.severity,
            violationType: violation?.violation_type,
            legalReference: violation?.legal_violation,
            confidence: violation ? 0.85 : 0.0
          }
        };
        processedClauses.push(processedClause);
      } catch (error) {
        console.error('Error processing clause:', error);
        // Continue with other clauses
      }
    }

    return processedClauses;
  }

  /**
   * Detect a violation in a single clause.
   *
   * Tries the regex of every known violation pattern first (case-insensitive)
   * and falls back to a vector similarity search in Redis when none matches.
   *
   * @param clauseText - Clause text to screen.
   * @param precomputedEmbedding - Embedding of `clauseText` if the caller
   *   already has it; when omitted the embedding is generated here.
   * @returns The matched pattern, or null when nothing matched or detection
   *   failed (failures are logged, not thrown).
   */
  private async detectClauseViolation(
    clauseText: string,
    precomputedEmbedding?: number[]
  ): Promise<ViolationPattern | null> {
    try {
      // First, try regex-based detection for speed
      for (const pattern of getAllViolationPatterns()) {
        let matcher: RegExp;
        try {
          matcher = new RegExp(pattern.detection_regex, 'i');
        } catch (regexError) {
          // One malformed pattern must not disable detection for the clause.
          console.warn('Skipping violation pattern with invalid detection_regex:', regexError);
          continue;
        }
        if (matcher.test(clauseText)) {
          return pattern;
        }
      }

      // If no regex match, try vector similarity search
      const clauseEmbedding =
        precomputedEmbedding ?? (await geminiClient.generateEmbedding(clauseText));
      const redis = redisClient.getClient();

      // NOTE(review): Redis documents KNN queries with PARAMS as requiring
      // query dialect 2 or later; no DIALECT option is passed here - confirm
      // the server default dialect.
      const searchResults = await redis.ft.search(
        'clause_idx',
        '*=>[KNN 5 @vector $vector AS score]',
        {
          PARAMS: {
            vector: Buffer.from(Float32Array.from(clauseEmbedding).buffer)
          },
          RETURN: ['text', 'metadata', 'score'],
          SORTBY: 'score'
        }
      );

      // NOTE(review): for a KNN query the `score` alias is normally a vector
      // distance (smaller = closer), so `>= 0.85` may select the least similar
      // hits - confirm against the index's distance metric.
      // NOTE(review): this reads `score` / `metadata` directly off each
      // document; node-redis replies usually nest returned fields under
      // `document.value` - verify against the client in use.
      for (const result of searchResults.documents) {
        const score = parseFloat(result.score as string);
        if (score >= 0.85) {
          const metadata = result.metadata as any;
          if (metadata?.violationType) {
            return findViolationPatternById(metadata.violationType);
          }
        }
      }

      return null;
    } catch (error) {
      console.error('Error detecting clause violation:', error);
      return null;
    }
  }

  /**
   * Collect the violations recorded on processed clauses.
   *
   * A clause contributes one entry when it is flagged and has a violation
   * type. A missing legal reference becomes 'Unknown' and a missing severity
   * becomes 'Low'.
   *
   * @param clauses - Processed clauses.
   * @returns One violation entry per qualifying clause, in clause order.
   */
  private async detectViolations(clauses: ProcessedClause[]): Promise<LeaseAnalysis['violations']> {
    const violations: LeaseAnalysis['violations'] = [];

    for (const clause of clauses) {
      if (clause.metadata.flagged && clause.metadata.violationType) {
        violations.push({
          clauseId: clause.id,
          type: clause.metadata.violationType,
          description: clause.text,
          legalReference: clause.metadata.legalReference || 'Unknown',
          severity: clause.metadata.severity || 'Low'
        });
      }
    }

    return violations;
  }

  /**
   * Store processed clauses and a lease record in Redis.
   *
   * Each clause is written as JSON under `clause:<id>` with a 30-day expiry,
   * followed by a `lease:<leaseId>` record. Any failure is logged and
   * swallowed so that storage problems never fail document processing.
   *
   * @param clauses - Processed clauses to persist.
   * @param leaseId - Lease identifier used for the lease record key.
   */
  private async storeInRedis(clauses: ProcessedClause[], leaseId: string): Promise<void> {
    try {
      const redis = redisClient.getClient();

      for (const clause of clauses) {
        const key = `clause:${clause.id}`;
        await redis.json.set(key, '$', {
          text: clause.text,
          vector: clause.vector,
          metadata: clause.metadata
        });
        // Set expiration for 30 days
        await redis.expire(key, 30 * 24 * 60 * 60);
      }

      // Store lease metadata
      // NOTE(review): unlike the clause keys, this key gets no expiry -
      // confirm that lease records are meant to outlive their clauses.
      await redis.json.set(`lease:${leaseId}`, '$', {
        id: leaseId,
        processedAt: new Date().toISOString(),
        clauseCount: clauses.length,
        flaggedCount: clauses.filter(c => c.metadata.flagged).length
      });

      console.log(`Stored ${clauses.length} clauses in Redis for lease ${leaseId}`);
    } catch (error) {
      console.error('Error storing in Redis:', error);
      // Don't throw error - Redis storage failure shouldn't block document processing
      // The analysis results are still valid and can be returned to the user
      console.warn('Redis storage failed, but document processing completed successfully');
    }
  }

  /**
   * Build the summary counts for an analysis.
   *
   * @param clauses - Processed clauses.
   * @param violations - Violations derived from those clauses.
   * @returns Total and flagged clause counts plus one violation count per severity.
   */
  private generateSummary(
    clauses: ProcessedClause[],
    violations: LeaseAnalysis['violations']
  ): LeaseAnalysis['summary'] {
    // Tally severities in a single pass over the violations.
    const bySeverity = { Critical: 0, High: 0, Medium: 0, Low: 0 };
    for (const violation of violations) {
      bySeverity[violation.severity] += 1;
    }

    return {
      totalClauses: clauses.length,
      flaggedClauses: clauses.filter(c => c.metadata.flagged).length,
      criticalViolations: bySeverity.Critical,
      highViolations: bySeverity.High,
      mediumViolations: bySeverity.Medium,
      lowViolations: bySeverity.Low
    };
  }

  /**
   * Validate a file before processing.
   *
   * The size check runs before the type check, so an oversized file of an
   * unsupported type reports the size error.
   *
   * @param file - Candidate upload.
   * @returns `{ valid: true }`, or `{ valid: false, error }` with a user-facing
   *   message when the file exceeds 10MB or its MIME type is not supported.
   */
  validateFile(file: File): { valid: boolean; error?: string } {
    const maxSize = 10 * 1024 * 1024; // 10MB
    const allowedTypes = [
      'application/pdf',
      'image/jpeg',
      'image/jpg',
      'image/png',
      'image/tiff',
      'image/bmp'
    ];

    if (file.size > maxSize) {
      return { valid: false, error: 'File size must be less than 10MB' };
    }

    if (!allowedTypes.includes(file.type)) {
      return { valid: false, error: 'File type not supported. Please upload a PDF or image file.' };
    }

    return { valid: true };
  }

  /**
   * Health check for document processing.
   *
   * Only verifies that the Tesseract binding is defined; PDF.js is loaded on
   * demand and is therefore not probed here.
   *
   * @returns True when Tesseract is available, false otherwise.
   */
  async healthCheck(): Promise<boolean> {
    try {
      const tesseractAvailable = typeof Tesseract !== 'undefined';
      return tesseractAvailable;
    } catch (error) {
      console.error('Document processor health check failed:', error);
      return false;
    }
  }
}
// Single shared processor instance for the whole module; it is the default export.
const documentProcessor = new DocumentProcessor();
export { documentProcessor as default };