import Tesseract from 'tesseract.js'; import geminiClient from './gemini'; import redisClient from './redis'; import { ViolationPattern, getAllViolationPatterns, findViolationPatternById } from './housing-law-database'; // PDF.js will be imported dynamically when needed let pdfjsLib: any = null; let pdfjsLoaded = false; export interface ProcessedClause { id: string; text: string; section: string; vector: number[]; metadata: { leaseId: string; flagged: boolean; severity?: 'Critical' | 'High' | 'Medium' | 'Low'; violationType?: string; legalReference?: string; confidence: number; }; } export interface LeaseAnalysis { leaseId: string; clauses: ProcessedClause[]; violations: Array<{ clauseId: string; type: string; description: string; legalReference: string; severity: 'Critical' | 'High' | 'Medium' | 'Low'; }>; summary: { totalClauses: number; flaggedClauses: number; criticalViolations: number; highViolations: number; mediumViolations: number; lowViolations: number; }; } /** * Document processing pipeline for LeaseGuard * Handles PDF text extraction, OCR, and clause analysis */ class DocumentProcessor { /** * Process uploaded document (PDF or image) * @param file - Uploaded file * @param leaseId - Unique lease identifier * @returns Processed lease analysis */ async processDocument(file: File, leaseId: string): Promise { try { console.log(`Processing document: ${file.name} (${file.size} bytes)`); // Extract text from document const extractedText = await this.extractText(file); // Extract clauses using AI const extractedClauses = await geminiClient.extractClauses(extractedText); // Generate embeddings and analyze clauses const processedClauses = await this.processClauses(extractedClauses, leaseId); // Detect violations const violations = await this.detectViolations(processedClauses); // Store in Redis await this.storeInRedis(processedClauses, leaseId); // Generate summary const summary = this.generateSummary(processedClauses, violations); return { leaseId, clauses: processedClauses, violations, summary }; } catch (error) { console.error('Error processing document:', error); throw new Error(`Failed to process document: ${error instanceof Error ? error.message : 'Unknown error'}`); } } /** * Extract text from PDF or image file */ private async extractText(file: File): Promise { const fileType = file.type; if (fileType === 'application/pdf') { return await this.extractTextFromPDF(file); } else if (fileType.startsWith('image/')) { return await this.extractTextFromImage(file); } else { throw new Error('Unsupported file type. Please upload a PDF or image file.'); } } /** * Extract text from PDF using PDF.js */ private async extractTextFromPDF(file: File): Promise { try { // Dynamically import PDF.js only when needed if (!pdfjsLoaded) { try { pdfjsLib = await import('pdfjs-dist'); // Configure PDF.js worker pdfjsLib.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.js`; pdfjsLoaded = true; } catch (importError) { console.error('Failed to import PDF.js:', importError); throw new Error('PDF processing is not available in this environment'); } } const arrayBuffer = await file.arrayBuffer(); const pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise; let fullText = ''; for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) { const page = await pdf.getPage(pageNum); const textContent = await page.getTextContent(); const pageText = textContent.items .map((item: any) => item.str) .join(' '); fullText += pageText + '\n'; } return fullText.trim(); } catch (error) { console.error('Error extracting text from PDF:', error); throw new Error('Failed to extract text from PDF. The file may be corrupted or password-protected.'); } } /** * Extract text from image using Tesseract.js OCR */ private async extractTextFromImage(file: File): Promise { try { const result = await Tesseract.recognize(file, 'eng', { logger: m => console.log(m) }); return result.data.text.trim(); } catch (error) { console.error('Error extracting text from image:', error); throw new Error('Failed to extract text from image. Please ensure the image is clear and readable.'); } } /** * Process extracted clauses with embeddings and violation detection */ private async processClauses( extractedClauses: Array<{ text: string; section: string }>, leaseId: string ): Promise { const processedClauses: ProcessedClause[] = []; for (const clause of extractedClauses) { try { // Generate embedding const vector = await geminiClient.generateEmbedding(clause.text); // Detect violations const violation = await this.detectClauseViolation(clause.text); const processedClause: ProcessedClause = { id: `${leaseId}_${processedClauses.length}`, text: clause.text, section: clause.section, vector, metadata: { leaseId, flagged: !!violation, severity: violation?.severity, violationType: violation?.violation_type, legalReference: violation?.legal_violation, confidence: violation ? 0.85 : 0.0 } }; processedClauses.push(processedClause); } catch (error) { console.error('Error processing clause:', error); // Continue with other clauses } } return processedClauses; } /** * Detect violations in a single clause */ private async detectClauseViolation(clauseText: string): Promise { try { // First, try regex-based detection for speed const violationPatterns = getAllViolationPatterns(); for (const pattern of violationPatterns) { const regex = new RegExp(pattern.detection_regex, 'i'); if (regex.test(clauseText)) { return pattern; } } // If no regex match, try vector similarity search const clauseEmbedding = await geminiClient.generateEmbedding(clauseText); const redis = redisClient.getClient(); // Search for similar violation patterns in Redis const searchResults = await redis.ft.search('clause_idx', `*=>[KNN 5 @vector $vector AS score]`, { PARAMS: { vector: Buffer.from(Float32Array.from(clauseEmbedding).buffer) }, RETURN: ['text', 'metadata', 'score'], SORTBY: 'score' } ); // Check if any violation patterns have high similarity for (const result of searchResults.documents) { const score = parseFloat(result.score as string); if (score >= 0.85) { const metadata = result.metadata as any; if (metadata?.violationType) { return findViolationPatternById(metadata.violationType); } } } return null; } catch (error) { console.error('Error detecting clause violation:', error); return null; } } /** * Detect all violations in processed clauses */ private async detectViolations(clauses: ProcessedClause[]): Promise { const violations: LeaseAnalysis['violations'] = []; for (const clause of clauses) { if (clause.metadata.flagged && clause.metadata.violationType) { violations.push({ clauseId: clause.id, type: clause.metadata.violationType, description: clause.text, legalReference: clause.metadata.legalReference || 'Unknown', severity: clause.metadata.severity || 'Low' }); } } return violations; } /** * Store processed clauses in Redis */ private async storeInRedis(clauses: ProcessedClause[], leaseId: string): Promise { try { const redis = redisClient.getClient(); for (const clause of clauses) { const key = `clause:${clause.id}`; await redis.json.set(key, '$', { text: clause.text, vector: clause.vector, metadata: clause.metadata }); // Set expiration for 30 days await redis.expire(key, 30 * 24 * 60 * 60); } // Store lease metadata await redis.json.set(`lease:${leaseId}`, '$', { id: leaseId, processedAt: new Date().toISOString(), clauseCount: clauses.length, flaggedCount: clauses.filter(c => c.metadata.flagged).length }); console.log(`Stored ${clauses.length} clauses in Redis for lease ${leaseId}`); } catch (error) { console.error('Error storing in Redis:', error); // Don't throw error - Redis storage failure shouldn't block document processing // The analysis results are still valid and can be returned to the user console.warn('Redis storage failed, but document processing completed successfully'); } } /** * Generate analysis summary */ private generateSummary( clauses: ProcessedClause[], violations: LeaseAnalysis['violations'] ): LeaseAnalysis['summary'] { const flaggedClauses = clauses.filter(c => c.metadata.flagged); return { totalClauses: clauses.length, flaggedClauses: flaggedClauses.length, criticalViolations: violations.filter(v => v.severity === 'Critical').length, highViolations: violations.filter(v => v.severity === 'High').length, mediumViolations: violations.filter(v => v.severity === 'Medium').length, lowViolations: violations.filter(v => v.severity === 'Low').length }; } /** * Validate file before processing */ validateFile(file: File): { valid: boolean; error?: string } { const maxSize = 10 * 1024 * 1024; // 10MB const allowedTypes = [ 'application/pdf', 'image/jpeg', 'image/jpg', 'image/png', 'image/tiff', 'image/bmp' ]; if (file.size > maxSize) { return { valid: false, error: 'File size must be less than 10MB' }; } if (!allowedTypes.includes(file.type)) { return { valid: false, error: 'File type not supported. Please upload a PDF or image file.' }; } return { valid: true }; } /** * Health check for document processing */ async healthCheck(): Promise { try { // Check if Tesseract is available const tesseractAvailable = typeof Tesseract !== 'undefined'; // PDF.js will be loaded dynamically when needed return tesseractAvailable; } catch (error) { console.error('Document processor health check failed:', error); return false; } } } // Singleton instance const documentProcessor = new DocumentProcessor(); export default documentProcessor;