pythonprincess committed on
Commit
4a96fcc
·
verified ·
1 Parent(s): 8bfd955

Upload 2 files

Browse files
models/layoutlm/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # LayoutLM Document Processing Model Package
2
+
models/layoutlm/layoutlm_utils.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/layoutlm/layoutlm_utils.py
2
+
3
+ """
4
+ LayoutLM Model Utilities for PENNY Project
5
+ Handles document structure extraction and field recognition for civic forms and documents.
6
+ Provides async document processing with structured error handling and logging.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ from typing import Dict, Any, Optional, List
12
+ from io import BytesIO
13
+
14
+ # --- Logging Imports ---
15
+ from app.logging_utils import log_interaction, sanitize_for_logging
16
+
17
+ # --- Model Loader Import ---
18
# The model loader is an optional dependency: record whether it could be
# imported so the rest of the module can degrade gracefully instead of
# failing at import time.
try:
    from app.model_loader import load_model_pipeline
except ImportError:
    import logging

    MODEL_LOADER_AVAILABLE = False
    logging.getLogger(__name__).warning("Could not import load_model_pipeline. LayoutLM service unavailable.")
else:
    MODEL_LOADER_AVAILABLE = True
25
+
26
# Name handed to load_model_pipeline() to select the model to load.
AGENT_NAME: str = "penny-doc-agent"

# Shared module state: the loaded pipeline (None until a load succeeds) and a
# flag recording that a load attempt has already been made.
INITIALIZATION_ATTEMPTED: bool = False
LAYOUTLM_PIPELINE: Optional[Any] = None
30
+
31
+
32
def _initialize_layoutlm_pipeline() -> bool:
    """
    Load the LayoutLM pipeline into ``LAYOUTLM_PIPELINE``, attempting it only once.

    The first call marks ``INITIALIZATION_ATTEMPTED`` and tries to load the
    model named by ``AGENT_NAME``. Every later call skips the load and simply
    reports whether a pipeline is currently stored, so a failed first attempt
    is never retried. Each outcome is recorded through ``log_interaction``.

    Returns:
        bool: True if a pipeline is loaded, False otherwise.
    """
    global LAYOUTLM_PIPELINE, INITIALIZATION_ATTEMPTED

    intent = "layoutlm_initialization"

    # A previous attempt (successful or not) settles the answer.
    if INITIALIZATION_ATTEMPTED:
        return LAYOUTLM_PIPELINE is not None
    INITIALIZATION_ATTEMPTED = True

    # Without the loader there is nothing to call.
    if not MODEL_LOADER_AVAILABLE:
        log_interaction(intent=intent, success=False, error="model_loader unavailable")
        return False

    try:
        log_interaction(intent=intent, success=None, details=f"Loading {AGENT_NAME}")
        LAYOUTLM_PIPELINE = load_model_pipeline(AGENT_NAME)

        loaded = LAYOUTLM_PIPELINE is not None
        if loaded:
            log_interaction(
                intent=intent,
                success=True,
                details=f"Model {AGENT_NAME} loaded successfully"
            )
        else:
            log_interaction(intent=intent, success=False, error="Pipeline returned None")
        return loaded
    except Exception as exc:
        # Any loader failure is logged and reported as "not available".
        log_interaction(intent=intent, success=False, error=str(exc))
        return False
85
+
86
+
87
# Try to load the pipeline once at import time. The outcome is cached in
# LAYOUTLM_PIPELINE / INITIALIZATION_ATTEMPTED, so repeat calls do not reload.
_initialize_layoutlm_pipeline()
89
+
90
+
91
def is_layoutlm_available() -> bool:
    """
    Report whether a LayoutLM pipeline is currently loaded.

    Returns:
        bool: True when ``LAYOUTLM_PIPELINE`` holds a pipeline, False when it is None.
    """
    pipeline_loaded = LAYOUTLM_PIPELINE is not None
    return pipeline_loaded
99
+
100
+
101
_MAX_FILE_SIZE_MB = 50  # Uploads larger than this are rejected before inference.
_SLOW_PROCESSING_MS = 10_000  # Inference slower than this gets an extra log entry.


def _collect_labeled_fields(results: List[Any]) -> Dict[str, str]:
    """Collapse model output items into a ``{label: text}`` mapping.

    Items that are not dicts, that lack a ``label`` or ``text`` key, or whose
    text is empty after stripping are skipped. Labels are lower-cased and
    stripped; when a label repeats, the last non-empty text wins.
    """
    fields: Dict[str, str] = {}
    for item in results:
        if not isinstance(item, dict) or 'label' not in item or 'text' not in item:
            continue
        # Coerce the label the same way as the text so a single non-string
        # label cannot abort the whole extraction with an AttributeError.
        label_key = str(item['label']).lower().strip()
        text_value = str(item['text']).strip()
        if text_value:
            fields[label_key] = text_value
    return fields


async def extract_document_data(
    file_bytes: bytes,
    file_name: str,
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Processes a document (e.g., PDF, image) using LayoutLM to extract structured data.

    NOTE: the document is not decoded yet. A placeholder dict holding the file
    name and byte size is passed to the pipeline instead of a real image.

    Args:
        file_bytes: The raw bytes of the uploaded file.
        file_name: The original name of the file (e.g., form.pdf).
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - status (str): "success" or "error"
            - available (bool): False when the pipeline is not loaded or when
              processing raised an unexpected exception; True otherwise
            - extracted_fields (dict): On success, lower-cased label -> text
            - fields_count (int): On success, number of extracted fields
            - message (str): On error, a user-facing explanation
            - error (str): Only when an unexpected exception was raised
            - response_time_ms (int): On success and on unexpected exceptions

    Raises:
        asyncio.CancelledError: Re-raised after logging if the task is cancelled.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Check availability
    if not is_layoutlm_available():
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="LayoutLM pipeline not available",
            fallback_used=True
        )
        return {
            "status": "error",
            "available": False,
            "message": "Document processing is temporarily unavailable. Please try uploading your document again in a moment!"
        }

    # Validate inputs
    if not file_bytes or not isinstance(file_bytes, bytes):
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Invalid file_bytes provided"
        )
        return {
            "status": "error",
            "available": True,
            "message": "I didn't receive valid document data. Could you try uploading your file again?"
        }

    if not file_name or not isinstance(file_name, str):
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Invalid file_name provided"
        )
        return {
            "status": "error",
            "available": True,
            "message": "I need a valid file name to process your document. Please try again!"
        }

    # Check file size (prevent processing extremely large files)
    file_size_mb = len(file_bytes) / (1024 * 1024)
    if file_size_mb > _MAX_FILE_SIZE_MB:
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error=f"File too large: {file_size_mb:.2f}MB",
            file_name=sanitize_for_logging(file_name)
        )
        return {
            "status": "error",
            "available": True,
            "message": f"Your file is too large ({file_size_mb:.1f}MB). Please upload a document smaller than {_MAX_FILE_SIZE_MB}MB."
        }

    try:
        # --- Real-world step (PLACEHOLDER) ---
        # In a real implementation, you would:
        # 1. Use a library (e.g., PyMuPDF, pdf2image) to convert PDF bytes to image(s).
        # 2. Use PIL/Pillow to load the image(s) from bytes.
        # 3. Pass the PIL Image object to the LayoutLM pipeline.

        # For now, we use a simple mock placeholder for the image object:
        image_mock = {
            "file_name": file_name,
            "byte_size": len(file_bytes)
        }

        # Inside a coroutine the running loop is the one to use.
        loop = asyncio.get_running_loop()

        # Run model inference in the default thread executor so the event
        # loop is not blocked while the pipeline call is in progress.
        pipeline = LAYOUTLM_PIPELINE
        results = await loop.run_in_executor(None, pipeline, image_mock)

        response_time_ms = int((time.perf_counter() - start_time) * 1000)

        # Validate results
        if not results or not isinstance(results, list):
            log_interaction(
                intent="layoutlm_extract",
                tenant_id=tenant_id,
                success=False,
                error="Unexpected model output format",
                response_time_ms=response_time_ms,
                file_name=sanitize_for_logging(file_name)
            )
            return {
                "status": "error",
                "available": True,
                "message": "I had trouble understanding the document structure. The file might be corrupted or in an unsupported format."
            }

        # Convert model output (list of dicts) into a clean key-value format
        extracted_data = _collect_labeled_fields(results)

        # Log slow processing
        if response_time_ms > _SLOW_PROCESSING_MS:
            log_interaction(
                intent="layoutlm_extract_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow document processing detected",
                file_name=sanitize_for_logging(file_name)
            )

        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            file_name=sanitize_for_logging(file_name),
            fields_extracted=len(extracted_data)
        )

        return {
            "status": "success",
            "extracted_fields": extracted_data,
            "available": True,
            "response_time_ms": response_time_ms,
            "fields_count": len(extracted_data)
        }

    except asyncio.CancelledError:
        # Listed before the generic handler so cancellation is always
        # propagated rather than turned into an error response.
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Processing cancelled",
            file_name=sanitize_for_logging(file_name)
        )
        raise

    except Exception as e:
        response_time_ms = int((time.perf_counter() - start_time) * 1000)

        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            file_name=sanitize_for_logging(file_name),
            fallback_used=True
        )

        return {
            "status": "error",
            "available": False,
            "message": "I encountered an issue while processing your document. Please try again, or contact support if this continues!",
            "error": str(e),
            "response_time_ms": response_time_ms
        }
292
+
293
+
294
async def validate_document_fields(
    extracted_fields: Dict[str, str],
    required_fields: List[str],
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Validates that required fields were successfully extracted from a document.

    Field names are compared case-insensitively and with surrounding
    whitespace ignored. Only the presence of a key is checked, not its value.
    The returned lists preserve the order and original spelling of
    ``required_fields``.

    Args:
        extracted_fields: Dictionary of extracted field names and values.
        required_fields: List of field names that must be present.
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - valid (bool): Whether all required fields are present
            - missing_fields (list): List of missing required fields
            - present_fields (list): List of found required fields

        If ``extracted_fields`` is not a dict, the result is invalid and every
        required field is reported missing. If ``required_fields`` is not a
        list, the result is invalid with both lists empty.
    """
    if not isinstance(extracted_fields, dict):
        log_interaction(
            intent="layoutlm_validate",
            tenant_id=tenant_id,
            success=False,
            error="Invalid extracted_fields type"
        )
        return {
            "valid": False,
            # Always hand back a fresh list: never the caller's own object,
            # and never a non-list value when required_fields is invalid too.
            "missing_fields": list(required_fields) if isinstance(required_fields, list) else [],
            "present_fields": []
        }

    if not isinstance(required_fields, list):
        log_interaction(
            intent="layoutlm_validate",
            tenant_id=tenant_id,
            success=False,
            error="Invalid required_fields type"
        )
        return {
            "valid": False,
            "missing_fields": [],
            "present_fields": []
        }

    # Normalize field names for case-insensitive comparison. str() keeps a
    # stray non-string name from raising AttributeError on .lower().
    extracted_keys = {str(key).lower().strip() for key in extracted_fields}

    # Single pass: each required field lands in exactly one of the two lists.
    present_fields: List[str] = []
    missing_fields: List[str] = []
    for field in required_fields:
        if str(field).lower().strip() in extracted_keys:
            present_fields.append(field)
        else:
            missing_fields.append(field)

    is_valid = not missing_fields

    log_interaction(
        intent="layoutlm_validate",
        tenant_id=tenant_id,
        success=is_valid,
        details=f"Validated {len(present_fields)}/{len(required_fields)} required fields"
    )

    return {
        "valid": is_valid,
        "missing_fields": missing_fields,
        "present_fields": present_fields
    }