pythonprincess committed on
Commit
0c6ad07
·
verified ·
1 Parent(s): f256d02

Delete layoutlm_utils.py

Browse files
Files changed (1) hide show
  1. layoutlm_utils.py +0 -359
layoutlm_utils.py DELETED
@@ -1,359 +0,0 @@
1
- # models/layoutlm/layoutlm_utils.py
2
-
3
- """
4
- LayoutLM Model Utilities for PENNY Project
5
- Handles document structure extraction and field recognition for civic forms and documents.
6
- Provides async document processing with structured error handling and logging.
7
- """
8
-
9
- import asyncio
10
- import time
11
- from typing import Dict, Any, Optional, List
12
- from io import BytesIO
13
-
14
- # --- Logging Imports ---
15
- from app.logging_utils import log_interaction, sanitize_for_logging
16
-
17
- # --- Model Loader Import ---
18
- try:
19
- from app.model_loader import load_model_pipeline
20
- MODEL_LOADER_AVAILABLE = True
21
- except ImportError:
22
- MODEL_LOADER_AVAILABLE = False
23
- import logging
24
- logging.getLogger(__name__).warning("Could not import load_model_pipeline. LayoutLM service unavailable.")
25
-
26
# Module-level state for the LayoutLM service.
# AGENT_NAME is the identifier handed to load_model_pipeline().
AGENT_NAME = "penny-doc-agent"
# Flipped to True by the first initialization attempt so loading runs once.
INITIALIZATION_ATTEMPTED = False
# Cached pipeline object, re-used across calls; stays None until a load succeeds.
LAYOUTLM_PIPELINE: Optional[Any] = None
30
-
31
-
32
def _initialize_layoutlm_pipeline() -> bool:
    """
    Load the LayoutLM pipeline into the module-level cache, at most once.

    The first call marks initialization as attempted before doing any work,
    so later calls never retry; they only report whether a pipeline is
    currently cached. Every outcome is recorded through log_interaction.

    Returns:
        bool: True when a pipeline is loaded, False when the model loader is
        unavailable, the loader returned None, or any exception was raised.
    """
    global LAYOUTLM_PIPELINE, INITIALIZATION_ATTEMPTED

    intent_name = "layoutlm_initialization"

    # Subsequent calls: just report the cached state.
    if INITIALIZATION_ATTEMPTED:
        already_loaded = LAYOUTLM_PIPELINE is not None
        return already_loaded

    INITIALIZATION_ATTEMPTED = True

    if not MODEL_LOADER_AVAILABLE:
        log_interaction(
            intent=intent_name,
            success=False,
            error="model_loader unavailable"
        )
        return False

    loaded = False
    try:
        log_interaction(
            intent=intent_name,
            success=None,
            details=f"Loading {AGENT_NAME}"
        )

        LAYOUTLM_PIPELINE = load_model_pipeline(AGENT_NAME)
        loaded = LAYOUTLM_PIPELINE is not None

        if loaded:
            log_interaction(
                intent=intent_name,
                success=True,
                details=f"Model {AGENT_NAME} loaded successfully"
            )
        else:
            log_interaction(
                intent=intent_name,
                success=False,
                error="Pipeline returned None"
            )
    except Exception as exc:
        # Any failure inside the load sequence (including a failing log call)
        # is reported and treated as "not initialized".
        log_interaction(
            intent=intent_name,
            success=False,
            error=str(exc)
        )
        loaded = False

    return loaded


# Eagerly try to load the pipeline when this module is imported
_initialize_layoutlm_pipeline()
89
-
90
-
91
def is_layoutlm_available() -> bool:
    """
    Report whether a LayoutLM pipeline is currently cached in this module.

    Returns:
        bool: True if LAYOUTLM_PIPELINE holds a pipeline object, False if it
        is still None.
    """
    pipeline_missing = LAYOUTLM_PIPELINE is None
    return not pipeline_missing
99
-
100
-
101
def _fields_from_results(results: List[Any]) -> Dict[str, str]:
    """Collapse raw pipeline output items into a clean ``{label: text}`` dict."""
    fields: Dict[str, str] = {}
    for item in results:
        if not isinstance(item, dict):
            continue

        label = item.get('label')
        text = item.get('text')
        # A missing/None label or text carries no usable data. Skipping here
        # also prevents str(None) from being stored as the literal "None".
        if label is None or text is None:
            continue

        # Coerce to str so a non-string label (e.g. a numeric class id) does
        # not raise and abort the whole extraction.
        label_key = str(label).lower().strip()
        text_value = str(text).strip()

        # Avoid empty keys and empty values; later duplicates overwrite earlier ones.
        if label_key and text_value:
            fields[label_key] = text_value
    return fields


async def extract_document_data(
    file_bytes: bytes,
    file_name: str,
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Processes a document (e.g., PDF, image) using LayoutLM to extract structured data.

    The pipeline call runs in the default thread executor so the event loop
    is not blocked. NOTE: the input handed to the pipeline is still a
    placeholder dict (file name + byte size), not a rendered page image.

    Args:
        file_bytes: The raw bytes of the uploaded file. Must be a non-empty
            ``bytes`` object no larger than 50 MB.
        file_name: The original name of the file (e.g., form.pdf).
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - status (str): "success" or "error"
            - available (bool): Whether the service was available
            - extracted_fields (dict, on success): Extracted key-value pairs,
              keyed by lower-cased, stripped label
            - fields_count (int, on success): Number of extracted fields
            - message (str, on error): User-facing error message
            - error (str, on unexpected exception): The exception text
            - response_time_ms (int, on success or unexpected exception):
              Processing time in milliseconds

    Raises:
        asyncio.CancelledError: Re-raised after logging if the task is cancelled.
    """
    # Monotonic clock: elapsed-time measurement must not be affected by
    # wall-clock adjustments.
    start_time = time.monotonic()

    max_file_size_mb = 50
    slow_threshold_ms = 10000  # 10 seconds

    # Check availability
    if not is_layoutlm_available():
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="LayoutLM pipeline not available",
            fallback_used=True
        )
        return {
            "status": "error",
            "available": False,
            "message": "Document processing is temporarily unavailable. Please try uploading your document again in a moment!"
        }

    # Validate inputs
    if not file_bytes or not isinstance(file_bytes, bytes):
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Invalid file_bytes provided"
        )
        return {
            "status": "error",
            "available": True,
            "message": "I didn't receive valid document data. Could you try uploading your file again?"
        }

    if not file_name or not isinstance(file_name, str):
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Invalid file_name provided"
        )
        return {
            "status": "error",
            "available": True,
            "message": "I need a valid file name to process your document. Please try again!"
        }

    # Check file size (prevent processing extremely large files)
    file_size_mb = len(file_bytes) / (1024 * 1024)
    if file_size_mb > max_file_size_mb:
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error=f"File too large: {file_size_mb:.2f}MB",
            file_name=sanitize_for_logging(file_name)
        )
        return {
            "status": "error",
            "available": True,
            "message": f"Your file is too large ({file_size_mb:.1f}MB). Please upload a document smaller than 50MB."
        }

    try:
        # --- Real-world step (PLACEHOLDER) ---
        # In a real implementation, you would:
        # 1. Use a library (e.g., PyMuPDF, pdf2image) to convert PDF bytes to image(s).
        # 2. Use PIL/Pillow to load the image(s) from bytes.
        # 3. Pass the PIL Image object to the LayoutLM pipeline.

        # For now, we use a simple mock placeholder for the image object:
        image_mock = {
            "file_name": file_name,
            "byte_size": len(file_bytes)
        }

        # Inside a coroutine the running loop is the one to use.
        loop = asyncio.get_running_loop()

        # Bind the pipeline to a local so the worker thread calls the same
        # object that passed the availability check above.
        pipeline = LAYOUTLM_PIPELINE

        # Run model inference in thread executor
        results = await loop.run_in_executor(None, pipeline, image_mock)

        response_time_ms = int((time.monotonic() - start_time) * 1000)

        # Validate results
        if not results or not isinstance(results, list):
            log_interaction(
                intent="layoutlm_extract",
                tenant_id=tenant_id,
                success=False,
                error="Unexpected model output format",
                response_time_ms=response_time_ms,
                file_name=sanitize_for_logging(file_name)
            )
            return {
                "status": "error",
                "available": True,
                "message": "I had trouble understanding the document structure. The file might be corrupted or in an unsupported format."
            }

        # Convert model output (list of dicts) into a clean key-value format
        extracted_data = _fields_from_results(results)

        # Log slow processing
        if response_time_ms > slow_threshold_ms:
            log_interaction(
                intent="layoutlm_extract_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow document processing detected",
                file_name=sanitize_for_logging(file_name)
            )

        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            file_name=sanitize_for_logging(file_name),
            fields_extracted=len(extracted_data)
        )

        return {
            "status": "success",
            "extracted_fields": extracted_data,
            "available": True,
            "response_time_ms": response_time_ms,
            "fields_count": len(extracted_data)
        }

    except asyncio.CancelledError:
        # Record the cancellation, then let it propagate to the caller.
        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error="Processing cancelled",
            file_name=sanitize_for_logging(file_name)
        )
        raise

    except Exception as e:
        response_time_ms = int((time.monotonic() - start_time) * 1000)

        log_interaction(
            intent="layoutlm_extract",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            file_name=sanitize_for_logging(file_name),
            fallback_used=True
        )

        # NOTE(review): "available" is reported as False here even though the
        # pipeline is still loaded; kept as-is because callers may rely on it.
        return {
            "status": "error",
            "available": False,
            "message": "I encountered an issue while processing your document. Please try again, or contact support if this continues!",
            "error": str(e),
            "response_time_ms": response_time_ms
        }
292
-
293
-
294
async def validate_document_fields(
    extracted_fields: Dict[str, str],
    required_fields: List[str],
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Validates that required fields were successfully extracted from a document.

    Field names are compared case-insensitively and with surrounding
    whitespace ignored. Required names are reported back exactly as the
    caller supplied them, in their original order.

    Args:
        extracted_fields: Dictionary of extracted field names and values.
        required_fields: List of field names that must be present.
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - valid (bool): Whether all required fields are present
            - missing_fields (list): List of missing required fields
            - present_fields (list): List of found required fields

        If ``extracted_fields`` is not a dict, every required field is
        reported as missing. If ``required_fields`` is not a list, the result
        is invalid with both lists empty.
    """
    if not isinstance(extracted_fields, dict):
        log_interaction(
            intent="layoutlm_validate",
            tenant_id=tenant_id,
            success=False,
            error="Invalid extracted_fields type"
        )
        # Return a fresh list (never the caller's own object, and never a
        # non-sequence) so "missing_fields" always matches the documented type.
        if isinstance(required_fields, (list, tuple)):
            all_missing = list(required_fields)
        else:
            all_missing = []
        return {
            "valid": False,
            "missing_fields": all_missing,
            "present_fields": []
        }

    if not isinstance(required_fields, list):
        log_interaction(
            intent="layoutlm_validate",
            tenant_id=tenant_id,
            success=False,
            error="Invalid required_fields type"
        )
        return {
            "valid": False,
            "missing_fields": [],
            "present_fields": []
        }

    # Normalize field names for case-insensitive comparison. Non-string keys
    # cannot match a field name, so they are ignored instead of raising.
    extracted_keys = {
        key.lower().strip() for key in extracted_fields if isinstance(key, str)
    }

    # Single pass: each required name is normalized once and sorted into
    # exactly one of the two result lists.
    present_fields: List[str] = []
    missing_fields: List[str] = []
    for field in required_fields:
        is_present = (
            isinstance(field, str)
            and field.lower().strip() in extracted_keys
        )
        if is_present:
            present_fields.append(field)
        else:
            missing_fields.append(field)

    is_valid = not missing_fields

    log_interaction(
        intent="layoutlm_validate",
        tenant_id=tenant_id,
        success=is_valid,
        details=f"Validated {len(present_fields)}/{len(required_fields)} required fields"
    )

    return {
        "valid": is_valid,
        "missing_fields": missing_fields,
        "present_fields": present_fields
    }