omgy committed on
Commit
9fae783
·
verified ·
1 Parent(s): 430da8b

Update ocr_helper.py

Browse files
Files changed (1) hide show
  1. ocr_helper.py +46 -33
ocr_helper.py CHANGED
@@ -2,7 +2,6 @@ import io
2
  import json
3
  import logging
4
  import os
5
- import glob
6
  from typing import List, Dict
7
  from PIL import Image, ImageOps
8
  import pytesseract
@@ -19,24 +18,33 @@ print(f"===== Application Startup at {__import__('datetime').datetime.now()} ===
19
  print(f"FVCORE_CACHE: {os.environ.get('FVCORE_CACHE')}")
20
  print(f"TORCH_HOME: {os.environ.get('TORCH_HOME')}")
21
 
22
- # Clean up any files with ?dl=1 in the name BEFORE importing layoutparser
23
- def cleanup_query_params():
24
- """Remove ?dl=1 from filenames in cache directory"""
25
- cache_dir = os.environ.get('FVCORE_CACHE', '/root/.torch/iopath_cache')
26
- if os.path.exists(cache_dir):
27
- for root, dirs, files in os.walk(cache_dir):
28
- for filename in files:
29
- if '?dl=' in filename:
30
- old_path = os.path.join(root, filename)
31
- new_filename = filename.split('?')[0]
32
- new_path = os.path.join(root, new_filename)
33
- try:
34
- os.rename(old_path, new_path)
35
- logger.info(f"Renamed: {filename} -> {new_filename}")
36
- except Exception as e:
37
- logger.warning(f"Failed to rename {filename}: {e}")
 
 
 
 
 
 
 
 
 
38
 
39
- cleanup_query_params()
40
 
41
  # ────────────────────────────────────────────────
42
  # Try importing layoutparser + Detectron2
@@ -48,6 +56,24 @@ try:
48
  import layoutparser as lp
49
  logger.info("Layoutparser imported successfully")
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  try:
52
  logger.info("Loading Detectron2 PubLayNet model...")
53
  # Load Detectron2 model (PubLayNet)
@@ -60,21 +86,8 @@ try:
60
  logger.info("✅ Detectron2 LayoutParser model loaded successfully.")
61
  except Exception as e:
62
  logger.warning(f"⚠️ Failed to load Detectron2 model: {e}")
63
- # Try to clean up files and retry once
64
- cleanup_query_params()
65
- try:
66
- logger.info("Retrying model load after cleanup...")
67
- TABLE_MODEL = lp.Detectron2LayoutModel(
68
- "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
69
- extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
70
- label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
71
- )
72
- USE_LAYOUT = True
73
- logger.info("✅ Detectron2 model loaded successfully after cleanup.")
74
- except Exception as e2:
75
- logger.warning(f"⚠️ Failed to load model after retry: {e2}")
76
- TABLE_MODEL = None
77
- USE_LAYOUT = False
78
  except Exception as e:
79
  logger.warning(f"⚠️ LayoutParser not available ({e}). Using OCR only.")
80
  lp = None
 
2
  import json
3
  import logging
4
  import os
 
5
  from typing import List, Dict
6
  from PIL import Image, ImageOps
7
  import pytesseract
 
18
  print(f"FVCORE_CACHE: {os.environ.get('FVCORE_CACHE')}")
19
  print(f"TORCH_HOME: {os.environ.get('TORCH_HOME')}")
20
 
21
+ # Monkey-patch to handle query parameters in filenames
22
+ def patch_file_loading():
23
+ """Patch torch.load and os.path.isfile to handle ?dl=1 in filenames"""
24
+ import torch
25
+ _original_load = torch.load
26
+ _original_isfile = os.path.isfile
27
+
28
+ def patched_load(f, *args, **kwargs):
29
+ if isinstance(f, str) and '?dl=' in f:
30
+ clean_path = f.split('?')[0]
31
+ if os.path.exists(clean_path):
32
+ logger.info(f"Using cleaned path: {clean_path}")
33
+ return _original_load(clean_path, *args, **kwargs)
34
+ return _original_load(f, *args, **kwargs)
35
+
36
+ def patched_isfile(path):
37
+ if isinstance(path, str) and '?dl=' in path:
38
+ clean_path = path.split('?')[0]
39
+ if _original_isfile(clean_path):
40
+ return True
41
+ return _original_isfile(path)
42
+
43
+ torch.load = patched_load
44
+ os.path.isfile = patched_isfile
45
+ logger.info("✓ File loading patched to handle query parameters")
46
 
47
+ patch_file_loading()
48
 
49
  # ────────────────────────────────────────────────
50
  # Try importing layoutparser + Detectron2
 
56
  import layoutparser as lp
57
  logger.info("Layoutparser imported successfully")
58
 
59
+ # Also need to patch fvcore's checkpoint loader
60
+ try:
61
+ from fvcore.common.checkpoint import Checkpointer
62
+ _original_load_file = Checkpointer._load_file
63
+
64
+ def patched_load_file(self, f):
65
+ if isinstance(f, str) and '?dl=' in f:
66
+ clean_path = f.split('?')[0]
67
+ if os.path.exists(clean_path):
68
+ logger.info(f"Checkpointer using cleaned path: {clean_path}")
69
+ return _original_load_file(self, clean_path)
70
+ return _original_load_file(self, f)
71
+
72
+ Checkpointer._load_file = patched_load_file
73
+ logger.info("✓ Checkpointer patched")
74
+ except Exception as e:
75
+ logger.warning(f"Could not patch Checkpointer: {e}")
76
+
77
  try:
78
  logger.info("Loading Detectron2 PubLayNet model...")
79
  # Load Detectron2 model (PubLayNet)
 
86
  logger.info("✅ Detectron2 LayoutParser model loaded successfully.")
87
  except Exception as e:
88
  logger.warning(f"⚠️ Failed to load Detectron2 model: {e}")
89
+ TABLE_MODEL = None
90
+ USE_LAYOUT = False
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  except Exception as e:
92
  logger.warning(f"⚠️ LayoutParser not available ({e}). Using OCR only.")
93
  lp = None