redhairedshanks1 committed on
Commit
5d43a8b
·
verified ·
1 Parent(s): 352ad92

Update services/extract_text.py

Browse files
Files changed (1) hide show
  1. services/extract_text.py +236 -48
services/extract_text.py CHANGED
@@ -1,3 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import logging
3
  import fitz # PyMuPDF
@@ -5,6 +170,7 @@ import numpy as np
5
  from PIL import Image
6
  import cv2
7
  import re
 
8
 
9
  # OCR
10
  from paddleocr import PaddleOCR
@@ -70,9 +236,54 @@ def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
70
  logger.error(f"Failed to open PDF file: {e}")
71
  return images
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
74
  ext = os.path.splitext(filename or "")[-1].lower()
75
- result = []
76
 
77
  if ext == ".pdf":
78
  try:
@@ -81,63 +292,40 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
81
  logger.error(f"Cannot open PDF {filename}: {e}")
82
  return "[Error opening PDF]"
83
 
84
- images = extract_images_with_fitz(file.name, start_page or 1, end_page)
85
  total_pages = len(doc)
86
  start = max(start_page or 1, 1)
87
  end = min(end_page or total_pages, total_pages)
 
88
 
89
- for i, page in enumerate(doc):
90
- page_num = i + 1
91
- if not (start <= page_num <= end):
92
- continue
93
-
94
- text = page.get_text()
95
- if text.strip():
96
- result.append(f"Page {page_num} (Extracted):\n{clean_text(text)}")
97
- else:
98
- if i < len(images):
99
- try:
100
- img = auto_rotate_image(images[i][1])
101
- img_np = np.array(img)
102
- ocr_text = ""
103
- # PaddleOCR
104
- try:
105
- ocr_result = ocr.ocr(img_np, cls=True)
106
- ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
107
- except Exception as e:
108
- logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
109
-
110
- # Mistral OCR fallback
111
- if not ocr_text and use_mistral_ocr:
112
- try:
113
- doc_img = DocumentFile.from_images(img)
114
- ocr_text = mistral_ocr(doc_img).render()
115
- except Exception as e:
116
- logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
117
- ocr_text = "[OCR Error]"
118
-
119
- result.append(f"Page {page_num} (OCR):\n{clean_text(ocr_text) or '[No OCR Text]'}")
120
- except Exception as e:
121
- logger.error(f"OCR processing failed for page {page_num}: {e}")
122
- result.append(f"Page {page_num}: [OCR Error]")
123
- else:
124
- result.append(f"Page {page_num}: [No text or image]")
125
 
126
  doc.close()
127
- return "\n\n".join(result)
 
 
 
 
128
 
129
  elif ext == ".docx":
130
- from docx.api import Document
131
  doc = Document(file.name)
132
  paras = [p.text for p in doc.paragraphs if p.text.strip()]
133
- page_texts = []
134
- page_size = 500
135
- for i in range(0, len(paras), page_size):
136
- page_texts.append("\n".join(paras[i:i + page_size]))
137
- selected_pages = page_texts
138
- if start_page and end_page:
139
- selected_pages = page_texts[start_page - 1:end_page]
140
- return clean_text("\n\n".join(selected_pages))
141
 
142
  elif ext == ".csv":
143
  import pandas as pd
 
1
+ # import os
2
+ # import logging
3
+ # import fitz # PyMuPDF
4
+ # import numpy as np
5
+ # from PIL import Image
6
+ # import cv2
7
+ # import re
8
+
9
+ # # OCR
10
+ # from paddleocr import PaddleOCR
11
+
12
+ # # Optional Mistral OCR
13
+ # try:
14
+ # from doctr.models import ocr_predictor
15
+ # from doctr.io import DocumentFile
16
+ # mistral_ocr = ocr_predictor(pretrained=True)
17
+ # use_mistral_ocr = True
18
+ # except ImportError:
19
+ # mistral_ocr = None
20
+ # use_mistral_ocr = False
21
+
22
+ # # Environment paths
23
+ # os.environ.setdefault("HOME", "/app")
24
+ # os.environ.setdefault("PADDLEOCR_HOME", "/app/.paddleocr")
25
+
26
+ # # Logging
27
+ # logging.basicConfig(level=logging.INFO)
28
+ # logger = logging.getLogger(__name__)
29
+
30
+ # # PaddleOCR
31
+ # ocr = PaddleOCR(use_angle_cls=True, lang='en')
32
+
33
+ # def clean_text(text):
34
+ # return re.sub(r'\s+', ' ', text).strip()
35
+
36
+ # def auto_rotate_image(pil_img):
37
+ # """Auto-rotate PIL image safely."""
38
+ # if pil_img.mode != "RGB":
39
+ # pil_img = pil_img.convert("RGB")
40
+ # img_cv = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
41
+ # coords = np.column_stack(np.where(img_cv > 0))
42
+ # if coords.size == 0:
43
+ # return pil_img # blank page
44
+ # angle = cv2.minAreaRect(coords)[-1]
45
+ # angle = -(90 + angle) if angle < -45 else -angle
46
+ # (h, w) = img_cv.shape[:2]
47
+ # M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
48
+ # rotated = cv2.warpAffine(img_cv, M, (w, h),
49
+ # flags=cv2.INTER_CUBIC,
50
+ # borderMode=cv2.BORDER_REPLICATE)
51
+ # return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
52
+
53
+ # def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
54
+ # images = []
55
+ # try:
56
+ # doc = fitz.open(pdf_path)
57
+ # total_pages = len(doc)
58
+ # end = min(end_page or total_pages, total_pages)
59
+ # for i in range(start_page - 1, end):
60
+ # try:
61
+ # page = doc[i]
62
+ # pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
63
+ # mode = "RGBA" if pix.alpha else "RGB"
64
+ # img = Image.frombytes(mode, [pix.width, pix.height], pix.samples)
65
+ # images.append((i + 1, img))
66
+ # except Exception as e:
67
+ # logger.error(f"Error rendering page {i + 1}: {e}")
68
+ # doc.close()
69
+ # except Exception as e:
70
+ # logger.error(f"Failed to open PDF file: {e}")
71
+ # return images
72
+
73
+ # def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
74
+ # ext = os.path.splitext(filename or "")[-1].lower()
75
+ # result = []
76
+
77
+ # if ext == ".pdf":
78
+ # try:
79
+ # doc = fitz.open(file.name)
80
+ # except Exception as e:
81
+ # logger.error(f"Cannot open PDF {filename}: {e}")
82
+ # return "[Error opening PDF]"
83
+
84
+ # images = extract_images_with_fitz(file.name, start_page or 1, end_page)
85
+ # total_pages = len(doc)
86
+ # start = max(start_page or 1, 1)
87
+ # end = min(end_page or total_pages, total_pages)
88
+
89
+ # for i, page in enumerate(doc):
90
+ # page_num = i + 1
91
+ # if not (start <= page_num <= end):
92
+ # continue
93
+
94
+ # text = page.get_text()
95
+ # if text.strip():
96
+ # result.append(f"Page {page_num} (Extracted):\n{clean_text(text)}")
97
+ # else:
98
+ # if i < len(images):
99
+ # try:
100
+ # img = auto_rotate_image(images[i][1])
101
+ # img_np = np.array(img)
102
+ # ocr_text = ""
103
+ # # PaddleOCR
104
+ # try:
105
+ # ocr_result = ocr.ocr(img_np, cls=True)
106
+ # ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
107
+ # except Exception as e:
108
+ # logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
109
+
110
+ # # Mistral OCR fallback
111
+ # if not ocr_text and use_mistral_ocr:
112
+ # try:
113
+ # doc_img = DocumentFile.from_images(img)
114
+ # ocr_text = mistral_ocr(doc_img).render()
115
+ # except Exception as e:
116
+ # logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
117
+ # ocr_text = "[OCR Error]"
118
+
119
+ # result.append(f"Page {page_num} (OCR):\n{clean_text(ocr_text) or '[No OCR Text]'}")
120
+ # except Exception as e:
121
+ # logger.error(f"OCR processing failed for page {page_num}: {e}")
122
+ # result.append(f"Page {page_num}: [OCR Error]")
123
+ # else:
124
+ # result.append(f"Page {page_num}: [No text or image]")
125
+
126
+ # doc.close()
127
+ # return "\n\n".join(result)
128
+
129
+ # elif ext == ".docx":
130
+ # from docx.api import Document
131
+ # doc = Document(file.name)
132
+ # paras = [p.text for p in doc.paragraphs if p.text.strip()]
133
+ # page_texts = []
134
+ # page_size = 500
135
+ # for i in range(0, len(paras), page_size):
136
+ # page_texts.append("\n".join(paras[i:i + page_size]))
137
+ # selected_pages = page_texts
138
+ # if start_page and end_page:
139
+ # selected_pages = page_texts[start_page - 1:end_page]
140
+ # return clean_text("\n\n".join(selected_pages))
141
+
142
+ # elif ext == ".csv":
143
+ # import pandas as pd
144
+ # try:
145
+ # return pd.read_csv(file.name).to_string(index=False)
146
+ # except Exception as e:
147
+ # logger.error(f"CSV read error: {e}")
148
+ # return "[CSV Read Error]"
149
+
150
+ # elif ext in [".xls", ".xlsx"]:
151
+ # import pandas as pd
152
+ # try:
153
+ # xl = pd.ExcelFile(file.name)
154
+ # return "\n\n".join([
155
+ # f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}"
156
+ # for s in xl.sheet_names
157
+ # ])
158
+ # except Exception as e:
159
+ # logger.error(f"Excel read error: {e}")
160
+ # return "[Excel Read Error]"
161
+
162
+ # else:
163
+ # return "[Unsupported file type]"
164
+
165
+
166
  import os
167
  import logging
168
  import fitz # PyMuPDF
 
170
  from PIL import Image
171
  import cv2
172
  import re
173
+ from concurrent.futures import ThreadPoolExecutor, as_completed
174
 
175
  # OCR
176
  from paddleocr import PaddleOCR
 
236
  logger.error(f"Failed to open PDF file: {e}")
237
  return images
238
 
239
+
240
+ # -------------------- Parallel Extraction Wrapper --------------------
241
+
242
def try_pymupdf_text(doc, start, end):
    """Collect the native text layer of pages ``start`` through ``end``.

    ``start`` and ``end`` are 1-based and inclusive; ``doc`` is indexed with
    0-based page numbers. Pages whose extracted text is empty or
    whitespace-only are left out of the result.

    Returns the per-page sections joined by blank lines, or "" when no page
    in the range yields text.
    """
    sections = []
    for page_no in range(start, end + 1):
        raw = doc[page_no - 1].get_text()
        if not raw.strip():
            # Nothing usable on this page; do not emit a bare header.
            continue
        sections.append(f"Page {page_no}:\n{clean_text(raw)}")
    return "\n\n".join(sections)
251
+
252
+
253
def try_paddleocr(images):
    """Run PaddleOCR over rendered page images and join the recognized text.

    Args:
        images: iterable of ``(page_num, image)`` pairs; each image is handed
            to ``auto_rotate_image`` before recognition.

    Returns:
        One "Page N:" section per page that produced text, joined by blank
        lines. Returns "" when no page yields text, so a caller comparing
        result lengths is not misled by header-only sections.
    """
    sections = []
    for page_num, img in images:
        try:
            # Rotation lives inside the try so that one unreadable page is
            # logged and skipped instead of aborting every remaining page.
            img_np = np.array(auto_rotate_image(img))
            ocr_result = ocr.ocr(img_np, cls=True)
            # NOTE(review): some PaddleOCR versions return None / [None] for a
            # page with no detected text; treat that as "no lines", not an error.
            lines = ocr_result[0] if ocr_result else None
            ocr_text = "\n".join(line[1][0] for line in (lines or []))
        except Exception as e:
            logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
            continue

        cleaned = clean_text(ocr_text)
        if cleaned:
            # Skip empty pages, matching try_pymupdf_text's behavior.
            sections.append(f"Page {page_num}:\n{cleaned}")
    return "\n\n".join(sections)
266
+
267
+
268
def try_mistralocr(images):
    """Run the optional docTR predictor (named ``mistral_ocr`` in this file).

    Args:
        images: iterable of ``(page_num, image)`` pairs. Each image must
            support ``save(buffer, format="PNG")`` — presumably the PIL images
            produced by the page renderer; confirm against the caller.

    Returns:
        One "Page N:" section per page that produced text, joined by blank
        lines. Returns "" when the optional OCR backend is unavailable or no
        page yields text.
    """
    if not use_mistral_ocr:
        return ""

    import io  # local import: only needed for the in-memory PNG encoding

    sections = []
    for page_num, img in images:
        try:
            # DocumentFile.from_images accepts file paths or raw bytes, not
            # in-memory image objects, so encode the page to PNG bytes first.
            buffer = io.BytesIO()
            img.save(buffer, format="PNG")
            doc_img = DocumentFile.from_images(buffer.getvalue())
            ocr_text = mistral_ocr(doc_img).render()
        except Exception as e:
            logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
            continue

        cleaned = clean_text(ocr_text)
        if cleaned:
            # Skip empty pages so header-only output never looks like text.
            sections.append(f"Page {page_num}:\n{cleaned}")
    return "\n\n".join(sections)
281
+
282
+
283
+ # -------------------- Main Extractor --------------------
284
+
285
  def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
286
  ext = os.path.splitext(filename or "")[-1].lower()
 
287
 
288
  if ext == ".pdf":
289
  try:
 
292
  logger.error(f"Cannot open PDF {filename}: {e}")
293
  return "[Error opening PDF]"
294
 
 
295
  total_pages = len(doc)
296
  start = max(start_page or 1, 1)
297
  end = min(end_page or total_pages, total_pages)
298
+ images = extract_images_with_fitz(file.name, start, end)
299
 
300
+ tasks = {}
301
+ with ThreadPoolExecutor() as executor:
302
+ tasks[executor.submit(try_pymupdf_text, doc, start, end)] = "PyMuPDF"
303
+ tasks[executor.submit(try_paddleocr, images)] = "PaddleOCR"
304
+ if use_mistral_ocr:
305
+ tasks[executor.submit(try_mistralocr, images)] = "MistralOCR"
306
+
307
+ results = {}
308
+ for future in as_completed(tasks):
309
+ method = tasks[future]
310
+ try:
311
+ text = future.result()
312
+ results[method] = text
313
+ except Exception as e:
314
+ logger.error(f"{method} failed: {e}")
315
+ results[method] = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
  doc.close()
318
+
319
+ # Pick the longest text among the methods
320
+ best_method, best_text = max(results.items(), key=lambda kv: len(kv[1].strip()))
321
+ logger.info(f"Best extraction chosen: {best_method} (length {len(best_text)})")
322
+ return best_text or "[No text extracted]"
323
 
324
  elif ext == ".docx":
325
+ from docx import Document
326
  doc = Document(file.name)
327
  paras = [p.text for p in doc.paragraphs if p.text.strip()]
328
+ return clean_text("\n".join(paras))
 
 
 
 
 
 
 
329
 
330
  elif ext == ".csv":
331
  import pandas as pd