Update services/extract_text.py
Browse files- services/extract_text.py +34 -15
services/extract_text.py
CHANGED
|
@@ -175,7 +175,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
| 175 |
# OCR
|
| 176 |
from paddleocr import PaddleOCR
|
| 177 |
|
| 178 |
-
# Optional
|
| 179 |
try:
|
| 180 |
from doctr.models import ocr_predictor
|
| 181 |
from doctr.io import DocumentFile
|
|
@@ -196,9 +196,13 @@ logger = logging.getLogger(__name__)
|
|
| 196 |
# PaddleOCR
|
| 197 |
ocr = PaddleOCR(use_angle_cls=True, lang='en')
|
| 198 |
|
|
|
|
|
|
|
|
|
|
| 199 |
def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
| 201 |
|
|
|
|
| 202 |
def auto_rotate_image(pil_img):
|
| 203 |
"""Auto-rotate PIL image safely."""
|
| 204 |
if pil_img.mode != "RGB":
|
|
@@ -216,6 +220,7 @@ def auto_rotate_image(pil_img):
|
|
| 216 |
borderMode=cv2.BORDER_REPLICATE)
|
| 217 |
return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
|
| 218 |
|
|
|
|
| 219 |
def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
|
| 220 |
images = []
|
| 221 |
try:
|
|
@@ -237,21 +242,21 @@ def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
|
|
| 237 |
return images
|
| 238 |
|
| 239 |
|
| 240 |
-
# --------------------
|
| 241 |
|
| 242 |
def try_pymupdf_text(doc, start, end):
|
| 243 |
-
"""Try extracting text using PyMuPDF native text extraction"""
|
| 244 |
result = []
|
| 245 |
for i in range(start-1, end):
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
| 250 |
return "\n\n".join(result)
|
| 251 |
|
| 252 |
|
| 253 |
def try_paddleocr(images):
|
| 254 |
-
"""Try OCR using PaddleOCR"""
|
| 255 |
result = []
|
| 256 |
for page_num, img in images:
|
| 257 |
img = auto_rotate_image(img)
|
|
@@ -259,14 +264,14 @@ def try_paddleocr(images):
|
|
| 259 |
try:
|
| 260 |
ocr_result = ocr.ocr(img_np, cls=True)
|
| 261 |
ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
|
| 262 |
-
|
|
|
|
| 263 |
except Exception as e:
|
| 264 |
logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
|
| 265 |
return "\n\n".join(result)
|
| 266 |
|
| 267 |
|
| 268 |
def try_mistralocr(images):
|
| 269 |
-
"""Try OCR using Mistral/Doctr OCR"""
|
| 270 |
if not use_mistral_ocr:
|
| 271 |
return ""
|
| 272 |
result = []
|
|
@@ -274,7 +279,8 @@ def try_mistralocr(images):
|
|
| 274 |
try:
|
| 275 |
doc_img = DocumentFile.from_images(img)
|
| 276 |
ocr_text = mistral_ocr(doc_img).render()
|
| 277 |
-
|
|
|
|
| 278 |
except Exception as e:
|
| 279 |
logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
|
| 280 |
return "\n\n".join(result)
|
|
@@ -297,6 +303,7 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
|
|
| 297 |
end = min(end_page or total_pages, total_pages)
|
| 298 |
images = extract_images_with_fitz(file.name, start, end)
|
| 299 |
|
|
|
|
| 300 |
tasks = {}
|
| 301 |
with ThreadPoolExecutor() as executor:
|
| 302 |
tasks[executor.submit(try_pymupdf_text, doc, start, end)] = "PyMuPDF"
|
|
@@ -310,23 +317,34 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
|
|
| 310 |
try:
|
| 311 |
text = future.result()
|
| 312 |
results[method] = text
|
|
|
|
| 313 |
except Exception as e:
|
| 314 |
logger.error(f"{method} failed: {e}")
|
| 315 |
results[method] = ""
|
| 316 |
|
| 317 |
doc.close()
|
| 318 |
|
| 319 |
-
#
|
| 320 |
-
best_method, best_text = max(
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
|
|
|
| 324 |
elif ext == ".docx":
|
| 325 |
from docx import Document
|
| 326 |
doc = Document(file.name)
|
| 327 |
paras = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 328 |
return clean_text("\n".join(paras))
|
| 329 |
|
|
|
|
| 330 |
elif ext == ".csv":
|
| 331 |
import pandas as pd
|
| 332 |
try:
|
|
@@ -335,6 +353,7 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
|
|
| 335 |
logger.error(f"CSV read error: {e}")
|
| 336 |
return "[CSV Read Error]"
|
| 337 |
|
|
|
|
| 338 |
elif ext in [".xls", ".xlsx"]:
|
| 339 |
import pandas as pd
|
| 340 |
try:
|
|
|
|
| 175 |
# OCR
|
| 176 |
from paddleocr import PaddleOCR
|
| 177 |
|
| 178 |
+
# Optional Doctr OCR
|
| 179 |
try:
|
| 180 |
from doctr.models import ocr_predictor
|
| 181 |
from doctr.io import DocumentFile
|
|
|
|
| 196 |
# PaddleOCR
|
| 197 |
ocr = PaddleOCR(use_angle_cls=True, lang='en')
|
| 198 |
|
| 199 |
+
|
| 200 |
+
# -------------------- Helpers --------------------
|
| 201 |
+
|
| 202 |
def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
| 204 |
|
| 205 |
+
|
| 206 |
def auto_rotate_image(pil_img):
|
| 207 |
"""Auto-rotate PIL image safely."""
|
| 208 |
if pil_img.mode != "RGB":
|
|
|
|
| 220 |
borderMode=cv2.BORDER_REPLICATE)
|
| 221 |
return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
|
| 222 |
|
| 223 |
+
|
| 224 |
def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
|
| 225 |
images = []
|
| 226 |
try:
|
|
|
|
| 242 |
return images
|
| 243 |
|
| 244 |
|
| 245 |
+
# -------------------- Extractors --------------------
|
| 246 |
|
| 247 |
def try_pymupdf_text(doc, start, end):
    """Gather the plain text of pages ``start`` through ``end`` of ``doc``.

    Pages are read at indices ``start - 1`` up to ``end - 1``. Each page
    whose text is not blank contributes one entry made of a ``Page N:``
    header line followed by the whitespace-normalised text. A page that
    raises is logged as a warning and skipped, so one bad page does not
    abort the rest.

    Args:
        doc: Indexable document whose pages expose ``get_text("text")``;
            presumably a PyMuPDF document, judging by the log message.
        start: Number of the first page to read (1-based).
        end: Number of the last page to read (inclusive).

    Returns:
        The per-page entries joined by blank lines, or an empty string
        when no page yielded text.
    """
    pages = []
    for i in range(start - 1, end):
        try:
            text = doc[i].get_text("text")
            if not text.strip():
                # Nothing but whitespace on this page; emit no entry.
                continue
            pages.append(f"Page {i+1}:\n{clean_text(text)}")
        except Exception as e:
            logger.warning(f"PyMuPDF failed on page {i+1}: {e}")
    return "\n\n".join(pages)
|
| 257 |
|
| 258 |
|
| 259 |
def try_paddleocr(images):
|
|
|
|
| 260 |
result = []
|
| 261 |
for page_num, img in images:
|
| 262 |
img = auto_rotate_image(img)
|
|
|
|
| 264 |
try:
|
| 265 |
ocr_result = ocr.ocr(img_np, cls=True)
|
| 266 |
ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
|
| 267 |
+
if ocr_text.strip():
|
| 268 |
+
result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
|
| 269 |
except Exception as e:
|
| 270 |
logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
|
| 271 |
return "\n\n".join(result)
|
| 272 |
|
| 273 |
|
| 274 |
def try_mistralocr(images):
|
|
|
|
| 275 |
if not use_mistral_ocr:
|
| 276 |
return ""
|
| 277 |
result = []
|
|
|
|
| 279 |
try:
|
| 280 |
doc_img = DocumentFile.from_images(img)
|
| 281 |
ocr_text = mistral_ocr(doc_img).render()
|
| 282 |
+
if ocr_text.strip():
|
| 283 |
+
result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
|
| 284 |
except Exception as e:
|
| 285 |
logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
|
| 286 |
return "\n\n".join(result)
|
|
|
|
| 303 |
end = min(end_page or total_pages, total_pages)
|
| 304 |
images = extract_images_with_fitz(file.name, start, end)
|
| 305 |
|
| 306 |
+
# Run all methods in parallel
|
| 307 |
tasks = {}
|
| 308 |
with ThreadPoolExecutor() as executor:
|
| 309 |
tasks[executor.submit(try_pymupdf_text, doc, start, end)] = "PyMuPDF"
|
|
|
|
| 317 |
try:
|
| 318 |
text = future.result()
|
| 319 |
results[method] = text
|
| 320 |
+
logger.info(f"{method} produced {len(text.split())} words")
|
| 321 |
except Exception as e:
|
| 322 |
logger.error(f"{method} failed: {e}")
|
| 323 |
results[method] = ""
|
| 324 |
|
| 325 |
doc.close()
|
| 326 |
|
| 327 |
+
# Selection logic
|
| 328 |
+
best_method, best_text = max(
|
| 329 |
+
results.items(),
|
| 330 |
+
key=lambda kv: len(kv[1].split()) # choose longest by word count
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
logger.info(f"✅ Best extraction chosen: {best_method} "
|
| 334 |
+
f"(words: {len(best_text.split())})")
|
| 335 |
+
|
| 336 |
+
if not best_text.strip():
|
| 337 |
+
return "[No text extracted]"
|
| 338 |
+
return best_text
|
| 339 |
|
| 340 |
+
# DOCX
|
| 341 |
elif ext == ".docx":
|
| 342 |
from docx import Document
|
| 343 |
doc = Document(file.name)
|
| 344 |
paras = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 345 |
return clean_text("\n".join(paras))
|
| 346 |
|
| 347 |
+
# CSV
|
| 348 |
elif ext == ".csv":
|
| 349 |
import pandas as pd
|
| 350 |
try:
|
|
|
|
| 353 |
logger.error(f"CSV read error: {e}")
|
| 354 |
return "[CSV Read Error]"
|
| 355 |
|
| 356 |
+
# Excel
|
| 357 |
elif ext in [".xls", ".xlsx"]:
|
| 358 |
import pandas as pd
|
| 359 |
try:
|