Update services/extract_text.py
Browse files- services/extract_text.py +79 -84
services/extract_text.py
CHANGED
|
@@ -170,12 +170,12 @@ import numpy as np
|
|
| 170 |
from PIL import Image
|
| 171 |
import cv2
|
| 172 |
import re
|
| 173 |
-
|
| 174 |
|
| 175 |
# OCR
|
| 176 |
from paddleocr import PaddleOCR
|
| 177 |
|
| 178 |
-
# Optional
|
| 179 |
try:
|
| 180 |
from doctr.models import ocr_predictor
|
| 181 |
from doctr.io import DocumentFile
|
|
@@ -197,8 +197,7 @@ logger = logging.getLogger(__name__)
|
|
| 197 |
ocr = PaddleOCR(use_angle_cls=True, lang='en')
|
| 198 |
|
| 199 |
|
| 200 |
-
#
|
| 201 |
-
|
| 202 |
def clean_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
| 204 |
|
|
@@ -215,9 +214,9 @@ def auto_rotate_image(pil_img):
|
|
| 215 |
angle = -(90 + angle) if angle < -45 else -angle
|
| 216 |
(h, w) = img_cv.shape[:2]
|
| 217 |
M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
|
| 218 |
-
rotated = cv2.warpAffine(
|
| 219 |
-
|
| 220 |
-
|
| 221 |
return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
|
| 222 |
|
| 223 |
|
|
@@ -242,55 +241,11 @@ def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
|
|
| 242 |
return images
|
| 243 |
|
| 244 |
|
| 245 |
-
#
|
| 246 |
-
|
| 247 |
-
def try_pymupdf_text(doc, start, end):
    """Collect the embedded text layer of pages ``start``..``end`` of *doc*.

    Args:
        doc: Object indexable by zero-based page index whose items expose
            ``get_text("text")``.
        start: First page to read, 1-based.
        end: Last page to read, 1-based and inclusive.

    Returns:
        One ``Page N:`` section per page that has non-blank text, joined by
        blank lines; an empty string when no page yields text.
    """
    sections = []
    for i in range(start - 1, end):
        try:
            raw = doc[i].get_text("text")
            # Pages without any visible text contribute nothing.
            if not raw.strip():
                continue
            sections.append(f"Page {i+1}:\n{clean_text(raw)}")
        except Exception as e:
            # Best effort: a failing page is logged and skipped.
            logger.warning(f"PyMuPDF failed on page {i+1}: {e}")
    return "\n\n".join(sections)
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
def try_paddleocr(images):
    """Run PaddleOCR over rendered page images and return the recognised text.

    Args:
        images: Iterable of ``(page_num, image)`` pairs. Each image is passed
            through ``auto_rotate_image`` and converted with ``np.array``
            before recognition.

    Returns:
        One ``Page N:`` section per page that produced text, joined by blank
        lines; an empty string when no page produced any.
    """
    result = []
    for page_num, img in images:
        try:
            # Rotation and conversion live inside the try so that a single
            # unreadable page is logged and skipped instead of aborting the
            # remaining pages.
            img_np = np.array(auto_rotate_image(img))
            ocr_result = ocr.ocr(img_np, cls=True)
            # The per-image entry can be None when nothing was detected;
            # treat that as "no text" rather than letting iteration raise.
            detections = ocr_result[0] if ocr_result else None
            ocr_text = (
                "\n".join(line[1][0] for line in detections) if detections else ""
            )
            if ocr_text.strip():
                result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
        except Exception as e:
            logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
    return "\n\n".join(result)
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
def try_mistralocr(images):
    """Run the optional secondary OCR predictor over rendered page images.

    Does nothing and returns an empty string when the module-level
    ``use_mistral_ocr`` flag is false.

    Args:
        images: Iterable of ``(page_num, image)`` pairs.
            NOTE(review): images appear to be PIL images (they are also fed
            to ``auto_rotate_image(pil_img)`` elsewhere) - confirm.

    Returns:
        One ``Page N:`` section per page that produced text, joined by blank
        lines; an empty string when no page produced any.
    """
    if not use_mistral_ocr:
        return ""

    import io  # local import: only needed when the optional OCR is enabled

    result = []
    for page_num, img in images:
        try:
            # DocumentFile.from_images takes file paths or raw bytes, not an
            # in-memory image object, so anything that can save itself is
            # encoded to PNG bytes first. Other inputs pass through unchanged.
            if hasattr(img, "save"):
                buffer = io.BytesIO()
                img.save(buffer, format="PNG")
                source = buffer.getvalue()
            else:
                source = img
            doc_img = DocumentFile.from_images(source)
            ocr_text = mistral_ocr(doc_img).render()
            if ocr_text.strip():
                result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
        except Exception as e:
            # Best effort: a failing page is logged and skipped.
            logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
    return "\n\n".join(result)
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
# -------------------- Main Extractor --------------------
|
| 290 |
-
|
| 291 |
def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
|
| 292 |
ext = os.path.splitext(filename or "")[-1].lower()
|
| 293 |
|
|
|
|
| 294 |
if ext == ".pdf":
|
| 295 |
try:
|
| 296 |
doc = fitz.open(file.name)
|
|
@@ -303,65 +258,105 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
|
|
| 303 |
end = min(end_page or total_pages, total_pages)
|
| 304 |
images = extract_images_with_fitz(file.name, start, end)
|
| 305 |
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
try:
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
except Exception as e:
|
| 322 |
-
logger.
|
| 323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
doc.close()
|
| 326 |
|
| 327 |
-
#
|
| 328 |
final_output = []
|
| 329 |
-
for method,
|
| 330 |
-
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
return "\n\n".join(final_output)
|
| 333 |
|
| 334 |
-
# DOCX
|
| 335 |
elif ext == ".docx":
|
| 336 |
from docx import Document
|
| 337 |
doc = Document(file.name)
|
| 338 |
paras = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
elif ext == ".csv":
|
| 343 |
import pandas as pd
|
| 344 |
try:
|
| 345 |
-
|
| 346 |
-
return "===== Method: pandas-csv =====\n" + data
|
| 347 |
except Exception as e:
|
| 348 |
logger.error(f"CSV read error: {e}")
|
| 349 |
return "[CSV Read Error]"
|
| 350 |
|
| 351 |
-
# Excel
|
| 352 |
elif ext in [".xls", ".xlsx"]:
|
| 353 |
import pandas as pd
|
| 354 |
try:
|
| 355 |
xl = pd.ExcelFile(file.name)
|
| 356 |
-
|
| 357 |
-
f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}"
|
| 358 |
-
|
| 359 |
-
])
|
| 360 |
-
return "===== Method: pandas-excel =====\n" + text
|
| 361 |
except Exception as e:
|
| 362 |
logger.error(f"Excel read error: {e}")
|
| 363 |
return "[Excel Read Error]"
|
| 364 |
|
|
|
|
| 365 |
else:
|
| 366 |
return "[Unsupported file type]"
|
| 367 |
-
|
|
|
|
| 170 |
from PIL import Image
|
| 171 |
import cv2
|
| 172 |
import re
|
| 173 |
+
import concurrent.futures
|
| 174 |
|
| 175 |
# OCR
|
| 176 |
from paddleocr import PaddleOCR
|
| 177 |
|
| 178 |
+
# Optional Mistral OCR
|
| 179 |
try:
|
| 180 |
from doctr.models import ocr_predictor
|
| 181 |
from doctr.io import DocumentFile
|
|
|
|
| 197 |
ocr = PaddleOCR(use_angle_cls=True, lang='en')
|
| 198 |
|
| 199 |
|
| 200 |
+
# ========================= Helpers ==============================
|
|
|
|
| 201 |
def clean_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
| 203 |
|
|
|
|
| 214 |
angle = -(90 + angle) if angle < -45 else -angle
|
| 215 |
(h, w) = img_cv.shape[:2]
|
| 216 |
M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
|
| 217 |
+
rotated = cv2.warpAffine(
|
| 218 |
+
img_cv, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
|
| 219 |
+
)
|
| 220 |
return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
|
| 221 |
|
| 222 |
|
|
|
|
| 241 |
return images
|
| 242 |
|
| 243 |
|
| 244 |
+
# ========================= Extraction ==============================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
|
| 246 |
ext = os.path.splitext(filename or "")[-1].lower()
|
| 247 |
|
| 248 |
+
# ---------------- PDF -----------------
|
| 249 |
if ext == ".pdf":
|
| 250 |
try:
|
| 251 |
doc = fitz.open(file.name)
|
|
|
|
| 258 |
end = min(end_page or total_pages, total_pages)
|
| 259 |
images = extract_images_with_fitz(file.name, start, end)
|
| 260 |
|
| 261 |
+
results = {"PyMuPDF": [], "PaddleOCR": [], "MistralOCR": []}
|
| 262 |
+
|
| 263 |
+
def process_page(i):
    """Extract the text of one page with every available method.

    Closure over the enclosing function's ``doc``, ``images`` and ``start``.

    Args:
        i: Zero-based page index into ``doc``. The matching rendered image
            is ``images[i - (start - 1)]``.

    Returns:
        Dict mapping a method name (``"PyMuPDF"``, ``"PaddleOCR"``,
        ``"MistralOCR"``) to a ``Page N:`` section. A method is omitted when
        it failed or produced no usable text.

    NOTE(review): the caller submits this function to a ThreadPoolExecutor;
    confirm that the shared ``doc`` and ``ocr`` objects may be used from
    several threads at once.
    """
    import io  # local import: used to hand image bytes to the optional OCR

    page_num = i + 1
    page_results = {}

    # --- PyMuPDF ---
    pymupdf_text = ""
    try:
        pymupdf_text = clean_text(doc[i].get_text("text"))
    except Exception as e:
        logger.warning(f"PyMuPDF failed on page {page_num}: {e}")
    if len(pymupdf_text.split()) > 5:  # ignore tiny metadata
        page_results["PyMuPDF"] = f"Page {page_num}:\n{pymupdf_text}"

    # --- PaddleOCR ---
    paddle_text = ""
    try:
        img = auto_rotate_image(images[i - (start - 1)][1])
        img_np = np.array(img)
        ocr_result = ocr.ocr(img_np, cls=True)
        # The per-image entry can be None when nothing was detected; treat
        # that as "no text" instead of logging a spurious OCR failure.
        detections = ocr_result[0] if ocr_result else None
        paddle_text = (
            "\n".join(line[1][0] for line in detections) if detections else ""
        )
        paddle_text = clean_text(paddle_text)
    except Exception as e:
        logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
    if paddle_text:
        page_results["PaddleOCR"] = f"Page {page_num}:\n{paddle_text}"

    # --- MistralOCR ---
    mistral_text = ""
    if use_mistral_ocr:
        try:
            page_image = images[i - (start - 1)][1]
            # DocumentFile.from_images takes file paths or raw bytes, not an
            # in-memory image object, so anything that can save itself is
            # encoded to PNG bytes first. Other inputs pass through unchanged.
            if hasattr(page_image, "save"):
                buffer = io.BytesIO()
                page_image.save(buffer, format="PNG")
                source = buffer.getvalue()
            else:
                source = page_image
            doc_img = DocumentFile.from_images(source)
            mistral_text = mistral_ocr(doc_img).render()
            mistral_text = clean_text(mistral_text)
        except Exception as e:
            logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
    if mistral_text:
        page_results["MistralOCR"] = f"Page {page_num}:\n{mistral_text}"

    return page_results
|
| 304 |
+
|
| 305 |
+
# Run in parallel
|
| 306 |
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 307 |
+
futures = [executor.submit(process_page, i) for i in range(start - 1, end)]
|
| 308 |
+
for future in concurrent.futures.as_completed(futures):
|
| 309 |
+
page_results = future.result()
|
| 310 |
+
for method, text in page_results.items():
|
| 311 |
+
results[method].append(text)
|
| 312 |
|
| 313 |
doc.close()
|
| 314 |
|
| 315 |
+
# Build final output (all methods separately)
|
| 316 |
final_output = []
|
| 317 |
+
for method, texts in results.items():
|
| 318 |
+
if texts:
|
| 319 |
+
final_output.append(f"===== Method: {method} =====\n" + "\n\n".join(texts))
|
| 320 |
+
else:
|
| 321 |
+
final_output.append(f"===== Method: {method} =====\n[No text extracted]")
|
| 322 |
|
| 323 |
return "\n\n".join(final_output)
|
| 324 |
|
| 325 |
+
# ---------------- DOCX -----------------
|
| 326 |
elif ext == ".docx":
|
| 327 |
from docx import Document
|
| 328 |
doc = Document(file.name)
|
| 329 |
paras = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 330 |
+
page_texts = []
|
| 331 |
+
page_size = 500
|
| 332 |
+
for i in range(0, len(paras), page_size):
|
| 333 |
+
page_texts.append("\n".join(paras[i:i + page_size]))
|
| 334 |
+
selected_pages = page_texts
|
| 335 |
+
if start_page and end_page:
|
| 336 |
+
selected_pages = page_texts[start_page - 1:end_page]
|
| 337 |
+
return clean_text("\n\n".join(selected_pages))
|
| 338 |
+
|
| 339 |
+
# ---------------- CSV -----------------
|
| 340 |
elif ext == ".csv":
|
| 341 |
import pandas as pd
|
| 342 |
try:
|
| 343 |
+
return pd.read_csv(file.name).to_string(index=False)
|
|
|
|
| 344 |
except Exception as e:
|
| 345 |
logger.error(f"CSV read error: {e}")
|
| 346 |
return "[CSV Read Error]"
|
| 347 |
|
| 348 |
+
# ---------------- Excel -----------------
|
| 349 |
elif ext in [".xls", ".xlsx"]:
|
| 350 |
import pandas as pd
|
| 351 |
try:
|
| 352 |
xl = pd.ExcelFile(file.name)
|
| 353 |
+
return "\n\n".join(
|
| 354 |
+
[f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}" for s in xl.sheet_names]
|
| 355 |
+
)
|
|
|
|
|
|
|
| 356 |
except Exception as e:
|
| 357 |
logger.error(f"Excel read error: {e}")
|
| 358 |
return "[Excel Read Error]"
|
| 359 |
|
| 360 |
+
# ---------------- Others -----------------
|
| 361 |
else:
|
| 362 |
return "[Unsupported file type]"
|
|
|