|
|
|
|
|
import io |
|
|
from typing import Optional |
|
|
|
|
|
|
|
|
# Optional PDF backends. Each one is probed at import time and its
# availability is recorded in a module-level flag, so code further down can
# pick whichever backend is actually installed. Any exception raised while
# importing (not only ImportError) marks the backend as unavailable.

# Backend 1: PyPDF2 (provides PdfReader).
try:
    from PyPDF2 import PdfReader
except Exception:
    _has_pypdf2 = False
else:
    _has_pypdf2 = True

# Backend 2: pdfminer (its extract_text is aliased to avoid a name clash).
try:
    from pdfminer.high_level import extract_text as pdfminer_extract_text
except Exception:
    _has_pdfminer = False
else:
    _has_pdfminer = True
|
|
|
|
|
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract plain text from an in-memory PDF document.

    Tries the available backends in order and returns the first non-blank
    result: PyPDF2 first (page by page, one newline appended after each
    page's text), then pdfminer. A backend is attempted only when its
    module-level availability flag (``_has_pypdf2`` / ``_has_pdfminer``) is
    set. Any exception a backend raises is logged at DEBUG level and treated
    as "no text", so the next backend still gets a chance.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        The extracted text, or an empty string when the input is empty, no
        backend is available, or no backend yields non-blank text.
    """
    # Function-scope import keeps this block self-contained; the module's
    # top-level imports do not include ``logging``.
    import logging

    logger = logging.getLogger(__name__)

    # Nothing to parse: return early instead of relying on every backend to
    # raise on empty input (the end result, "", is the same).
    if not pdf_bytes:
        return ""

    if _has_pypdf2:
        try:
            reader = PdfReader(io.BytesIO(pdf_bytes))
            # Pages with no extractable text (None or "") are skipped.
            page_texts = (page.extract_text() for page in reader.pages)
            text = "".join(chunk + "\n" for chunk in page_texts if chunk)
            if text.strip():
                return text
        except Exception:
            # Best-effort: record the failure and fall through to pdfminer.
            logger.debug(
                "PyPDF2 text extraction failed; trying next backend",
                exc_info=True,
            )

    if _has_pdfminer:
        try:
            text = pdfminer_extract_text(io.BytesIO(pdf_bytes))
            if text and text.strip():
                return text
        except Exception:
            # Best-effort: record the failure and fall through to "".
            logger.debug("pdfminer text extraction failed", exc_info=True)

    return ""
|
|
|
|
|
|