|
|
| from PyPDF2 import PdfReader |
| from docx import Document |
| import zipfile |
| import xml.etree.ElementTree as ET |
| import io |
|
|
def clean_extracted_text(text: str) -> str:
    """
    Normalize and collapse whitespace in extracted text.

    Every run of whitespace (spaces, tabs, line breaks, ...) is replaced by a
    single space, and leading/trailing whitespace is removed. Blank lines
    therefore disappear and the result is always a single line.

    Args:
        text: Raw text as produced by an extractor.

    Returns:
        The text with all whitespace runs collapsed to one space; an empty
        string if the input contains no non-whitespace characters.
    """
    # str.split() with no separator splits on any whitespace run and drops
    # empty pieces, so joining with a single space also collapses repeated
    # spaces/tabs inside a line, not just the line breaks.
    return " ".join(text.split())
|
|
def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """
    Extract text from PDF bytes using PyPDF2.

    The text of each page is passed through ``clean_extracted_text`` and the
    pages are joined with a blank line between them.

    Args:
        pdf_bytes: Raw content of a PDF document.

    Returns:
        The extracted text with surrounding whitespace removed, or an empty
        string if anything goes wrong while reading the document (the error
        is printed, not raised).
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        # extract_text() can return a falsy value for a page; treat it as "".
        page_texts = [
            clean_extracted_text(page.extract_text() or "")
            for page in reader.pages
        ]
        # Join once instead of growing a string page by page. Pages without
        # text still take part in the join, so their separators are kept.
        return "\n\n".join(page_texts).strip()
    except Exception as e:
        # Best-effort extraction: report the failure and return no text.
        print(f"Error extracting text from PDF: {e}")
        return ""
|
|
def extract_text_from_docx_bytes(docx_bytes: bytes) -> str:
    """
    Extract text (paragraphs and tables) from DOCX bytes.

    All paragraphs are emitted first, followed by every table row, with the
    cells of a row separated by " | ". The combined text is then passed
    through ``clean_extracted_text``.

    Args:
        docx_bytes: Raw content of a DOCX document.

    Returns:
        The cleaned text, or an empty string if anything goes wrong while
        reading the document (the error is printed, not raised).
    """
    try:
        doc = Document(io.BytesIO(docx_bytes))

        # Collect one entry per paragraph, then one per table row, and join
        # once instead of growing a string inside the loops.
        lines = [para.text for para in doc.paragraphs]
        lines.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in doc.tables
            for row in table.rows
        )
        return clean_extracted_text("\n".join(lines)).strip()
    except Exception as e:
        # Best-effort extraction: report the failure and return no text.
        print(f"Error extracting text from DOCX: {e}")
        return ""
|
|
|
|
def extract_text_from_txt_bytes(txt_bytes: bytes, encoding: str = 'utf-8') -> str:
    """
    Extract and clean text from raw TXT bytes using the given encoding.

    The bytes are decoded strictly with ``encoding``. If they are not valid
    in that encoding, or the encoding name is unknown, the whole input is
    decoded as latin-1 instead, so no bytes are silently discarded.

    Args:
        txt_bytes: Raw content of a text file.
        encoding: Preferred encoding to try first.

    Returns:
        The decoded text after ``clean_extracted_text`` has been applied.
    """
    try:
        # Strict decoding so that invalid input actually reaches the
        # fallback; with errors='ignore' bad bytes were dropped and the
        # fallback could never run for undecodable content.
        raw_text = txt_bytes.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # latin-1 maps every byte value to a character, so this cannot fail
        # on bytes input. Valid multi-byte sequences in a partially corrupt
        # file are reinterpreted byte by byte rather than lost.
        raw_text = txt_bytes.decode('latin-1')
    return clean_extracted_text(raw_text).strip()
|
|