LawyersTrainerAgenticSystem / files_extraction.py
faizanwasif
added documents support
367f1e7
from PyPDF2 import PdfReader
from docx import Document
import zipfile
import xml.etree.ElementTree as ET
import io
def clean_extracted_text(text: str) -> str:
    """
    Flatten multi-line text into a single space-separated line.

    The input is split on "\\n"; every line is stripped of leading and
    trailing whitespace, lines that end up empty are dropped, and the
    remaining lines are joined with one space. Whitespace inside a line is
    left untouched.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
    return " ".join(piece for piece in stripped_lines if piece)
def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """
    Extract the text of every page of an in-memory PDF using PyPDF2.

    Each page's text (or "" when the page yields none) is passed through
    ``clean_extracted_text``; pages are separated by a blank line and the
    combined result is stripped. If anything raises while reading, the error
    is printed and an empty string is returned.
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        cleaned_pages = [
            clean_extracted_text(page.extract_text() or "")
            for page in reader.pages
        ]
        # Joining with a blank line and stripping yields the same text as
        # appending "\n\n" after every page and stripping at the end.
        return "\n\n".join(cleaned_pages).strip()
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""
def extract_text_from_docx_bytes(docx_bytes: bytes) -> str:
    """
    Extract paragraph and table text from an in-memory DOCX file.

    All paragraph texts are collected first, followed by one line per table
    row in which the cell texts are joined with " | ". The collected lines
    are then passed through ``clean_extracted_text``. If anything raises
    while reading, the error is printed and an empty string is returned.
    """
    try:
        document = Document(io.BytesIO(docx_bytes))
        # Paragraphs first, then table rows, each piece on its own line.
        pieces = [paragraph.text for paragraph in document.paragraphs]
        pieces.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in document.tables
            for row in table.rows
        )
        combined = "".join(piece + "\n" for piece in pieces)
        return clean_extracted_text(combined).strip()
    except Exception as e:
        print(f"Error extracting text from DOCX: {e}")
        return ""
def extract_text_from_txt_bytes(txt_bytes: bytes, encoding: str = 'utf-8') -> str:
    """
    Decode raw TXT bytes and return the cleaned text.

    The bytes are first decoded strictly with ``encoding``. If that fails,
    either because the data contains byte sequences that are invalid in that
    encoding or because the codec name is unknown or not a text codec, the
    bytes are decoded as Latin-1 instead. Latin-1 maps every byte value to a
    character, so the fallback cannot fail and no input bytes are dropped.
    The decoded text is then passed through ``clean_extracted_text``.

    Args:
        txt_bytes: Raw contents of the text file.
        encoding: Codec to try first.

    Returns:
        The decoded and cleaned text.
    """
    try:
        # Decode strictly: with errors='ignore' this call would never raise
        # on bad bytes, so the fallback below would be unreachable and the
        # undecodable bytes would be silently discarded.
        raw_text = txt_bytes.decode(encoding)
    except (UnicodeError, LookupError):
        raw_text = txt_bytes.decode('latin-1')
    return clean_extracted_text(raw_text).strip()