Spaces:

BluescarfAI
/

LawyersTrainerAgenticSystem

Sleeping

faizanwasif

added documents support

367f1e7 9 months ago

1.86 kB


	from PyPDF2 import PdfReader
	from docx import Document
	import zipfile
	import xml.etree.ElementTree as ET
	import io

	def clean_extracted_text(text: str) -> str:
	    """Flatten extracted text onto a single line.

	    The input is cut at every newline character, each piece is trimmed of
	    surrounding whitespace, pieces that end up empty are discarded, and
	    the remaining pieces are joined with single spaces. Whitespace inside
	    a piece is left untouched.
	    """
	    trimmed = (piece.strip() for piece in text.split("\n"))
	    # filter(None, ...) drops the empty strings produced by blank lines.
	    return " ".join(filter(None, trimmed))

	def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
	    """Extract the text of every page of an in-memory PDF using PyPDF2.

	    Each page's text is flattened with ``clean_extracted_text`` and the
	    pages are separated by one blank line. Pages that yield no text are
	    skipped so they do not leave runs of empty separators in the result.

	    Args:
	        pdf_bytes: Raw contents of a PDF file.

	    Returns:
	        The extracted text, or ``""`` when no text was found or the
	        extraction failed (the error is printed, not raised).
	    """
	    try:
	        reader = PdfReader(io.BytesIO(pdf_bytes))
	        page_texts = []
	        for page in reader.pages:
	            # `or ""` guards against a falsy result from extract_text().
	            page_text = clean_extracted_text(page.extract_text() or "")
	            if page_text:
	                page_texts.append(page_text)
	        # Join once instead of growing a string with += per page.
	        return "\n\n".join(page_texts).strip()
	    except Exception as e:
	        # Best-effort contract: callers receive "" instead of an exception.
	        print(f"Error extracting text from PDF: {e}")
	        return ""

	def extract_text_from_docx_bytes(docx_bytes: bytes) -> str:
	    """Extract paragraph and table text from an in-memory DOCX file.

	    The text of ``doc.paragraphs`` comes first, followed by one line per
	    table row with the cell texts separated by ``" | "``. Table text is
	    therefore appended after all paragraph text rather than interleaved.
	    The combined text is flattened with ``clean_extracted_text``.

	    Args:
	        docx_bytes: Raw contents of a .docx file.

	    Returns:
	        The extracted text, or ``""`` when the extraction failed (the
	        error is printed, not raised).
	    """
	    try:
	        doc = Document(io.BytesIO(docx_bytes))
	        # Paragraphs, one per line.
	        lines = [para.text for para in doc.paragraphs]
	        # Table rows. The separator is a plain pipe: the previous "\|" was
	        # an invalid escape sequence that put a stray backslash in every
	        # row and triggers a SyntaxWarning on recent Python versions.
	        lines.extend(
	            " | ".join(cell.text for cell in row.cells)
	            for table in doc.tables
	            for row in table.rows
	        )
	        return clean_extracted_text("\n".join(lines)).strip()
	    except Exception as e:
	        # Best-effort contract: callers receive "" instead of an exception.
	        print(f"Error extracting text from DOCX: {e}")
	        return ""


	def extract_text_from_txt_bytes(txt_bytes: bytes, encoding: str = 'utf-8') -> str:
	    """Decode raw TXT bytes and flatten the result.

	    The bytes are decoded strictly with ``encoding`` first. If that fails
	    (undecodable bytes, unknown or invalid encoding name) they are decoded
	    as latin-1 instead, which maps every byte to a character and so cannot
	    fail. Decoding strictly is what makes the fallback reachable: with
	    ``errors='ignore'`` undecodable bytes were silently dropped and the
	    latin-1 branch never ran for badly encoded input.

	    Args:
	        txt_bytes: Raw contents of a text file.
	        encoding: Encoding to try first; defaults to UTF-8.

	    Returns:
	        The decoded text passed through ``clean_extracted_text``.
	    """
	    try:
	        raw_text = txt_bytes.decode(encoding)
	    except (ValueError, LookupError, TypeError):
	        # ValueError covers UnicodeDecodeError; LookupError an unknown
	        # codec name; TypeError a non-string `encoding` argument.
	        raw_text = txt_bytes.decode('latin-1')
	    return clean_extracted_text(raw_text).strip()