Spaces:

hmqid03
/

vietqa-api

Sleeping

quanho114

Deploy VietQA API

ebb8326 about 1 month ago

1.95 kB

	"""Document parsing utilities for various file formats."""

	from pathlib import Path

	from src.utils.common import normalize_text
	from src.utils.logging import print_log


	def load_pdf(file_path: Path) -> str:
	    """Load text from PDF file.

	    The text extracted from each page is joined with a newline, and
	    leading/trailing whitespace is stripped from the combined result.

	    Args:
	        file_path: Location of the PDF file to read.

	    Returns:
	        The text of all pages as a single string.

	    Raises:
	        ImportError: If the ``pypdf`` package cannot be imported.
	    """
	    try:
	        import pypdf
	    except ImportError as err:
	        # Chain explicitly so the underlying import failure stays attached.
	        raise ImportError("pypdf is required for PDF files. Install with: pip install pypdf") from err

	    reader = pypdf.PdfReader(str(file_path))
	    # One join instead of repeated ``+=``; after strip() the result is the
	    # same as appending "\n" to every page.
	    return "\n".join(page.extract_text() for page in reader.pages).strip()


	def load_docx(file_path: Path) -> str:
	    """Load text from DOCX file.

	    Only the document's ``paragraphs`` are read; their text is joined with
	    newlines in document order.

	    Args:
	        file_path: Location of the DOCX file to read.

	    Returns:
	        The paragraph texts joined by ``"\\n"``.

	    Raises:
	        ImportError: If the ``docx`` module (python-docx) cannot be imported.
	    """
	    try:
	        import docx
	    except ImportError as err:
	        # Chain explicitly so the underlying import failure stays attached.
	        raise ImportError("python-docx is required for DOCX files. Install with: pip install python-docx") from err

	    document = docx.Document(str(file_path))
	    return "\n".join(paragraph.text for paragraph in document.paragraphs)


	def load_txt(file_path: Path) -> str:
	    """Load text from TXT file.

	    The file is decoded as UTF-8. A leading byte-order mark, if present, is
	    dropped so it does not end up as a stray U+FEFF character in the text.

	    Args:
	        file_path: Location of the text file to read.

	    Returns:
	        The full decoded contents of the file.
	    """
	    # "utf-8-sig" decodes like "utf-8" but skips an optional leading BOM.
	    with open(file_path, encoding="utf-8-sig") as f:
	        return f.read()


	def load_document(file_path: Path) -> tuple[str | None, dict | None]:
	    """Load document (PDF, DOCX, TXT), normalize text, and return (text, metadata).

	    The loader is chosen from the lower-cased file extension. The loaded text
	    is passed through ``normalize_text``; if the result is empty the file is
	    treated as having no usable content.

	    Args:
	        file_path: Location of the document to load.

	    Returns:
	        ``(text, metadata)`` on success, where ``metadata`` holds the keys
	        ``source_file``, ``file_name`` and ``file_type`` (the extension
	        without its leading dot). ``(None, None)`` for unsupported
	        extensions, for files whose normalized text is empty, and for files
	        that fail to load.
	    """
	    ext = file_path.suffix.lower()

	    # Unsupported extensions are rejected up front; nothing here can raise.
	    if ext not in (".pdf", ".docx", ".txt"):
	        return None, None

	    try:
	        if ext == ".pdf":
	            text = load_pdf(file_path)
	        elif ext == ".docx":
	            text = load_docx(file_path)
	        else:
	            text = load_txt(file_path)

	        text = normalize_text(text)
	        if not text:
	            return None, None

	        metadata = {
	            "source_file": str(file_path),
	            "file_name": file_path.name,
	            "file_type": ext[1:],
	        }
	        return text, metadata

	    except Exception as e:
	        # Best-effort boundary: a failure on one file is logged and reported
	        # as (None, None) instead of being raised to the caller.
	        print_log(f" [Error] Failed to load {file_path.name}: {e}")
	        return None, None