Spaces:

anthonysigid
/

Ringkas-In

Sleeping

App Files Files Community

Ringkas-In / src /utils /document_parser.py

anthonysigid

deploy SummAIrizer apps to spaces

2a16478 23 days ago

Raw

History Blame Contribute Delete

3.09 kB

	import os
	from io import BytesIO
	from typing import Union
	import PyPDF2
	import docx

	class DocumentParser:
	    """
	    Utility class that extracts plain text from PDF, DOCX and TXT documents.

	    Every public entry point accepts either a filesystem path (``str``) or a
	    binary file-like object such as ``io.BytesIO``. All methods are static, so
	    the class never needs to be instantiated.
	    """

	    @staticmethod
	    def extract_text(file_input: Union[str, BytesIO], file_type: Union[str, None] = None) -> str:
	        """
	        Extract the text of a document, choosing the parser from its file type.

	        :param file_input: Path to a file, or a binary file-like object.
	        :param file_type: File extension such as 'pdf', 'docx' or 'txt'. Case,
	            surrounding whitespace and a leading dot are ignored. Required for
	            file-like input; for a path it is only used as a fallback when the
	            path itself carries no extension.
	        :return: The extracted text, or "" when the path does not exist, the
	            type is missing or unsupported, or the parser raises.
	        """
	        if isinstance(file_input, str):
	            if not os.path.exists(file_input):
	                return ""
	            # splitext only inspects the final path component, so a dot in a
	            # directory name is never mistaken for a file extension.
	            path_extension = os.path.splitext(file_input)[1]
	            if path_extension:
	                file_type = path_extension

	        # Accept "pdf", ".pdf", " PDF " and similar spellings.
	        file_type = (file_type or "").strip().lstrip(".").lower()
	        if not file_type:
	            return ""

	        parsers = {
	            'pdf': DocumentParser._parse_pdf,
	            'docx': DocumentParser._parse_docx,
	            # NOTE(review): legacy .doc is routed to the DOCX parser as in the
	            # original code; presumably python-docx rejects that format, in
	            # which case the error path below returns "" - confirm.
	            'doc': DocumentParser._parse_docx,
	            'txt': DocumentParser._parse_txt,
	        }
	        parser = parsers.get(file_type)
	        if parser is None:
	            print(f"Format tidak didukung: {file_type}")
	            return ""

	        try:
	            return parser(file_input)
	        except Exception as e:
	            # Deliberate best-effort boundary: report the problem and hand
	            # back an empty string instead of propagating the failure.
	            print(f"Error saat membaca file {file_type}: {e}")
	            return ""

	    @staticmethod
	    def _rewind(stream) -> None:
	        """Move *stream* back to offset 0 if it supports seeking; otherwise do nothing."""
	        seek = getattr(stream, 'seek', None)
	        if seek is None:
	            return
	        try:
	            seek(0)
	        except (OSError, ValueError):
	            # Unseekable or already-closed stream: rewinding is best-effort.
	            pass

	    @staticmethod
	    def _read_pdf_pages(stream) -> str:
	        """Return the text of every page of the PDF in *stream*, one page per line group."""
	        reader = PyPDF2.PdfReader(stream)
	        page_texts = (page.extract_text() for page in reader.pages)
	        # Pages for which extract_text() returns nothing are skipped.
	        return "\n".join(chunk for chunk in page_texts if chunk).strip()

	    @staticmethod
	    def _parse_pdf(file_input: Union[str, BytesIO]) -> str:
	        """
	        Extract text from a PDF given as a path or a binary file-like object.

	        A path is opened in binary mode and closed again before returning. A
	        file-like object is left open and rewound to offset 0, even when
	        parsing fails, so the caller can read it again.
	        """
	        if isinstance(file_input, str):
	            with open(file_input, 'rb') as handle:
	                return DocumentParser._read_pdf_pages(handle)

	        try:
	            return DocumentParser._read_pdf_pages(file_input)
	        finally:
	            DocumentParser._rewind(file_input)

	    @staticmethod
	    def _parse_docx(file_input: Union[str, BytesIO]) -> str:
	        """
	        Extract the paragraph text of a DOCX document.

	        The input is passed straight to ``docx.Document``. Only
	        ``doc.paragraphs`` is read, joined with newlines. A file-like input is
	        rewound to offset 0 afterwards, even when parsing fails.
	        """
	        try:
	            document = docx.Document(file_input)
	            return "\n".join(paragraph.text for paragraph in document.paragraphs).strip()
	        finally:
	            # No-op for a path string, which has no seek().
	            DocumentParser._rewind(file_input)

	    @staticmethod
	    def _parse_txt(file_input: Union[str, BytesIO]) -> str:
	        """
	        Read a plain-text document as UTF-8.

	        Undecodable bytes are dropped and a leading UTF-8 byte-order mark is
	        removed. A file-like input is read from its beginning regardless of
	        its current position and is rewound to offset 0 afterwards.
	        """
	        if isinstance(file_input, str):
	            # 'utf-8-sig' decodes like 'utf-8' but also swallows a leading BOM.
	            with open(file_input, 'r', encoding='utf-8-sig', errors='ignore') as handle:
	                return handle.read().strip()

	        # The stream may already have been consumed by the caller.
	        DocumentParser._rewind(file_input)
	        try:
	            raw = file_input.read()
	        finally:
	            DocumentParser._rewind(file_input)

	        if isinstance(raw, (bytes, bytearray)):
	            raw = bytes(raw).decode('utf-8-sig', errors='ignore')
	        return raw.strip()