Ringkas-In / src /utils /document_parser.py
anthonysigid's picture
deploy SummAIrizer apps to spaces
2a16478
Raw
History Blame Contribute Delete
3.09 kB
import os
from io import BytesIO
from typing import Union
import PyPDF2
import docx
class DocumentParser:
    """Extract plain text from PDF, DOCX and TXT documents.

    All methods are static, so the class acts as a namespace and never needs
    to be instantiated. Every parser accepts either a filesystem path
    (``str``) or a binary file-like object such as ``BytesIO`` (for example a
    Streamlit upload).
    """

    @staticmethod
    def extract_text(
        file_input: Union[str, BytesIO],
        file_type: Union[str, None] = None,
    ) -> str:
        """Return the text content of a document, or ``""`` on any failure.

        :param file_input: Path to a file, or a binary file-like object.
        :param file_type: Extension such as ``'pdf'``, ``'docx'`` or ``'txt'``
            (case-insensitive, an optional leading dot is accepted). Required
            for file-like input. For a path it is only used as a fallback
            when the path itself has no extension.
        :return: The extracted text with surrounding whitespace stripped. An
            empty string is returned when the path does not point to a file,
            the type is missing or unsupported, or the parser raises.
        """
        if isinstance(file_input, str):
            if not os.path.isfile(file_input):
                return ""
            # splitext only looks at the last path component, so a dot in a
            # directory name is never mistaken for a file extension.
            extension = os.path.splitext(file_input)[1]
            if extension:
                file_type = extension

        if not file_type:
            return ""

        # Accept ".PDF", " pdf " and "pdf" alike.
        file_type = file_type.strip().lstrip('.').lower()
        if not file_type:
            return ""

        try:
            if file_type == 'pdf':
                return DocumentParser._parse_pdf(file_input)
            elif file_type in ['docx', 'doc']:
                # NOTE(review): legacy .doc is routed to the DOCX parser;
                # confirm python-docx can actually open such files. If it
                # raises, the handler below turns that into "".
                return DocumentParser._parse_docx(file_input)
            elif file_type == 'txt':
                return DocumentParser._parse_txt(file_input)
            else:
                print(f"Format tidak didukung: {file_type}")
                return ""
        except Exception as e:
            # Deliberate best-effort boundary: callers get "" instead of an
            # exception, and the problem is reported on stdout.
            print(f"Error saat membaca file {file_type}: {e}")
            return ""

    @staticmethod
    def _rewind(stream) -> None:
        """Move a file-like object back to its start, if it supports that."""
        seek = getattr(stream, 'seek', None)
        if not callable(seek):
            return
        try:
            seek(0)
        except (OSError, ValueError):
            # Non-seekable or already closed stream: rewinding is only a
            # courtesy to the caller, so it must not mask the real result.
            pass

    @staticmethod
    def _read_pdf_pages(stream) -> str:
        """Join the text of every page of the PDF readable from *stream*."""
        reader = PyPDF2.PdfReader(stream)
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages without extractable text yield None or "", so skip them.
        return "\n".join(chunk for chunk in page_texts if chunk).strip()

    @staticmethod
    def _parse_pdf(file_input: Union[str, BytesIO]) -> str:
        """Extract text from a PDF given as a path or binary file-like object.

        A path is opened in binary mode and closed again. A file-like object
        is left open and rewound so the caller can read it again.
        """
        if isinstance(file_input, str):
            with open(file_input, 'rb') as handle:
                return DocumentParser._read_pdf_pages(handle)
        try:
            return DocumentParser._read_pdf_pages(file_input)
        finally:
            DocumentParser._rewind(file_input)

    @staticmethod
    def _parse_docx(file_input: Union[str, BytesIO]) -> str:
        """Extract paragraph text from a DOCX path or file-like object.

        Only ``doc.paragraphs`` is read, joined with newlines. A file-like
        object is rewound afterwards, even when parsing fails.
        """
        try:
            # docx.Document accepts both a path and a file-like object.
            doc = docx.Document(file_input)
            text = "\n".join(para.text for para in doc.paragraphs)
        finally:
            if not isinstance(file_input, str):
                DocumentParser._rewind(file_input)
        return text.strip()

    @staticmethod
    def _parse_txt(file_input: Union[str, BytesIO]) -> str:
        """Read a UTF-8 text document from a path or file-like object.

        Undecodable bytes are dropped and a leading UTF-8 byte-order mark is
        removed. A file-like object is read from its beginning regardless of
        its current position, then rewound for the caller.
        """
        if isinstance(file_input, str):
            # utf-8-sig behaves like utf-8 but also swallows a leading BOM.
            with open(file_input, 'r', encoding='utf-8-sig', errors='ignore') as handle:
                return handle.read().strip()

        # The stream may already have been consumed (e.g. by a preview), so
        # start from the top, matching what the PDF and DOCX parsers do.
        DocumentParser._rewind(file_input)
        try:
            raw = file_input.read()
        finally:
            DocumentParser._rewind(file_input)

        if isinstance(raw, str):
            # Text-mode stream: nothing to decode, only drop a stray BOM.
            text = raw.lstrip('\ufeff')
        else:
            text = raw.decode('utf-8-sig', errors='ignore')
        return text.strip()