Spaces:
Sleeping
Sleeping
| import os | |
| from io import BytesIO | |
| from typing import Union | |
| import PyPDF2 | |
| import docx | |
class DocumentParser:
    """Utility class for extracting plain text from PDF, DOCX and TXT documents.

    All methods are static, so the class acts as a namespace and can be used
    either as ``DocumentParser.extract_text(...)`` or through an instance.
    Inputs may be a filesystem path (``str``) or a binary file-like object
    such as the ``BytesIO`` produced by a Streamlit upload.
    """

    @staticmethod
    def extract_text(file_input: Union[str, BytesIO], file_type: str = None) -> str:
        """Extract the text of a document, dispatching on its format.

        :param file_input: Path (string) or file-like object (BytesIO).
        :param file_type: File extension (e.g. 'pdf', 'docx', 'txt'), with or
            without a leading dot, case-insensitive. Required when the input
            is a file-like object. For a path, the extension found in the path
            takes precedence and this value is only used as a fallback when
            the path has no extension.
        :return: The stripped document text, or an empty string when the path
            does not exist, the type is missing or unsupported, or parsing
            raises an exception (the error is printed, not propagated).
        """
        if isinstance(file_input, str):
            if not os.path.exists(file_input):
                return ""
            # splitext only inspects the last path component, so a dot in a
            # directory name is never mistaken for a file extension.
            extension = os.path.splitext(file_input)[1]
            if extension:
                file_type = extension

        if not file_type:
            return ""

        # Accept ".PDF", " pdf " and "pdf" alike.
        file_type = file_type.strip().lower().lstrip(".")
        if not file_type:
            return ""

        try:
            if file_type == 'pdf':
                return DocumentParser._parse_pdf(file_input)
            elif file_type in ('docx', 'doc'):
                # NOTE(review): legacy binary .doc is routed to the DOCX
                # parser as in the original code; if the parser rejects it,
                # the failure is caught below and "" is returned.
                return DocumentParser._parse_docx(file_input)
            elif file_type == 'txt':
                return DocumentParser._parse_txt(file_input)
            else:
                print(f"Format tidak didukung: {file_type}")
                return ""
        except Exception as e:
            # Deliberate best-effort boundary: callers get "" instead of an
            # exception for unreadable or corrupt documents.
            print(f"Error saat membaca file {file_type}: {e}")
            return ""

    @staticmethod
    def _rewind(stream) -> None:
        """Move a file-like object back to its start if it supports seeking."""
        seek = getattr(stream, "seek", None)
        if callable(seek):
            seek(0)

    @staticmethod
    def _parse_pdf(file_input: Union[str, BytesIO]) -> str:
        """Return the concatenated text of every page of a PDF.

        A path is opened in binary mode and closed afterwards; a file-like
        object is left open and rewound so the caller can reuse it.
        """
        is_path = isinstance(file_input, str)
        stream = open(file_input, 'rb') if is_path else file_input
        try:
            reader = PyPDF2.PdfReader(stream)
            page_texts = [page.extract_text() for page in reader.pages]
        finally:
            if is_path:
                stream.close()
            else:
                DocumentParser._rewind(stream)
        # Pages without extractable text yield None/"" and are skipped.
        return "\n".join(chunk for chunk in page_texts if chunk).strip()

    @staticmethod
    def _parse_docx(file_input: Union[str, BytesIO]) -> str:
        """Return the paragraph texts of a DOCX document joined by newlines.

        ``docx.Document`` is given the path or file-like object directly.
        A file-like object is rewound afterwards, even when parsing fails.
        """
        try:
            document = docx.Document(file_input)
            text = "\n".join(paragraph.text for paragraph in document.paragraphs)
        finally:
            if not isinstance(file_input, str):
                DocumentParser._rewind(file_input)
        return text.strip()

    @staticmethod
    def _parse_txt(file_input: Union[str, BytesIO]) -> str:
        """Return the stripped contents of a UTF-8 text file or stream.

        Undecodable bytes are ignored and a leading UTF-8 byte-order mark is
        dropped. A stream is read from its beginning regardless of its current
        position and is rewound afterwards.
        """
        if isinstance(file_input, str):
            with open(file_input, 'r', encoding='utf-8-sig', errors='ignore') as handle:
                return handle.read().strip()

        # The stream may already have been consumed by an earlier reader.
        DocumentParser._rewind(file_input)
        try:
            data = file_input.read()
        finally:
            DocumentParser._rewind(file_input)
        if isinstance(data, (bytes, bytearray)):
            data = bytes(data).decode('utf-8-sig', errors='ignore')
        return data.strip()