Spaces:
Build error
Build error
| from PyPDF2 import PdfReader | |
| from typing import Dict, List | |
| import json | |
| from datetime import datetime | |
| import os | |
| from pathlib import Path | |
class PDFExtractor:
    """Extract per-page text from PDF files and save the result as JSON."""

    def __init__(self):
        """Create an extractor; the instance keeps no state."""

    def extract_text(self, file_path: str) -> Dict:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with the keys:
              - ``filename``: base name of ``file_path``.
              - ``total_pages``: number of pages in the document.
              - ``extracted_pages``: number of pages that yielded text.
              - ``text_by_page``: list of ``{"page_number", "text"}`` dicts,
                numbered from 1; pages with no text are omitted.
              - ``timestamp``: ``datetime.now().isoformat()`` at extraction.

        Raises:
            Exception: If the file cannot be read or parsed. The underlying
                error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PdfReader(file_path)
            total_pages = len(pdf_reader.pages)

            text_by_page = []
            for page_num, page in enumerate(pdf_reader.pages, 1):
                text = page.extract_text()
                # Pages without extractable text are left out, so
                # "extracted_pages" may be smaller than "total_pages".
                if text:
                    text_by_page.append({
                        "page_number": page_num,
                        "text": text
                    })

            return {
                "filename": os.path.basename(file_path),
                "total_pages": total_pages,
                "extracted_pages": len(text_by_page),
                "text_by_page": text_by_page,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            # Keep the generic Exception type callers already catch, but chain
            # the original error so its traceback is not lost.
            raise Exception(f"PDF 텍스트 추출 중 오류 발생: {str(e)}") from e

    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
        """Write extracted data to ``<output_dir>/extracted_<stem>.json``.

        Args:
            extracted_data: Dict as returned by ``extract_text``; its
                ``filename`` entry determines the output file name.
            output_dir: Directory to write into; created (with parents) if it
                does not exist.

        Returns:
            The path of the written JSON file, as a string.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # Use the stem so only the final extension is removed:
        # "report.v2.pdf" -> "report.v2". Splitting on the first dot would
        # truncate such names and let different inputs overwrite each other.
        stem = Path(extracted_data['filename']).stem
        output_path = target_dir / f"extracted_{stem}.json"

        with open(output_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        return str(output_path)
# Shared module-level instance, created once when the module is imported.
pdf_extractor: PDFExtractor = PDFExtractor()