Spaces:
Runtime error
Runtime error
| from PyPDF2 import PdfReader | |
| from typing import Dict, List | |
| import json | |
| from datetime import datetime | |
| import os | |
| from pathlib import Path | |
class PDFExtractor:
    """Extract per-page text from PDF files and save the result as JSON."""

    def __init__(self):
        # The extractor keeps no state between calls.
        pass

    def extract_text(self, file_path: str) -> Dict:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with the keys:
              - "filename": base name of ``file_path``
              - "total_pages": number of pages in the document
              - "extracted_pages": number of pages that yielded text
              - "text_by_page": list of {"page_number", "text"} dicts,
                with 1-based page numbers; pages whose extraction result
                is empty are omitted
              - "timestamp": ``datetime.now().isoformat()`` at extraction
                time (naive local time)

        Raises:
            Exception: if opening or parsing the PDF fails for any reason.
                The original error is attached as ``__cause__``.
        """
        try:
            # Open the PDF.
            pdf_reader = PdfReader(file_path)
            pages = pdf_reader.pages

            # Count pages.
            total_pages = len(pages)

            # Collect text page by page, skipping pages with no text.
            text_by_page = []
            for page_num, page in enumerate(pages, 1):
                text = page.extract_text()
                if text:
                    text_by_page.append({
                        "page_number": page_num,
                        "text": text,
                    })

            # Assemble the result.
            return {
                "filename": os.path.basename(file_path),
                "total_pages": total_pages,
                "extracted_pages": len(text_by_page),
                "text_by_page": text_by_page,
                "timestamp": datetime.now().isoformat(),
            }
        except Exception as e:
            # Chain the cause so the original traceback and type survive.
            raise Exception(f"PDF 텍스트 추출 중 오류 발생: {str(e)}") from e

    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
        """Save extracted text as a JSON file.

        The file is named ``extracted_<stem>.json``, where ``<stem>`` is
        ``extracted_data["filename"]`` with only its final suffix removed,
        so ``my.report.pdf`` becomes ``extracted_my.report.json``.

        Args:
            extracted_data: Dict to serialise; must contain a "filename" key.
            output_dir: Directory to write into; created (with parents) if
                it does not exist.

        Returns:
            The path of the written JSON file, as a string.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # Use the stem rather than split('.')[0]: names containing extra
        # dots would otherwise be truncated and could collide.
        stem = Path(extracted_data['filename']).stem
        output_path = target_dir / f"extracted_{stem}.json"

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        return str(output_path)
# Module-level shared instance (used as a singleton).
pdf_extractor = PDFExtractor()