Spaces:
Running
Running
import os
import shutil
from dataclasses import dataclass
from pathlib import Path
@dataclass
class ProcessingConfig:
    """Central configuration for Data Sources.

    Holds the tunable settings for one data source and derives every
    directory and log-file path from ``root_dir`` and ``source_name``.

    Attributes set in ``__post_init__`` (not constructor arguments):
        base_processed_dir: ``<root_dir>/processed``.
        source_dir: ``<base_processed_dir>/<source_name>``.
        drive_zip_dir: ``<source_dir>/zipped_batches``.
        meta_dir: ``<source_dir>/<source_name>_index_metadata``.
        logs: Maps ``'docs'``, ``'images'`` and ``'tables'`` to JSONL file
            paths inside ``meta_dir``.
        local_work_dir: Scratch directory ``/tmp/temp_work_<source_name>``.
        local_dirs: Maps ``'texts'``, ``'images'``, ``'tables'`` and
            ``'pdfs'`` to sub-directories of ``local_work_dir``.
    """

    root_dir: str
    source_name: str  # e.g., 'knbs' or 'cbk'

    # Settings
    # NOTE(review): these values are only declared here; their exact meaning
    # is defined by whichever code consumes this config - confirm there.
    batch_size: int = 20
    max_workers: int = 4
    min_image_bytes: int = 3000
    min_image_dim: int = 100
    max_page_objects: int = 500

    def __post_init__(self):
        """Derive all output, metadata and scratch paths from the fields."""
        # Paths setup
        self.base_processed_dir = os.path.join(self.root_dir, 'processed')
        self.source_dir = os.path.join(self.base_processed_dir, self.source_name)
        self.drive_zip_dir = os.path.join(self.source_dir, "zipped_batches")
        self.meta_dir = os.path.join(self.source_dir, f"{self.source_name}_index_metadata")

        # Log Files
        self.logs = {
            'docs': os.path.join(self.meta_dir, f'{self.source_name}_docs_metadata.jsonl'),
            'images': os.path.join(self.meta_dir, f'{self.source_name}_images_index.jsonl'),
            'tables': os.path.join(self.meta_dir, f'{self.source_name}_tables_index.jsonl'),
        }

        # Local Temp Paths
        self.local_work_dir = Path(f"/tmp/temp_work_{self.source_name}")
        self.local_dirs = {
            'texts': self.local_work_dir / "texts",
            'images': self.local_work_dir / "images",
            'tables': self.local_work_dir / "tables",
            'pdfs': self.local_work_dir / "pdfs",
        }

    def setup(self):
        """Create the output directories and a fresh local scratch area.

        Any existing ``local_work_dir`` is deleted first, so leftovers from a
        previous run are discarded. Finishes by writing the canary script via
        ``create_canary``.
        """
        os.makedirs(self.drive_zip_dir, exist_ok=True)
        os.makedirs(self.meta_dir, exist_ok=True)

        # Start from an empty scratch directory.
        if self.local_work_dir.exists():
            shutil.rmtree(self.local_work_dir)
        for d in self.local_dirs.values():
            d.mkdir(parents=True, exist_ok=True)

        self.create_canary()

    def create_canary(self):
        """Write ``pdf_canary.py`` into the current working directory.

        The generated script takes a PDF path as its first argument, reads
        every page's text and images with pymupdf, then touches every page's
        objects with pdfplumber. It prints ``SAFE`` and exits 0 when all of
        that succeeds, and exits 1 when the argument is missing or any
        exception is raised.
        """
        # The handler must be ``except Exception`` rather than a bare
        # ``except``: ``sys.exit(0)`` raises SystemExit, which a bare handler
        # would catch and turn into exit status 1 even on success.
        script_content = """
import sys, pymupdf, pdfplumber
if len(sys.argv) < 2: sys.exit(1)
try:
    doc = pymupdf.open(sys.argv[1])
    for p in doc: _, _ = p.get_text(), [doc.extract_image(i[0]) for i in p.get_images(full=True)]
    with pdfplumber.open(sys.argv[1]) as p: _ = [page.objects for page in p.pages]
    print("SAFE")
    sys.exit(0)
except Exception: sys.exit(1)
"""
        with open("pdf_canary.py", "w", encoding="utf-8") as f:
            f.write(script_content.strip())