Spaces:
Running
Running
File size: 2,294 Bytes
7011b92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import os
import shutil
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ProcessingConfig:
    """Central configuration for Data Sources.

    Holds the tuning settings for one data source and derives every
    filesystem location used for it from ``root_dir`` and ``source_name``.

    Attributes set in ``__post_init__`` (not constructor arguments):
        base_processed_dir: ``<root_dir>/processed``.
        source_dir: ``<base_processed_dir>/<source_name>``.
        drive_zip_dir: ``<source_dir>/zipped_batches``.
        meta_dir: ``<source_dir>/<source_name>_index_metadata``.
        logs: maps 'docs' / 'images' / 'tables' to JSONL paths in ``meta_dir``.
        local_work_dir: ``/tmp/temp_work_<source_name>`` as a ``Path``.
        local_dirs: maps 'texts' / 'images' / 'tables' / 'pdfs' to
            sub-directories of ``local_work_dir``.
    """

    root_dir: str
    source_name: str  # e.g., 'knbs' or 'cbk'

    # Settings
    # NOTE(review): these values are only stored here; their consumers are
    # outside this view, so the exact semantics should be confirmed there.
    batch_size: int = 20
    max_workers: int = 4
    min_image_bytes: int = 3000
    min_image_dim: int = 100
    max_page_objects: int = 500

    def __post_init__(self) -> None:
        """Derive all output, metadata and scratch paths for this source."""
        # Paths setup (kept as plain strings, exactly as callers receive today)
        self.base_processed_dir = os.path.join(self.root_dir, 'processed')
        self.source_dir = os.path.join(self.base_processed_dir, self.source_name)
        self.drive_zip_dir = os.path.join(self.source_dir, "zipped_batches")
        self.meta_dir = os.path.join(self.source_dir, f"{self.source_name}_index_metadata")

        # Log Files
        self.logs = {
            'docs': os.path.join(self.meta_dir, f'{self.source_name}_docs_metadata.jsonl'),
            'images': os.path.join(self.meta_dir, f'{self.source_name}_images_index.jsonl'),
            'tables': os.path.join(self.meta_dir, f'{self.source_name}_tables_index.jsonl'),
        }

        # Local Temp Paths
        self.local_work_dir = Path(f"/tmp/temp_work_{self.source_name}")
        self.local_dirs = {
            'texts': self.local_work_dir / "texts",
            'images': self.local_work_dir / "images",
            'tables': self.local_work_dir / "tables",
            'pdfs': self.local_work_dir / "pdfs",
        }

    def setup(self) -> None:
        """Create the output directories, reset the scratch area, write the canary.

        Any existing ``local_work_dir`` tree is deleted before the scratch
        sub-directories are recreated, so leftovers from a previous run are
        discarded.
        """
        os.makedirs(self.drive_zip_dir, exist_ok=True)
        os.makedirs(self.meta_dir, exist_ok=True)

        if self.local_work_dir.exists():
            shutil.rmtree(self.local_work_dir)
        for d in self.local_dirs.values():
            d.mkdir(parents=True, exist_ok=True)

        self.create_canary()

    def create_canary(self) -> None:
        """Write ``pdf_canary.py`` into the current working directory.

        The generated script takes a PDF path as its first argument, opens it
        with pymupdf (reading each page's text and embedded images) and with
        pdfplumber (reading each page's objects), prints ``SAFE`` and exits
        with status 0. It exits with status 1 when no argument is given or
        when any of those steps raises.
        """
        # Built line by line so the emitted file's indentation never depends
        # on how this method happens to be indented in the source file.
        script_lines = (
            "import sys, pymupdf, pdfplumber",
            "if len(sys.argv) < 2: sys.exit(1)",
            "try:",
            "    doc = pymupdf.open(sys.argv[1])",
            "    for p in doc: _, _ = p.get_text(), [doc.extract_image(i[0]) for i in p.get_images(full=True)]",
            "    with pdfplumber.open(sys.argv[1]) as p: _ = [page.objects for page in p.pages]",
            '    print("SAFE")',
            "    sys.exit(0)",
            # `except Exception`, not a bare `except`: sys.exit(0) above raises
            # SystemExit, which a bare except would catch and turn into exit
            # status 1 even for a PDF that parsed cleanly.
            "except Exception: sys.exit(1)",
        )
        script_content = "\n".join(script_lines)

        # Relative path on purpose: the file name/location is unchanged so
        # existing callers that run "pdf_canary.py" keep working.
        with open("pdf_canary.py", "w", encoding="utf-8") as f:
            f.write(script_content)