Spaces:
Runtime error
Runtime error
| import fitz # PyMuPDF | |
| import requests | |
| from io import BytesIO | |
| from concurrent.futures import ThreadPoolExecutor | |
| import os | |
def extract_page_text(page):
    """Return the text of ``page``, or ``None`` when it is blank.

    Args:
        page: Any object exposing a ``get_text()`` method that returns a
            string (the callers in this file pass PyMuPDF pages).

    Returns:
        The text exactly as ``get_text()`` produced it (not stripped), or
        ``None`` if it is empty or contains only whitespace.
    """
    content = page.get_text()
    # Whitespace-only pages count as empty so callers can filter them out.
    if not content.strip():
        return None
    return content
def parse_pdf_from_url_multithreaded(url, max_workers=None, timeout=30):
    """Download a PDF from ``url`` and return the text of its non-empty pages.

    Args:
        url: Address of the PDF document to fetch with ``requests.get``.
        max_workers: Accepted for backward compatibility and ignored.
            PyMuPDF's documentation states that it does not support use
            from multiple threads (doing so may misbehave or crash the
            interpreter), so pages are extracted one after another.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            prevents an unresponsive server from blocking the caller
            indefinitely.

    Returns:
        A list of page texts in page order. Pages for which
        ``extract_page_text`` returns a falsy value (blank pages) are
        omitted.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other request failures (including
            timeouts).
        Exception: Whatever PyMuPDF raises if the payload cannot be opened
            as a PDF.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an HTML error page to the
    # PDF parser and getting a confusing parse error.
    res.raise_for_status()

    doc = fitz.open(stream=BytesIO(res.content), filetype="pdf")
    try:
        # Iterating the document yields pages in order; the text is copied
        # out as plain strings before the document is closed.
        return [text for text in map(extract_page_text, doc) if text]
    finally:
        # Release the document even when extraction raises.
        doc.close()
def parse_pdf_from_file_multithreaded(file_path, max_workers=None):
    """Open the PDF at ``file_path`` and return the text of its non-empty pages.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.
        max_workers: Accepted for backward compatibility and ignored.
            PyMuPDF's documentation states that it does not support use
            from multiple threads (doing so may misbehave or crash the
            interpreter), so pages are extracted one after another.

    Returns:
        A list of page texts in page order. Pages for which
        ``extract_page_text`` returns a falsy value (blank pages) are
        omitted.

    Raises:
        Exception: If opening the file or extracting text fails. The
            message is ``"Error parsing PDF file <file_path>: <reason>"``
            and the original error is attached as ``__cause__``.
    """
    try:
        doc = fitz.open(file_path)
        try:
            # Iterating the document yields pages in order; the text is
            # copied out as plain strings before the document is closed.
            return [text for text in map(extract_page_text, doc) if text]
        finally:
            # Release the file handle even when extraction raises.
            doc.close()
    except Exception as e:
        # Keep the generic Exception type callers already catch, but chain
        # the underlying error explicitly so the root cause stays visible.
        raise Exception(f"Error parsing PDF file {file_path}: {str(e)}") from e