|
|
|
|
|
"""PDF extraction CLI script for the RAG chatbot build pipeline. |
|
|
|
|
|
This script processes PDF files from an input directory and extracts their |
|
|
content to Markdown format in an output directory. It is Step 2 of the offline |
|
|
build pipeline: PDF -> Markdown extraction using pymupdf4llm. |
|
|
|
|
|
Features: |
|
|
- Incremental extraction: Skip files that already have markdown output |
|
|
- Force overwrite: Re-extract all files with --force flag |
|
|
- Timestamp-based detection: Re-extract if PDF is newer than markdown |
|
|
- Progress reporting: Visual progress bar during batch extraction |
|
|
- Verbose mode: Show detailed file names being processed |
|
|
- Quiet mode: Suppress the progress bar (errors and the final summary are still shown) |
|
|
- Statistics summary: Display extraction stats on completion |
|
|
|
|
|
Exit Codes: |
|
|
0: Success - All files processed successfully (or skipped) |
|
|
1: Partial failure - Some files failed but some succeeded |
|
|
2: Total failure - No files processed or invalid arguments |
|
|
|
|
|
Example Usage: |
|
|
# Basic extraction |
|
|
poetry run python scripts/extract.py data/raw/ data/processed/ |
|
|
|
|
|
# Force overwrite existing files |
|
|
poetry run python scripts/extract.py data/raw/ data/processed/ --force |
|
|
|
|
|
# Verbose mode (show file names) |
|
|
poetry run python scripts/extract.py data/raw/ data/processed/ -v |
|
|
|
|
|
# Quiet mode (no output except errors) |
|
|
poetry run python scripts/extract.py data/raw/ data/processed/ -q |
|
|
|
|
|
Note: |
|
|
---- |
|
|
This script uses lazy loading for heavy dependencies (PDFExtractor, |
|
|
MarkdownConverter) to ensure fast CLI startup times. |
|
|
|
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import hashlib |
|
|
import sys |
|
|
import time |
|
|
from dataclasses import dataclass |
|
|
from pathlib import Path |
|
|
from typing import TYPE_CHECKING |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from dotenv import load_dotenv |
|
|
|
|
|
|
|
|
# Project root is taken to be the directory above the one holding this script.
_PROJECT_ROOT = Path(__file__).parent.parent
_ENV_FILE = _PROJECT_ROOT.joinpath(".env")

# Load the project-level .env file through python-dotenv, but only when the
# file is actually present.
if _ENV_FILE.exists():
    load_dotenv(_ENV_FILE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if TYPE_CHECKING: |
|
|
from rag_chatbot.extraction import MarkdownConverter, PDFExtractor |
|
|
from rag_chatbot.extraction.models import ExtractedDocument |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Public API of this script module (what ``from ... import *`` exposes).
__all__: list[str] = ["ExtractionStatistics", "parse_args", "run_extraction", "main"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Process exit codes returned by main().
EXIT_SUCCESS = 0  # every file was extracted or skipped
EXIT_PARTIAL_FAILURE = 1  # at least one file failed, at least one did not
EXIT_TOTAL_FAILURE = 2  # nothing could be processed, or the input was invalid

if TYPE_CHECKING:
    from collections.abc import Callable

# Optional progress hook. When set to a callable, run_extraction() invokes it
# once per PDF as ``_update_progress(index, total_files)`` where ``index`` is
# the zero-based position of the file just handled; the return value is
# ignored. It was previously annotated as plain ``None``, which made assigning
# a callable a type error and the call sites unreachable for type checkers.
_update_progress: Callable[[int, int], object] | None = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ExtractionStatistics:
    """Aggregate counters and timing for one batch extraction run.

    Every field is validated in ``__post_init__`` and must be non-negative.

    Attributes:
    ----------
    total : int
        Number of PDF files found in the input directory.
    extracted : int
        Number of files successfully written as markdown.
    skipped : int
        Number of files left alone because their output already existed
        (incremental mode).
    failed : int
        Number of files whose extraction raised an error.
    total_pages : int
        Pages extracted, summed over all successful files.
    elapsed_seconds : float
        Wall-clock duration of the run, in seconds.

    Example:
    -------
    >>> stats = ExtractionStatistics(
    ...     total=10,
    ...     extracted=8,
    ...     skipped=1,
    ...     failed=1,
    ...     total_pages=42,
    ...     elapsed_seconds=15.3,
    ... )
    >>> stats.extracted + stats.skipped + stats.failed == stats.total
    True

    """

    total: int
    extracted: int
    skipped: int
    failed: int
    total_pages: int
    elapsed_seconds: float

    def __post_init__(self) -> None:
        """Reject negative values, checking fields in declaration order.

        Raises
        ------
        ValueError: For the first field (in declaration order) that is
            negative.
        TypeError: Propagated from the ``< 0`` comparison when a value
            cannot be ordered against an int.

        """
        checked_fields = (
            "total",
            "extracted",
            "skipped",
            "failed",
            "total_pages",
            "elapsed_seconds",
        )
        for field_name in checked_fields:
            value = getattr(self, field_name)
            if value < 0:
                raise ValueError(f"{field_name} must be non-negative, got {value}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the CLI parser and parse *argv*.

    ``--verbose`` and ``--quiet`` live in a mutually exclusive group, so
    argparse itself rejects the combination.

    Args:
    ----
    argv : list[str] | None, optional
        Arguments to parse. ``None`` makes argparse read ``sys.argv[1:]``;
        passing a list lets tests drive the parser directly.

    Returns:
    -------
    argparse.Namespace
        Namespace with these attributes:
        - input_dir: Path - directory containing PDF files
        - output_dir: Path - directory for markdown output
        - force: bool - overwrite existing files
        - verbose: bool - show detailed output
        - quiet: bool - suppress progress output
        - dump_raw_for: str | None - PDF filename/stem to dump raw output for

    Raises:
    ------
    SystemExit
        Raised by argparse when required arguments are missing, unknown
        arguments are given, or both --verbose and --quiet are supplied.

    Example:
    -------
    >>> args = parse_args(["data/raw/", "data/processed/", "--force"])
    >>> args.force
    True

    """
    description = (
        "Extract PDF documents to Markdown format for the RAG pipeline. "
        "Processes all PDF files in the input directory and saves "
        "Markdown output to the output directory."
    )
    usage_examples = "".join(
        [
            "Examples:\n",
            " %(prog)s data/raw/ data/processed/ # Basic extraction\n",
            " %(prog)s data/raw/ data/processed/ --force # Force re-extract\n",
            " %(prog)s data/raw/ data/processed/ -v # Verbose output\n",
        ]
    )
    cli = argparse.ArgumentParser(
        prog="extract.py",
        description=description,
        epilog=usage_examples,
        # Keep the hand-formatted epilog line breaks intact in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Required positionals, registered in the order they appear on the CLI.
    positionals = (
        ("input_dir", "Directory containing PDF files to extract"),
        ("output_dir", "Directory where Markdown files will be written"),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=Path, help=help_text)

    # ``store_true`` flags default to False without spelling it out.
    cli.add_argument(
        "--force",
        "-f",
        action="store_true",
        help="Force overwrite of existing Markdown files (default: skip existing)",
    )

    verbosity = cli.add_mutually_exclusive_group()
    verbosity.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed output including file names being processed",
    )
    verbosity.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Suppress all output except errors (still shows summary)",
    )

    cli.add_argument(
        "--dump-raw-for",
        default=None,
        help=(
            "Dump raw pymupdf4llm markdown (pre-MarkdownConverter) for a "
            "matching PDF filename or stem to <output_dir>/<stem>.raw.md"
        ),
    )

    return cli.parse_args(argv)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _should_extract( |
|
|
pdf_path: Path, |
|
|
md_path: Path, |
|
|
force: bool, |
|
|
) -> bool: |
|
|
"""Determine if a PDF file should be extracted. |
|
|
|
|
|
This function implements the incremental extraction logic by checking: |
|
|
1. If --force is set, always extract |
|
|
2. If markdown output doesn't exist, extract |
|
|
3. If PDF is newer than markdown, extract (re-extract modified files) |
|
|
4. Otherwise, skip (markdown is up-to-date) |
|
|
|
|
|
Args: |
|
|
---- |
|
|
pdf_path : Path |
|
|
Path to the PDF file to potentially extract. |
|
|
md_path : Path |
|
|
Path where the markdown output would be written. |
|
|
force : bool |
|
|
Whether to force extraction regardless of existing output. |
|
|
|
|
|
Returns: |
|
|
------- |
|
|
bool |
|
|
True if the file should be extracted, False if it should be skipped. |
|
|
|
|
|
""" |
|
|
|
|
|
if force: |
|
|
return True |
|
|
|
|
|
|
|
|
if not md_path.exists(): |
|
|
return True |
|
|
|
|
|
|
|
|
pdf_mtime = pdf_path.stat().st_mtime |
|
|
md_mtime = md_path.stat().st_mtime |
|
|
|
|
|
return pdf_mtime > md_mtime |
|
|
|
|
|
|
|
|
def _get_extractor() -> PDFExtractor:
    """Create a PDFExtractor, importing its module only on first use.

    The import is deferred to call time so that importing this script does
    not pull in the extraction package and its dependencies.

    Returns
    -------
    PDFExtractor
        A new extractor built with its default constructor arguments.

    """
    from rag_chatbot.extraction import PDFExtractor as _ExtractorClass

    extractor = _ExtractorClass()
    return extractor
|
|
|
|
|
|
|
|
def _get_converter() -> MarkdownConverter:
    """Create a MarkdownConverter, importing its module only on first use.

    Returns
    -------
    MarkdownConverter
        A new converter built with its default constructor arguments.

    """
    from rag_chatbot.extraction import MarkdownConverter as _ConverterClass

    converter = _ConverterClass()
    return converter
|
|
|
|
|
|
|
|
def _print_summary(stats: ExtractionStatistics, quiet: bool) -> None: |
|
|
"""Print the extraction summary statistics. |
|
|
|
|
|
Displays a formatted summary of the extraction run including counts |
|
|
of extracted, skipped, and failed files, plus timing information. |
|
|
|
|
|
Args: |
|
|
---- |
|
|
stats : ExtractionStatistics |
|
|
The statistics from the extraction run. |
|
|
quiet : bool |
|
|
Unused - summary is always printed even in quiet mode. |
|
|
Kept for API consistency with run_extraction parameters. |
|
|
|
|
|
Note: |
|
|
---- |
|
|
The quiet parameter is intentionally unused because the summary |
|
|
should always be shown to the user, even in quiet mode. Only |
|
|
the progress bar is suppressed in quiet mode. |
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
print() |
|
|
print("Extraction Complete") |
|
|
print("=" * 39) |
|
|
print(f"Total PDF files: {stats.total:>4}") |
|
|
print(f"Extracted: {stats.extracted:>4}") |
|
|
print(f"Skipped (existing): {stats.skipped:>4}") |
|
|
print(f"Failed: {stats.failed:>4}") |
|
|
print(f"Total pages: {stats.total_pages:>4}") |
|
|
print(f"Elapsed time: {stats.elapsed_seconds:.2f}s") |
|
|
print("=" * 39) |
|
|
|
|
|
|
|
|
def _empty_statistics(start_time: float) -> ExtractionStatistics:
    """Build all-zero statistics whose elapsed time is measured from *start_time*."""
    return ExtractionStatistics(
        total=0,
        extracted=0,
        skipped=0,
        failed=0,
        total_pages=0,
        elapsed_seconds=time.perf_counter() - start_time,
    )


def _is_raw_dump_target(pdf_path: Path, target_lower: str) -> bool:
    """Tell whether *pdf_path* is the file selected for a raw-markdown dump."""
    if not target_lower:
        return False
    name_lower = pdf_path.name.lower()
    if target_lower.endswith(".pdf"):
        # A target carrying the ".pdf" suffix must equal the full file name.
        return name_lower == target_lower
    return target_lower in {name_lower, pdf_path.stem.lower()}


def _print_markdown_fingerprint(label: str, markdown: str) -> None:
    """Print a 12-hex-digit SHA-256 prefix and the underscore count of *markdown*."""
    digest = hashlib.sha256(markdown.encode("utf-8")).hexdigest()[:12]
    print(label, digest, f"underscores={markdown.count('_')}")


def _extract_single_pdf(
    pdf_path: Path,
    md_path: Path,
    extractor: PDFExtractor,
    converter: MarkdownConverter,
    *,
    verbose: bool,
    raw_dump_path: Path | None,
) -> int:
    """Extract one PDF, write its markdown to *md_path*, and return its page count.

    When *raw_dump_path* is given, the unconverted markdown is written there
    as well. Any exception is left for the caller to handle.
    """
    if verbose:
        print(f" Extracting: {pdf_path.name}")

    document: ExtractedDocument = extractor.extract(pdf_path)
    raw_markdown = document.to_markdown()

    dump_raw = raw_dump_path is not None
    if raw_dump_path is not None:
        raw_dump_path.write_text(raw_markdown, encoding="utf-8")
        if verbose:
            print(f" Wrote raw markdown: {raw_dump_path}")
            _print_markdown_fingerprint(" Raw checksum/underscores:", raw_markdown)

    clean_markdown = converter.convert(raw_markdown)
    if verbose and dump_raw:
        _print_markdown_fingerprint(" Clean checksum/underscores:", clean_markdown)

    md_path.write_text(clean_markdown, encoding="utf-8")

    page_count = document.page_count
    if verbose:
        print(
            f" Extracted {page_count} pages, "
            f"{document.total_tables} tables, "
            f"{document.total_images} images"
        )
    return page_count


def run_extraction(
    input_dir: Path,
    output_dir: Path,
    force: bool,
    verbose: bool,
    quiet: bool,
    dump_raw_for: str | None = None,
) -> ExtractionStatistics:
    """Extract every ``*.pdf`` in *input_dir* to markdown in *output_dir*.

    Steps:
    1. Create *output_dir* (including parents) if needed.
    2. Collect the PDF files directly inside *input_dir*, sorted by path.
    3. Skip files whose markdown is up to date unless *force* is set.
    4. Extract with PDFExtractor, normalise with MarkdownConverter, and
       write ``<stem>.md``.
    5. Print the summary and return the statistics.

    Args:
    ----
    input_dir : Path
        Directory containing PDF files to extract. Must exist.
    output_dir : Path
        Directory where Markdown files will be written. Created if needed.
    force : bool
        If True, re-extract even when up-to-date markdown already exists.
    verbose : bool
        If True, print per-file details.
    quiet : bool
        If True, suppress the progress bar (the summary is still printed).
    dump_raw_for : str | None
        Optional PDF filename or stem (case-insensitive). For the matching
        file the unconverted markdown is also written to
        ``<output_dir>/<stem>.raw.md``.

    Returns:
    -------
    ExtractionStatistics
        Counts and timing for the run. All-zero counts are returned, without
        a printed summary, when the output directory cannot be created or
        when no PDF files are found.

    Note:
    ----
    A failure in one file is reported on stderr and counted as failed; the
    remaining files are still processed. Each file is counted exactly once
    as extracted, skipped, or failed. If the module-level
    ``_update_progress`` hook is set, it is called after every file with
    the zero-based index and the total number of files.

    """
    start_time = time.perf_counter()

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except PermissionError:
        print(
            f"Error: Permission denied creating output directory: {output_dir}",
            file=sys.stderr,
        )
        return _empty_statistics(start_time)
    except OSError as exc:
        # e.g. the path (or one of its parents) exists as a regular file;
        # report it like the permission case instead of crashing.
        print(
            f"Error: Cannot create output directory {output_dir}: {exc}",
            file=sys.stderr,
        )
        return _empty_statistics(start_time)

    pdf_files = sorted(input_dir.glob("*.pdf"))
    total_files = len(pdf_files)
    if total_files == 0:
        return _empty_statistics(start_time)

    # Heavy dependencies are only loaded once there is work to do.
    extractor = _get_extractor()
    converter = _get_converter()

    progress_bar = pdf_files
    if not quiet:
        try:
            from tqdm import tqdm
        except ImportError:
            # tqdm is optional; fall back to a single status line.
            print(f"Processing {total_files} PDF files...")
        else:
            progress_bar = tqdm(pdf_files, desc="Extracting", unit="file")

    raw_target_lower = (dump_raw_for or "").strip().lower()

    extracted_count = 0
    skipped_count = 0
    failed_count = 0
    total_pages = 0

    for idx, pdf_path in enumerate(progress_bar):
        md_path = output_dir / f"{pdf_path.stem}.md"

        if not _should_extract(pdf_path, md_path, force):
            skipped_count += 1
            if verbose:
                print(f" Skipping (up-to-date): {pdf_path.name}")
        else:
            raw_dump_path = (
                output_dir / f"{pdf_path.stem}.raw.md"
                if _is_raw_dump_target(pdf_path, raw_target_lower)
                else None
            )
            try:
                page_count = _extract_single_pdf(
                    pdf_path,
                    md_path,
                    extractor,
                    converter,
                    verbose=verbose,
                    raw_dump_path=raw_dump_path,
                )
            except PermissionError as e:
                failed_count += 1
                print(
                    f"Error: Permission denied for {pdf_path.name}: {e}",
                    file=sys.stderr,
                )
            except Exception as e:
                # Deliberate per-file boundary: one bad PDF must not abort
                # the whole batch.
                failed_count += 1
                print(
                    f"Error: Failed to extract {pdf_path.name}: {e}",
                    file=sys.stderr,
                )
            else:
                # Counted only after everything succeeded, so a file can
                # never be tallied as both extracted and failed.
                extracted_count += 1
                total_pages += page_count

        if _update_progress is not None:
            _update_progress(idx, total_files)

    stats = ExtractionStatistics(
        total=total_files,
        extracted=extracted_count,
        skipped=skipped_count,
        failed=failed_count,
        total_pages=total_pages,
        elapsed_seconds=time.perf_counter() - start_time,
    )

    _print_summary(stats, quiet)

    return stats
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Run the extraction CLI and return its exit code.

    The input path is validated (it must exist, be a directory, and contain
    at least one ``*.pdf`` file) before run_extraction is called, which also
    prints the summary. The returned statistics are then mapped to an exit
    code.

    Args:
    ----
    argv : list[str] | None, optional
        Command-line arguments to parse. If None, uses sys.argv[1:].

    Returns:
    -------
    int
        - 0: no file failed (all extracted or skipped)
        - 1: some files failed, but at least one was extracted or skipped
        - 2: invalid input, nothing processed, or every file failed

    Example:
    -------
    >>> exit_code = main(["data/raw/", "data/processed/"])
    >>> exit_code
    0

    """
    options = parse_args(argv)
    source_dir = options.input_dir

    def _abort(message: str) -> int:
        # Every validation failure is reported on stderr with the same code.
        print(message, file=sys.stderr)
        return EXIT_TOTAL_FAILURE

    if not source_dir.exists():
        return _abort(f"Error: Input directory does not exist: {source_dir}")
    if not source_dir.is_dir():
        return _abort(f"Error: Input path is not a directory: {source_dir}")

    found_pdfs = list(source_dir.glob("*.pdf"))
    if not found_pdfs:
        return _abort(f"Error: No PDF files found in {source_dir}")

    stats = run_extraction(
        input_dir=source_dir,
        output_dir=options.output_dir,
        force=options.force,
        verbose=options.verbose,
        quiet=options.quiet,
        dump_raw_for=options.dump_raw_for,
    )

    # PDFs were found above, so an empty run means extraction never started
    # (for example, the output directory could not be created).
    if stats.total == 0 and found_pdfs:
        return EXIT_TOTAL_FAILURE

    if stats.failed == 0:
        return EXIT_SUCCESS

    any_handled = stats.extracted > 0 or stats.skipped > 0
    return EXIT_PARTIAL_FAILURE if any_handled else EXIT_TOTAL_FAILURE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: the value returned by main() becomes the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
|