Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| """ | |
| Parse PDF files using MinerU and save results to reference_papers directory | |
| """ | |
| import argparse | |
| from datetime import datetime | |
| from pathlib import Path | |
| import shutil | |
| import subprocess | |
| import sys | |
| def check_mineru_installed(): | |
| """Check if MinerU is installed""" | |
| try: | |
| # Security: Using partial path is intentional here - we need to find | |
| # the command in user's PATH. These are trusted CLI tools, not user input. | |
| result = subprocess.run( | |
| ["magic-pdf", "--version"], # nosec B607 | |
| check=False, | |
| capture_output=True, | |
| text=True, | |
| shell=False, | |
| ) | |
| if result.returncode == 0: | |
| return "magic-pdf" | |
| except FileNotFoundError: | |
| pass | |
| try: | |
| # Security: Same as above - intentionally using PATH lookup for CLI tool. | |
| result = subprocess.run( | |
| ["mineru", "--version"], # nosec B607 | |
| check=False, | |
| capture_output=True, | |
| text=True, | |
| shell=False, | |
| ) | |
| if result.returncode == 0: | |
| return "mineru" | |
| except FileNotFoundError: | |
| pass | |
| return None | |
| def parse_pdf_with_mineru(pdf_path: str, output_base_dir: str = None): | |
| """ | |
| Parse PDF file using MinerU | |
| Args: | |
| pdf_path: Path to PDF file | |
| output_base_dir: Base path for output directory, defaults to reference_papers | |
| Returns: | |
| bool: Whether parsing was successful | |
| """ | |
| mineru_cmd = check_mineru_installed() | |
| if not mineru_cmd: | |
| print("β Error: MinerU installation not detected") | |
| print("Please install MinerU first:") | |
| print(" pip install magic-pdf[full]") | |
| print("or") | |
| print(" pip install mineru") | |
| print("or visit: https://github.com/opendatalab/MinerU") | |
| return False | |
| print(f"β Detected MinerU command: {mineru_cmd}") | |
| pdf_path = Path(pdf_path).resolve() | |
| if not pdf_path.exists(): | |
| print(f"β Error: PDF file does not exist: {pdf_path}") | |
| return False | |
| if not pdf_path.suffix.lower() == ".pdf": | |
| print(f"β Error: File is not PDF format: {pdf_path}") | |
| return False | |
| # Project root is 3 levels up from deeptutor/tools/question/ | |
| project_root = Path(__file__).parent.parent.parent.parent | |
| if output_base_dir is None: | |
| output_base_dir = project_root / "reference_papers" | |
| else: | |
| output_base_dir = Path(output_base_dir) | |
| output_base_dir.mkdir(parents=True, exist_ok=True) | |
| pdf_name = pdf_path.stem | |
| output_dir = output_base_dir / pdf_name | |
| if output_dir.exists(): | |
| print(f"β οΈ Directory already exists, replacing: {output_dir.name}") | |
| shutil.rmtree(output_dir) | |
| print(f"π PDF file: {pdf_path}") | |
| print(f"π Output directory: {output_dir}") | |
| print("β Starting parsing...") | |
| try: | |
| temp_output = output_base_dir / "temp_mineru_output" | |
| temp_output.mkdir(parents=True, exist_ok=True) | |
| cmd = [mineru_cmd, "-p", str(pdf_path), "-o", str(temp_output)] | |
| print(f"π§ Executing command: {' '.join(cmd)}") | |
| result = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=False) | |
| if result.returncode != 0: | |
| print("β MinerU parsing failed:") | |
| print(f"Stdout: {result.stdout}") | |
| print(f"Stderr: {result.stderr}") | |
| if temp_output.exists(): | |
| shutil.rmtree(temp_output) | |
| return False | |
| print("β MinerU parsing completed!") | |
| generated_folders = list(temp_output.iterdir()) | |
| if not generated_folders: | |
| print("β οΈ Warning: No generated files found in temp directory") | |
| if temp_output.exists(): | |
| shutil.rmtree(temp_output) | |
| return False | |
| source_folder = generated_folders[0] if generated_folders[0].is_dir() else temp_output | |
| # Create target directory and move content | |
| output_dir.mkdir(parents=True, exist_ok=True) | |
| # Move MinerU-generated content to target directory | |
| if source_folder.exists() and source_folder.is_dir(): | |
| # If source_folder is the PDF-named directory, move its contents | |
| for item in source_folder.iterdir(): | |
| dest_item = output_dir / item.name | |
| if dest_item.exists(): | |
| if dest_item.is_dir(): | |
| shutil.rmtree(dest_item) | |
| else: | |
| dest_item.unlink() | |
| shutil.move(str(item), str(dest_item)) | |
| print(f"π¦ Files saved to: {output_dir}") | |
| else: | |
| if output_dir.exists(): | |
| shutil.rmtree(output_dir) | |
| shutil.move(str(source_folder), str(output_dir)) | |
| print(f"π¦ Files saved to: {output_dir}") | |
| if temp_output.exists(): | |
| shutil.rmtree(temp_output) | |
| print("\nπ Generated files:") | |
| for item in output_dir.rglob("*"): | |
| if item.is_file(): | |
| rel_path = item.relative_to(output_dir) | |
| print(f" - {rel_path}") | |
| return True | |
| except Exception as e: | |
| print(f"β Error occurred during parsing: {e!s}") | |
| import traceback | |
| traceback.print_exc() | |
| return False | |
| def main(): | |
| """Main function""" | |
| parser = argparse.ArgumentParser( | |
| description="Parse PDF files using MinerU and save results to reference_papers directory", | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| epilog=""" | |
| Examples: | |
| # Parse a single PDF file | |
| python pdf_parser.py /path/to/paper.pdf | |
| # Parse PDF and specify output directory | |
| python pdf_parser.py /path/to/paper.pdf -o /custom/output/dir | |
| """, | |
| ) | |
| parser.add_argument("pdf_path", type=str, help="Path to PDF file") | |
| parser.add_argument( | |
| "-o", | |
| "--output", | |
| type=str, | |
| default=None, | |
| help="Base path for output directory (default: reference_papers)", | |
| ) | |
| args = parser.parse_args() | |
| success = parse_pdf_with_mineru(args.pdf_path, args.output) | |
| if success: | |
| print("\nβ Parsing completed!") | |
| sys.exit(0) | |
| else: | |
| print("\nβ Parsing failed!") | |
| sys.exit(1) | |
| if __name__ == "__main__": | |
| main() | |