#!/usr/bin/env python3
"""
PDF to HuggingFace Dataset Converter

This script converts PDF files in a directory to a format ready for HuggingFace
dataset upload. It processes PDFs, chunks them appropriately, and creates a
dataset with the expected schema:

- id: unique identifier for each chunk
- title: title of the source document
- content: text content of the chunk

Usage:
    python pdf_to_hf_dataset.py --input_dir /path/to/pdfs --output_dir /path/to/output [options]
"""

import argparse
import hashlib
import json
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm


class PDFToHFConverter:
    """Converter for PDF files to HuggingFace dataset format."""

    def __init__(self, chunk_size: int = 1500, chunk_overlap: int = 300):
        """Initialize the converter with chunking configuration."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Define text splitting separators
        separators = [
            "\n\n",  # Double newlines (paragraphs)
            "\n",    # Single newlines
            ". ",    # Sentences
            "? ",    # Questions
            "! ",    # Exclamations
            "; ",    # Semicolons
            ", ",    # Commas
            " ",     # Spaces
            "",      # Characters
        ]

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
        )

    def process_pdf(self, pdf_path: str) -> List[Dict[str, Any]]:
        """Process a single PDF file and return chunks with metadata."""
        try:
            print(f"Processing: {pdf_path}")

            # Load PDF
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()

            if not documents:
                print(f"Warning: No content extracted from {pdf_path}")
                return []

            # Combine all pages into one document for better chunking
            full_text = "\n\n".join([doc.page_content for doc in documents])

            # Extract title (filename without extension)
            filename = Path(pdf_path).name
            title = Path(pdf_path).stem

            # Create a single document for chunking
            combined_doc = Document(
                page_content=full_text,
                metadata={
                    "source": pdf_path,
                    "title": title,
                    "filename": filename,
                    "total_pages": len(documents),
                },
            )

            # Split into chunks
            chunks = self.text_splitter.split_documents([combined_doc])

            # Convert to HF format
            hf_chunks = []
            for i, chunk in enumerate(chunks):
                # Create unique ID using hash of content + position
                content_hash = hashlib.md5(chunk.page_content.encode()).hexdigest()[:8]
                chunk_id = f"{Path(pdf_path).stem}_{i:04d}_{content_hash}"

                # Clean content
                content = chunk.page_content.strip()

                # Skip very short chunks
                if len(content) < 100:
                    continue

                hf_chunk = {
                    "id": chunk_id,
                    "title": title,
                    "content": content,
                    "source": pdf_path,
                    "filename": filename,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "chunk_size": len(content),
                }
                hf_chunks.append(hf_chunk)

            print(f"Created {len(hf_chunks)} chunks from {pdf_path}")
            return hf_chunks

        except Exception as e:
            print(f"Error processing {pdf_path}: {str(e)}")
            return []

    def process_directory(
        self, input_dir: str, output_dir: str, output_format: str = "json"
    ) -> None:
        """Process all PDFs in a directory and save in HF format."""
        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Find all PDF files
        pdf_files = list(input_path.glob("**/*.pdf"))

        if not pdf_files:
            print(f"No PDF files found in {input_dir}")
            return

        print(f"Found {len(pdf_files)} PDF files to process")

        all_chunks = []

        # Process each PDF
        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            chunks = self.process_pdf(str(pdf_path))
            all_chunks.extend(chunks)

        if not all_chunks:
            print("No chunks were created from any PDFs")
            return

        print(f"Total chunks created: {len(all_chunks)}")

        # Save in requested format
        if output_format.lower() == "json":
            self.save_as_json(all_chunks, output_path)
        elif output_format.lower() == "jsonl":
            self.save_as_jsonl(all_chunks, output_path)
        elif output_format.lower() == "parquet":
            self.save_as_parquet(all_chunks, output_path)
        elif output_format.lower() == "csv":
            self.save_as_csv(all_chunks, output_path)
        else:
            print(f"Unsupported format: {output_format}")
            return

        # Also save metadata
        self.save_metadata(all_chunks, output_path)

        print(f"Dataset saved to {output_path}")
        print("Ready for HuggingFace upload!")

    def save_as_json(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
        """Save chunks as a JSON file."""
        output_file = output_path / "dataset.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(chunks, f, indent=2, ensure_ascii=False)
        print(f"Saved JSON: {output_file}")

    def save_as_jsonl(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
        """Save chunks as a JSONL file."""
        output_file = output_path / "dataset.jsonl"
        with open(output_file, "w", encoding="utf-8") as f:
            for chunk in chunks:
                json.dump(chunk, f, ensure_ascii=False)
                f.write("\n")
        print(f"Saved JSONL: {output_file}")

    def save_as_parquet(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
        """Save chunks as a Parquet file."""
        # Create minimal version for HF (only required fields)
        hf_data = [
            {"id": chunk["id"], "title": chunk["title"], "content": chunk["content"]}
            for chunk in chunks
        ]
        df = pd.DataFrame(hf_data)
        output_file = output_path / "dataset.parquet"
        df.to_parquet(output_file, index=False)
        print(f"Saved Parquet: {output_file}")

    def save_as_csv(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
        """Save chunks as a CSV file."""
        df = pd.DataFrame(chunks)
        output_file = output_path / "dataset.csv"
        df.to_csv(output_file, index=False, encoding="utf-8")
        print(f"Saved CSV: {output_file}")

    def save_metadata(self, chunks: List[Dict[str, Any]], output_path: Path) -> None:
        """Save dataset metadata and statistics."""
        metadata = {
            "total_chunks": len(chunks),
            "total_sources": len(set(chunk["source"] for chunk in chunks)),
            "avg_chunk_size": sum(chunk["chunk_size"] for chunk in chunks) / len(chunks),
            "chunk_size_config": self.chunk_size,
            "chunk_overlap_config": self.chunk_overlap,
            "sources": list(set(chunk["source"] for chunk in chunks)),
            "titles": list(set(chunk["title"] for chunk in chunks)),
        }

        metadata_file = output_path / "metadata.json"
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"Saved metadata: {metadata_file}")


if __name__ == "__main__":
    # Parse command-line arguments and run the converter.
    parser = argparse.ArgumentParser(
        description="Convert PDF files to HuggingFace dataset format"
    )
    parser.add_argument("--input_dir", "-i", required=True, help="Directory containing PDF files")
    parser.add_argument("--output_dir", "-o", required=True, help="Output directory for dataset")
    parser.add_argument(
        "--format",
        "-f",
        default="parquet",
        choices=["json", "jsonl", "parquet", "csv"],
        help="Output format (default: parquet)",
    )
    parser.add_argument(
        "--chunk_size",
        "-c",
        type=int,
        default=1500,
        help="Chunk size for text splitting (default: 1500)",
    )
    parser.add_argument(
        "--chunk_overlap",
        "-ol",
        type=int,
        default=300,
        help="Chunk overlap for text splitting (default: 300)",
    )

    args = parser.parse_args()

    # Create converter and process
    converter = PDFToHFConverter(chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap)
    converter.process_directory(
        input_dir=args.input_dir, output_dir=args.output_dir, output_format=args.format
    )
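
# Follow-up example (a minimal sketch, not part of this script): once the
# Parquet output exists, it can be loaded with the `datasets` library and
# pushed to the Hub. The output path and repo id below are placeholders, and
# this assumes you are already authenticated (e.g. via `huggingface-cli login`).
#
#     from datasets import load_dataset
#
#     ds = load_dataset("parquet", data_files="output/dataset.parquet")
#     ds.push_to_hub("your-username/your-dataset")  # hypothetical repo id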