Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import argparse | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
# Third-party dependencies. If they are missing, install them on the fly
# (best-effort convenience for one-off script runs) and retry the import.
try:
    from PyPDF2 import PdfReader
    from tqdm import tqdm
except ImportError:
    print("Installing required dependencies...")
    import subprocess
    import sys

    # Run pip through the interpreter executing this script so the packages
    # are installed into the same environment the imports below read from;
    # a bare "pip" found on PATH may belong to a different Python.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2", "tqdm"])
    from PyPDF2 import PdfReader
    from tqdm import tqdm
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page in a PDF file.

    Pages are processed in document order and each page's text is followed
    by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or an empty string when the file cannot
        be opened or parsed (the error is printed rather than raised).
    """
    try:
        reader = PdfReader(pdf_path)
        # Guard against a page yielding None instead of a string (e.g. a page
        # with no text layer): treat it as an empty page so one such page
        # does not discard the text of the whole document.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    except Exception as e:
        # Deliberate best-effort: a broken PDF is reported and yields no text
        # so that batch processing of other files can continue.
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""
def process_pdfs(pdf_dir: str, output_dir: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
    """Extract text from every PDF in a directory and split it into word chunks.

    Each PDF's text is split on whitespace and regrouped into chunks of at
    most ``chunk_size`` words. Chunks of 100 characters or fewer are dropped.
    All kept chunks are written to ``<output_dir>/pdf_data.json``.

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        output_dir: Directory that receives ``pdf_data.json``; created if
            it does not exist.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of dicts with keys ``"text"`` (the chunk), ``"source"`` (the
        PDF file name without extension) and ``"chunk_id"`` (index of the
        chunk within its PDF).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``pdf_dir``
            contains no PDF files.
    """
    # Reject a bad chunk size up front: zero would make range() raise inside
    # the per-file handler below (reported as if every PDF were broken), and a
    # negative value would silently yield no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # glob() makes no ordering guarantee; sort so the chunk order (and any
    # positional train/validation split built on it) is reproducible.
    pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))
    if not pdf_files:
        raise ValueError(f"No PDF files found in {pdf_dir}")

    os.makedirs(output_dir, exist_ok=True)

    all_data = []
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            file_name = pdf_file.stem
            print(f"Processing {file_name}")
            text = extract_text_from_pdf(str(pdf_file))
            if not text.strip():
                print(f"Warning: No text extracted from {file_name}")
                continue

            # Split into chunks to avoid context length issues
            words = text.split()
            for start in range(0, len(words), chunk_size):
                chunk = " ".join(words[start:start + chunk_size])
                # Keep only chunks with enough content to be useful; chunk_id
                # still reflects the position within the PDF, so ids may have
                # gaps where a short chunk was dropped.
                if len(chunk) > 100:
                    all_data.append(
                        {
                            "text": chunk,
                            "source": file_name,
                            "chunk_id": start // chunk_size,
                        }
                    )
        except Exception as e:
            # Best-effort batch: report the failing PDF and move on.
            print(f"Error processing {pdf_file}: {e}")

    # Save all data to a single JSON file
    with open(os.path.join(output_dir, "pdf_data.json"), "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"Processed {len(pdf_files)} PDFs into {len(all_data)} text chunks")
    return all_data
def prepare_training_data(pdf_data: List[Dict[str, Any]], output_dir: str) -> None:
    """Turn text chunks into instruction-tuning examples and save a 90/10 split.

    For each chunk, the first 500 characters become the example's ``input``
    and the remainder becomes its ``output``. Chunks with nothing left after
    the first 500 characters are skipped, since they would produce an example
    with an empty target. The split is positional (no shuffling): the first
    90% of examples go to ``train.json`` and the rest to ``validation.json``,
    both under ``<output_dir>/training_data``.

    Args:
        pdf_data: Chunk dicts; only the ``"text"`` key of each is read.
        output_dir: Base directory; the ``training_data`` subdirectory is
            created if needed.
    """
    instruction = "Use the following text from the document to answer questions or generate content about the topics it covers."
    input_chars = 500  # Characters of each chunk used as the example input

    training_data = []
    skipped = 0
    for item in pdf_data:
        text = item["text"]
        remainder = text[input_chars:]  # Use rest of text as output
        if not remainder.strip():
            # Nothing left to serve as the output; an empty target is useless
            # for fine-tuning, so drop the example instead of emitting it.
            skipped += 1
            continue
        # Format for instruction fine-tuning
        training_data.append(
            {
                "instruction": instruction,
                "input": text[:input_chars],  # Use beginning of text as input
                "output": remainder,
            }
        )
    if skipped:
        print(f"Skipped {skipped} chunks too short to provide an output")

    # Create train/validation split (90/10)
    split_idx = int(len(training_data) * 0.9)
    if training_data and split_idx == 0:
        # A single example would otherwise land entirely in validation,
        # leaving the training set empty.
        split_idx = 1
    train_data = training_data[:split_idx]
    val_data = training_data[split_idx:]

    # Save splits
    split_dir = os.path.join(output_dir, "training_data")
    os.makedirs(split_dir, exist_ok=True)
    with open(os.path.join(split_dir, "train.json"), "w", encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)
    with open(os.path.join(split_dir, "validation.json"), "w", encoding="utf-8") as f:
        json.dump(val_data, f, ensure_ascii=False, indent=2)

    print(f"Created training dataset: {len(train_data)} train, {len(val_data)} validation examples")
if __name__ == "__main__":
    # Command-line entry point: parse the options, extract and chunk the PDFs,
    # then build the fine-tuning dataset from the resulting chunks.
    cli = argparse.ArgumentParser(description="Process PDFs and prepare training data")
    cli.add_argument(
        "--pdf_dir",
        type=str,
        required=True,
        help="Directory containing PDF files",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        default="processed_data",
        help="Output directory for processed data",
    )
    cli.add_argument(
        "--chunk_size",
        type=int,
        default=1000,
        help="Number of words per chunk",
    )
    options = cli.parse_args()

    # Stage 1: PDF text -> word chunks (also written to pdf_data.json).
    chunks = process_pdfs(options.pdf_dir, options.output_dir, options.chunk_size)
    # Stage 2: chunks -> train/validation JSON files.
    prepare_training_data(chunks, options.output_dir)

    print("PDF processing complete. Data is ready for fine-tuning.")