"""
tools/pdf_reader.py
Extract text and metadata from research papers (PDF)
"""

import os
import re
from typing import Any, Dict, List, Optional

from pypdf import PdfReader

# ---------------------------------------------------------------------------
# Tunables and pre-compiled patterns (private to this module)
# ---------------------------------------------------------------------------

_ABSTRACT_SCAN_PAGES = 3     # pages scanned for the abstract / section headings
_MAX_ABSTRACT_CHARS = 1000   # abstracts are truncated to this many characters
_MIN_ABSTRACT_CHARS = 50     # shorter candidates are rejected as false positives
_REFERENCE_SCAN_PAGES = 5    # references are searched in this many trailing pages
_MAX_REFERENCES = 50         # cap on the number of references returned
_TEXT_PROBE_PAGES = 3        # pages sampled by validate_pdf() for a text layer
_MIN_TEXT_CHARS = 50         # a probed page needs more text than this to count
_CONTEXT_CHARS = 50          # characters of context kept around a search hit

# PDF info-dictionary keys mapped to the names exposed in get_paper_info().
_METADATA_KEYS = (
    ('title', '/Title'),
    ('author', '/Author'),
    ('subject', '/Subject'),
    ('creator', '/Creator'),
    ('producer', '/Producer'),
    ('creation_date', '/CreationDate'),
)

# An abstract ends at a blank line, a "1." heading, an "Introduction" heading,
# or the end of the scanned text.
_ABSTRACT_END = r'(?=\n\s*\n|\n\s*1\.|\n\s*introduction|$)'
_ABSTRACT_PATTERNS = tuple(
    re.compile(pattern, re.DOTALL | re.IGNORECASE)
    for pattern in (
        # "Abstract:", "Abstract-", "Abstract–" (en dash), "Abstract—" (em dash).
        # The dashes are written as \u escapes so the pattern cannot be
        # corrupted by a file-encoding mix-up.
        r'abstract[:\-\u2013\u2014]\s*(.*?)' + _ABSTRACT_END,
        # "Abstract" alone on a line, body on the following line(s).
        r'abstract\s*\n\s*(.*?)' + _ABSTRACT_END,
    )
)

_SECTION_NAMES = (
    r'introduction|background|related work|methodology|method|approach|'
    r'experiments?|results?|evaluation|discussion|conclusion'
)
_SECTION_PATTERNS = (
    # Numbered heading: "1 Introduction", "2. Related Work", "7 References"
    re.compile(
        r'^\s*\d+\.?\s+(' + _SECTION_NAMES + r'|references?)',
        re.IGNORECASE | re.MULTILINE,
    ),
    # Unnumbered heading alone on its line
    re.compile(
        r'^\s*(' + _SECTION_NAMES + r')\s*\n',
        re.IGNORECASE | re.MULTILINE,
    ),
)

_REFERENCES_SECTION = re.compile(
    r'(references?|bibliography)\s*\n\s*(.*?)(?=\n\s*appendix|\Z)',
    re.IGNORECASE | re.DOTALL,
)
# Start of a numbered reference: "[1] ", "(1) ", "1. ", "1 "
_REFERENCE_START = re.compile(r'[\[(]?\d+[\])]?\.?\s+')

_HORIZONTAL_WS = re.compile(r'[^\S\n]+')   # whitespace runs, excluding newlines
_WIDE_GAP = re.compile(r'\s{3,}')          # column-like gap inside a line


def _as_text(value: Any) -> str:
    """Return *value* as a plain ``str``; ``None`` becomes ``''``."""
    return '' if value is None else str(value)


class PDFReader:
    """
    PDF extraction tool for research papers

    Features:
    - Extract full text or a page range
    - Extract metadata (title, author, etc.)
    - Identify the abstract and common section headings (heuristic)
    - Search text and extract references (heuristic)
    - Validate that a PDF is readable and has extractable text

    All text comes from ``page.extract_text()``; no layout-specific handling
    (e.g. for multi-column pages) is performed in this module.
    """

    def __init__(self):
        # File extensions accepted by extract_text() and validate_pdf().
        self.supported_extensions = ['.pdf']
        print("✅ PDF Reader initialized")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _page_text(page: Any) -> str:
        """Return the text extracted from *page*, never ``None``."""
        # Guard against a falsy return so joins and concatenation cannot fail.
        return page.extract_text() or ''

    def _has_supported_extension(self, pdf_path: str) -> bool:
        """Return True if *pdf_path* ends with a supported extension."""
        return pdf_path.lower().endswith(tuple(self.supported_extensions))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_text(self, pdf_path: str, max_pages: Optional[int] = None) -> str:
        """
        Extract all text from PDF

        Args:
            pdf_path: Path to PDF file
            max_pages: Maximum pages to extract (None = all). Values larger
                than the page count are clamped; 0 or a negative value
                extracts nothing.

        Returns:
            Extracted text as string, pages separated by a blank line

        Raises:
            FileNotFoundError: If pdf_path does not exist
            ValueError: If pdf_path does not have a supported extension
            Exception: Any error raised while reading is logged and re-raised
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        if not self._has_supported_extension(pdf_path):
            raise ValueError(f"Not a PDF file: {pdf_path}")

        print(f"📖 Reading PDF: {pdf_path}")

        try:
            reader = PdfReader(pdf_path)
            num_pages = len(reader.pages)
            print(f"   Pages: {num_pages}")

            # "is None" rather than truthiness: max_pages=0 must not mean "all".
            if max_pages is None:
                pages_to_read = num_pages
            else:
                pages_to_read = max(0, min(num_pages, max_pages))

            text_parts = []
            for i in range(pages_to_read):
                text_parts.append(self._page_text(reader.pages[i]))
                if (i + 1) % 10 == 0:
                    print(f"   Processed {i + 1}/{pages_to_read} pages...")

            full_text = '\n\n'.join(text_parts)
            print(f"✅ Extracted {len(full_text)} characters from {pages_to_read} pages")
            return full_text

        except Exception as e:
            print(f"❌ PDF extraction error: {e}")
            raise

    def get_paper_info(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract metadata and basic info from PDF

        The abstract and section headings are detected only in the first
        few pages of the document.

        Returns:
            {
                'metadata': {...},
                'num_pages': int,
                'abstract': str,
                'sections': [...],
                'file_path': str,
                'file_size': int
            }
            On failure the same keys are returned with empty values plus an
            'error' key holding the exception message; nothing is raised.
        """
        print(f"📊 Extracting paper info from: {pdf_path}")

        try:
            reader = PdfReader(pdf_path)

            # Coerce every metadata value to a plain str so the result is
            # safe to slice, print and serialize.
            metadata: Dict[str, str] = {}
            raw_metadata = reader.metadata
            if raw_metadata:
                metadata = {
                    name: _as_text(raw_metadata.get(pdf_key, ''))
                    for name, pdf_key in _METADATA_KEYS
                }

            num_pages = len(reader.pages)

            # Only the leading pages are scanned for abstract and headings.
            first_pages_text = ''.join(
                self._page_text(reader.pages[i]) + '\n\n'
                for i in range(min(_ABSTRACT_SCAN_PAGES, num_pages))
            )

            abstract = self._extract_abstract(first_pages_text)
            sections = self._extract_sections(first_pages_text)

            info = {
                'metadata': metadata,
                'num_pages': num_pages,
                'abstract': abstract,
                'sections': sections,
                'file_path': pdf_path,
                'file_size': os.path.getsize(pdf_path),
            }

            title = metadata.get('title') or 'Not found'
            print("✅ Paper info extracted:")
            print(f"   Title: {title[:50]}...")
            print(f"   Pages: {num_pages}")
            print(f"   Abstract: {'Found' if abstract else 'Not found'}")

            return info

        except Exception as e:
            print(f"❌ Error extracting paper info: {e}")
            # Same keys as the success path so callers can index safely.
            return {
                'metadata': {},
                'num_pages': 0,
                'abstract': '',
                'sections': [],
                'file_path': pdf_path,
                'file_size': 0,
                'error': str(e),
            }

    def _extract_abstract(self, text: str) -> str:
        """
        Try to extract abstract from paper text

        Recognized forms: "Abstract:", "Abstract-", "Abstract—" followed by
        the body, or "Abstract" / "ABSTRACT" alone on a line with the body
        below it.

        Returns:
            The abstract with whitespace collapsed, truncated to 1000
            characters, or '' if no candidate longer than 50 characters
            is found.
        """
        for pattern in _ABSTRACT_PATTERNS:
            match = pattern.search(text)
            if not match:
                continue

            abstract = re.sub(r'\s+', ' ', match.group(1).strip())
            abstract = abstract[:_MAX_ABSTRACT_CHARS]

            # Very short captures are usually a stray "abstract" mention.
            if len(abstract) > _MIN_ABSTRACT_CHARS:
                return abstract

        return ''

    def _extract_sections(self, text: str) -> List[str]:
        """
        Try to identify paper sections

        Looks for common section names (Introduction, Method, Results, ...)
        either numbered ("2. Related Work") or alone on a line.

        Returns:
            Title-cased section names in order of first appearance in
            *text*, without duplicates.
        """
        hits = []
        for pattern in _SECTION_PATTERNS:
            for match in pattern.finditer(text):
                hits.append((match.start(1), match.group(1).strip().title()))

        # Sort by position so the result follows the document rather than
        # the order in which the patterns were tried.
        sections: List[str] = []
        seen = set()
        for _, name in sorted(hits):
            key = name.lower()
            if key not in seen:
                seen.add(key)
                sections.append(name)

        return sections

    def extract_page_range(
        self,
        pdf_path: str,
        start_page: int,
        end_page: int
    ) -> str:
        """
        Extract text from specific page range

        Args:
            pdf_path: Path to PDF file
            start_page: Zero-based index of the first page (clamped to the
                document)
            end_page: Zero-based index of the last page, inclusive (clamped
                to the document and to be no smaller than start_page)

        Returns:
            Text of the selected pages separated by blank lines, or '' if
            the document has no pages or cannot be read.
        """
        try:
            reader = PdfReader(pdf_path)
            num_pages = len(reader.pages)

            if num_pages == 0:
                return ''

            first = max(0, min(start_page, num_pages - 1))
            last = max(first, min(end_page, num_pages - 1))

            return '\n\n'.join(
                self._page_text(reader.pages[i]) for i in range(first, last + 1)
            )

        except Exception as e:
            print(f"❌ Error extracting page range: {e}")
            return ''

    def search_text(self, pdf_path: str, search_term: str) -> List[Dict[str, Any]]:
        """
        Search for text in PDF

        The search is case-insensitive and treats search_term literally
        (regex metacharacters are escaped).

        Returns:
            List of matches, each a dict with
            'page' (1-based page number), 'context' (the hit with up to 50
            characters on either side) and 'position' (character offset
            within the page text). Empty list if search_term is empty or
            the PDF cannot be read.
        """
        # An empty pattern would match at every character position.
        if not search_term:
            return []

        print(f"🔍 Searching for '{search_term}' in {pdf_path}")

        try:
            reader = PdfReader(pdf_path)
            pattern = re.compile(re.escape(search_term), re.IGNORECASE)
            matches = []

            for page_num, page in enumerate(reader.pages, start=1):
                text = self._page_text(page)

                for match in pattern.finditer(text):
                    start = max(0, match.start() - _CONTEXT_CHARS)
                    end = min(len(text), match.end() + _CONTEXT_CHARS)
                    matches.append({
                        'page': page_num,
                        'context': text[start:end],
                        'position': match.start(),
                    })

            print(f"✅ Found {len(matches)} matches")
            return matches

        except Exception as e:
            print(f"❌ Search error: {e}")
            return []

    def extract_references(self, pdf_path: str) -> List[str]:
        """
        Try to extract references/bibliography

        Scans the last few pages for a "References" / "Bibliography"
        heading and splits the text below it (up to an "Appendix" heading
        or the end) into entries at lines that start with a number marker
        such as "[1]", "(1)", "1." or "1".

        Returns:
            Up to 50 reference strings; [] if no references heading is
            found or the PDF cannot be read.
        """
        print(f"📚 Extracting references from {pdf_path}")

        try:
            reader = PdfReader(pdf_path)
            num_pages = len(reader.pages)

            start_page = max(0, num_pages - _REFERENCE_SCAN_PAGES)
            last_pages_text = ''.join(
                self._page_text(reader.pages[i]) + '\n\n'
                for i in range(start_page, num_pages)
            )

            match = _REFERENCES_SECTION.search(last_pages_text)
            if not match:
                return []

            references: List[str] = []
            current_parts: List[str] = []

            for raw_line in match.group(2).split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                # A numbered line closes the entry collected so far.
                if _REFERENCE_START.match(line) and current_parts:
                    references.append(' '.join(current_parts))
                    current_parts = []
                current_parts.append(line)

            if current_parts:
                references.append(' '.join(current_parts))

            print(f"✅ Extracted {len(references)} references")
            return references[:_MAX_REFERENCES]

        except Exception as e:
            print(f"❌ Error extracting references: {e}")
            return []

    def get_text_stats(self, pdf_path: str) -> Dict[str, Any]:
        """
        Get statistics about the PDF text

        Returns:
            Dict with 'total_characters', 'total_words', 'total_lines',
            'estimated_tokens' (characters // 4, a rough estimate) and
            'avg_word_length'; or {'error': message} if extraction fails.
        """
        try:
            text = self.extract_text(pdf_path)
            words = text.split()

            return {
                'total_characters': len(text),
                'total_words': len(words),
                'total_lines': len(text.split('\n')),
                'estimated_tokens': len(text) // 4,  # Rough estimate
                # max(..., 1) avoids division by zero for empty documents.
                'avg_word_length': sum(map(len, words)) / max(len(words), 1),
            }

        except Exception as e:
            return {'error': str(e)}

    def validate_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """
        Validate if PDF is readable and get basic info

        Checks are performed in order (exists, extension, readable, has
        text); the first failing check records a message in 'errors' and
        stops. The text check samples the first few pages so that a sparse
        or image-only cover page does not reject an otherwise usable file.

        Returns:
            Dict with boolean flags 'valid', 'exists', 'is_pdf',
            'readable', 'has_text', plus 'num_pages' and a list 'errors'.
        """
        validation: Dict[str, Any] = {
            'valid': False,
            'exists': False,
            'is_pdf': False,
            'readable': False,
            'num_pages': 0,
            'has_text': False,
            'errors': [],
        }

        if not os.path.exists(pdf_path):
            validation['errors'].append('File does not exist')
            return validation
        validation['exists'] = True

        if not self._has_supported_extension(pdf_path):
            validation['errors'].append('Not a PDF file')
            return validation
        validation['is_pdf'] = True

        try:
            reader = PdfReader(pdf_path)
            num_pages = len(reader.pages)
            validation['readable'] = True
            validation['num_pages'] = num_pages

            if num_pages == 0:
                validation['errors'].append('PDF has no pages')
            elif any(
                len(self._page_text(reader.pages[i]).strip()) > _MIN_TEXT_CHARS
                for i in range(min(_TEXT_PROBE_PAGES, num_pages))
            ):
                validation['has_text'] = True
                validation['valid'] = True
            else:
                validation['errors'].append(
                    'PDF has no extractable text (may be scanned image)'
                )

        except Exception as e:
            validation['errors'].append(f'Read error: {str(e)}')

        return validation


# ==================== HELPER FUNCTIONS ====================

def clean_text(text: str, min_line_length: int = 20) -> str:
    """
    Clean extracted PDF text

    Works line by line so that line structure survives:
    - runs of spaces/tabs inside a line are collapsed to one space
    - lines consisting only of digits (page numbers) are removed
    - lines not longer than min_line_length are removed, as a heuristic
      for headers/footers. This also drops short headings and short final
      lines of paragraphs; pass min_line_length=0 to keep every non-empty
      line.

    Args:
        text: Raw text as returned by the extraction methods
        min_line_length: Lines must be longer than this to be kept

    Returns:
        The kept lines joined with newlines ('' if nothing is kept)
    """
    kept_lines = []
    for raw_line in text.splitlines():
        line = _HORIZONTAL_WS.sub(' ', raw_line).strip()

        if not line or line.isdigit():
            continue
        if len(line) > min_line_length:
            kept_lines.append(line)

    return '\n'.join(kept_lines)


def extract_tables(text: str) -> List[str]:
    """
    Try to identify table-like structures in text

    A line is considered part of a table if it contains at least 3 tabs or
    at least 3 gaps of 3+ whitespace characters. A run of 3 or more such
    consecutive lines is reported as one table.

    Returns:
        List of tables, each the run's lines joined with newlines
    """
    tables: List[str] = []
    run: List[str] = []

    def flush() -> None:
        # Runs shorter than 3 lines are treated as noise.
        if len(run) >= 3:
            tables.append('\n'.join(run))
        run.clear()

    for line in text.split('\n'):
        if line.count('\t') >= 3 or len(_WIDE_GAP.findall(line)) >= 3:
            run.append(line)
        else:
            flush()

    # A table that runs to the end of the text has no terminating line.
    flush()

    return tables


# ==================== DEMO ====================

def demo_pdf_reader():
    """Demo the PDF Reader: validate, summarize, extract and search a PDF."""
    print("=" * 60)
    print("📄 PDF READER DEMO")
    print("=" * 60)
    print()

    reader = PDFReader()

    # Ask for PDF path
    print("Enter path to a PDF research paper to test:")
    pdf_path = input("Path: ").strip()

    if not pdf_path:
        print("⏭️ No path provided, exiting demo")
        return

    print()

    # Validate PDF
    print("🔍 Validating PDF...")
    validation = reader.validate_pdf(pdf_path)
    print(f"Valid: {validation['valid']}")
    if not validation['valid']:
        print(f"❌ Errors: {validation['errors']}")
        return

    print()

    # Get paper info
    print("📊 Extracting paper info...")
    info = reader.get_paper_info(pdf_path)
    if 'error' in info:
        # get_paper_info() has already printed the error message.
        return

    print("\nMetadata:")
    for key, value in info['metadata'].items():
        if value:
            print(f"   {key}: {value}")

    print(f"\nPages: {info['num_pages']}")
    print(f"File size: {info['file_size']:,} bytes")

    if info['abstract']:
        print("\nAbstract (first 200 chars):")
        print(f"   {info['abstract'][:200]}...")

    if info['sections']:
        print(f"\nSections found: {', '.join(info['sections'])}")

    print()

    # Extract text
    print("📖 Extracting full text (first 5 pages)...")
    text = reader.extract_text(pdf_path, max_pages=5)
    print("\nExtracted text (first 500 chars):")
    print(f"   {text[:500]}...")

    # Get stats
    print("\n📈 Text statistics:")
    stats = reader.get_text_stats(pdf_path)
    for key, value in stats.items():
        print(f"   {key}: {value}")

    # Search test
    print("\n🔍 Search test:")
    search_term = input("Enter term to search (or Enter to skip): ").strip()
    if search_term:
        matches = reader.search_text(pdf_path, search_term)
        print(f"\nFound {len(matches)} matches:")
        for i, match in enumerate(matches[:3], 1):
            print(f"\n   {i}. Page {match['page']}:")
            print(f"      ...{match['context']}...")

    print("\n✅ Demo complete!")


if __name__ == "__main__":
    demo_pdf_reader()