| """
|
| tools/pdf_reader.py
|
| Extract text and metadata from research papers (PDF)
|
| """
|
|
|
| from pypdf import PdfReader
|
| from typing import Dict, Any, Optional, List
|
| import re
|
| import os
|
|
|
| class PDFReader:
|
| """
|
| PDF extraction tool for research papers
|
|
|
| Features:
|
| - Extract full text
|
| - Extract metadata (title, author, etc.)
|
| - Identify abstract
|
| - Extract sections
|
| - Handle multi-column layouts
|
| """
|
|
|
| def __init__(self):
|
| self.supported_extensions = ['.pdf']
|
| print("β
PDF Reader initialized")
|
|
|
|
|
| def extract_text(self, pdf_path: str, max_pages: Optional[int] = None) -> str:
|
| """
|
| Extract all text from PDF
|
|
|
| Args:
|
| pdf_path: Path to PDF file
|
| max_pages: Maximum pages to extract (None = all)
|
|
|
| Returns:
|
| Extracted text as string
|
| """
|
| if not os.path.exists(pdf_path):
|
| raise FileNotFoundError(f"PDF not found: {pdf_path}")
|
|
|
| if not pdf_path.lower().endswith('.pdf'):
|
| raise ValueError(f"Not a PDF file: {pdf_path}")
|
|
|
| print(f"π Reading PDF: {pdf_path}")
|
|
|
| try:
|
| reader = PdfReader(pdf_path)
|
| num_pages = len(reader.pages)
|
|
|
| print(f" Pages: {num_pages}")
|
|
|
|
|
| text_parts = []
|
| pages_to_read = min(num_pages, max_pages) if max_pages else num_pages
|
|
|
| for i in range(pages_to_read):
|
| page = reader.pages[i]
|
| page_text = page.extract_text()
|
| text_parts.append(page_text)
|
|
|
| if (i + 1) % 10 == 0:
|
| print(f" Processed {i + 1}/{pages_to_read} pages...")
|
|
|
| full_text = '\n\n'.join(text_parts)
|
|
|
| print(f"β
Extracted {len(full_text)} characters from {pages_to_read} pages")
|
|
|
| return full_text
|
|
|
| except Exception as e:
|
| print(f"β PDF extraction error: {e}")
|
| raise
|
|
|
|
|
| def get_paper_info(self, pdf_path: str) -> Dict[str, Any]:
|
| """
|
| Extract metadata and basic info from PDF
|
|
|
| Returns:
|
| {
|
| 'metadata': {...},
|
| 'num_pages': int,
|
| 'abstract': str,
|
| 'sections': [...]
|
| }
|
| """
|
| print(f"π Extracting paper info from: {pdf_path}")
|
|
|
| try:
|
| reader = PdfReader(pdf_path)
|
|
|
|
|
| metadata = {}
|
| if reader.metadata:
|
| metadata = {
|
| 'title': reader.metadata.get('/Title', ''),
|
| 'author': reader.metadata.get('/Author', ''),
|
| 'subject': reader.metadata.get('/Subject', ''),
|
| 'creator': reader.metadata.get('/Creator', ''),
|
| 'producer': reader.metadata.get('/Producer', ''),
|
| 'creation_date': str(reader.metadata.get('/CreationDate', '')),
|
| }
|
|
|
|
|
| num_pages = len(reader.pages)
|
|
|
|
|
| first_pages_text = ''
|
| for i in range(min(3, num_pages)):
|
| first_pages_text += reader.pages[i].extract_text() + '\n\n'
|
|
|
|
|
| abstract = self._extract_abstract(first_pages_text)
|
|
|
|
|
| sections = self._extract_sections(first_pages_text)
|
|
|
| info = {
|
| 'metadata': metadata,
|
| 'num_pages': num_pages,
|
| 'abstract': abstract,
|
| 'sections': sections,
|
| 'file_path': pdf_path,
|
| 'file_size': os.path.getsize(pdf_path)
|
| }
|
|
|
| print(f"β
Paper info extracted:")
|
| print(f" Title: {metadata.get('title', 'Not found')[:50]}...")
|
| print(f" Pages: {num_pages}")
|
| print(f" Abstract: {'Found' if abstract else 'Not found'}")
|
|
|
| return info
|
|
|
| except Exception as e:
|
| print(f"β Error extracting paper info: {e}")
|
| return {
|
| 'metadata': {},
|
| 'num_pages': 0,
|
| 'abstract': '',
|
| 'sections': [],
|
| 'error': str(e)
|
| }
|
|
|
| def _extract_abstract(self, text: str) -> str:
|
| """Try to extract abstract from paper text"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| patterns = [
|
| r'(?i)abstract[:\-β]\s*(.*?)(?=\n\s*\n|\n\s*1\.|\n\s*introduction|$)',
|
| r'(?i)abstract\s*\n\s*(.*?)(?=\n\s*\n|\n\s*1\.|\n\s*introduction|$)',
|
| ]
|
|
|
| for pattern in patterns:
|
| match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
|
| if match:
|
| abstract = match.group(1).strip()
|
|
|
|
|
| abstract = re.sub(r'\s+', ' ', abstract)
|
| abstract = abstract[:1000]
|
|
|
| if len(abstract) > 50:
|
| return abstract
|
|
|
| return ''
|
|
|
|
|
| def _extract_sections(self, text: str) -> List[str]:
|
| """Try to identify paper sections"""
|
|
|
|
|
| section_patterns = [
|
| r'(?i)^\s*\d+\.?\s+(introduction|background|related work|methodology|method|approach|experiments?|results?|evaluation|discussion|conclusion|references?)',
|
| r'(?i)^\s*(introduction|background|related work|methodology|method|approach|experiments?|results?|evaluation|discussion|conclusion)\s*\n'
|
| ]
|
|
|
| sections = []
|
|
|
| for pattern in section_patterns:
|
| matches = re.finditer(pattern, text, re.MULTILINE)
|
| for match in matches:
|
| section_name = match.group(1).strip()
|
| if section_name.lower() not in [s.lower() for s in sections]:
|
| sections.append(section_name.title())
|
|
|
| return sections
|
|
|
| def extract_page_range(
|
| self,
|
| pdf_path: str,
|
| start_page: int,
|
| end_page: int
|
| ) -> str:
|
| """Extract text from specific page range"""
|
|
|
| try:
|
| reader = PdfReader(pdf_path)
|
| num_pages = len(reader.pages)
|
|
|
|
|
| start_page = max(0, min(start_page, num_pages - 1))
|
| end_page = max(start_page, min(end_page, num_pages - 1))
|
|
|
| text_parts = []
|
| for i in range(start_page, end_page + 1):
|
| text_parts.append(reader.pages[i].extract_text())
|
|
|
| return '\n\n'.join(text_parts)
|
|
|
| except Exception as e:
|
| print(f"β Error extracting page range: {e}")
|
| return ''
|
|
|
| def search_text(self, pdf_path: str, search_term: str) -> List[Dict[str, Any]]:
|
| """
|
| Search for text in PDF
|
|
|
| Returns list of matches with page numbers and context
|
| """
|
| print(f"π Searching for '{search_term}' in {pdf_path}")
|
|
|
| try:
|
| reader = PdfReader(pdf_path)
|
| matches = []
|
|
|
| for page_num, page in enumerate(reader.pages):
|
| text = page.extract_text()
|
|
|
|
|
| pattern = re.compile(re.escape(search_term), re.IGNORECASE)
|
|
|
| for match in pattern.finditer(text):
|
| start = max(0, match.start() - 50)
|
| end = min(len(text), match.end() + 50)
|
| context = text[start:end]
|
|
|
| matches.append({
|
| 'page': page_num + 1,
|
| 'context': context,
|
| 'position': match.start()
|
| })
|
|
|
| print(f"β
Found {len(matches)} matches")
|
| return matches
|
|
|
| except Exception as e:
|
| print(f"β Search error: {e}")
|
| return []
|
|
|
|
|
| def extract_references(self, pdf_path: str) -> List[str]:
|
| """Try to extract references/bibliography"""
|
|
|
| print(f"π Extracting references from {pdf_path}")
|
|
|
| try:
|
| reader = PdfReader(pdf_path)
|
| num_pages = len(reader.pages)
|
|
|
|
|
| last_pages_text = ''
|
| start_page = max(0, num_pages - 5)
|
|
|
| for i in range(start_page, num_pages):
|
| last_pages_text += reader.pages[i].extract_text() + '\n\n'
|
|
|
|
|
| ref_pattern = r'(?i)(references?|bibliography)\s*\n\s*(.*?)(?=\n\s*appendix|\Z)'
|
| match = re.search(ref_pattern, last_pages_text, re.DOTALL)
|
|
|
| if match:
|
| ref_text = match.group(2)
|
|
|
|
|
|
|
| ref_lines = ref_text.split('\n')
|
| references = []
|
| current_ref = ''
|
|
|
| for line in ref_lines:
|
| line = line.strip()
|
|
|
|
|
| if re.match(r'^\[?\d+\]?\.?\s+', line):
|
| if current_ref:
|
| references.append(current_ref.strip())
|
| current_ref = line
|
| else:
|
| current_ref += ' ' + line
|
|
|
| if current_ref:
|
| references.append(current_ref.strip())
|
|
|
| print(f"β
Extracted {len(references)} references")
|
| return references[:50]
|
|
|
| return []
|
|
|
| except Exception as e:
|
| print(f"β Error extracting references: {e}")
|
| return []
|
|
|
| def get_text_stats(self, pdf_path: str) -> Dict[str, Any]:
|
| """Get statistics about the PDF text"""
|
|
|
| try:
|
| text = self.extract_text(pdf_path)
|
|
|
| stats = {
|
| 'total_characters': len(text),
|
| 'total_words': len(text.split()),
|
| 'total_lines': len(text.split('\n')),
|
| 'estimated_tokens': len(text) // 4,
|
| 'avg_word_length': sum(len(word) for word in text.split()) / max(len(text.split()), 1)
|
| }
|
|
|
| return stats
|
|
|
| except Exception as e:
|
| return {'error': str(e)}
|
|
|
|
|
| def validate_pdf(self, pdf_path: str) -> Dict[str, Any]:
|
| """Validate if PDF is readable and get basic info"""
|
|
|
| validation = {
|
| 'valid': False,
|
| 'exists': False,
|
| 'is_pdf': False,
|
| 'readable': False,
|
| 'num_pages': 0,
|
| 'has_text': False,
|
| 'errors': []
|
| }
|
|
|
|
|
| if not os.path.exists(pdf_path):
|
| validation['errors'].append('File does not exist')
|
| return validation
|
|
|
| validation['exists'] = True
|
|
|
|
|
| if not pdf_path.lower().endswith('.pdf'):
|
| validation['errors'].append('Not a PDF file')
|
| return validation
|
|
|
| validation['is_pdf'] = True
|
|
|
|
|
| try:
|
| reader = PdfReader(pdf_path)
|
| validation['readable'] = True
|
| validation['num_pages'] = len(reader.pages)
|
|
|
|
|
| if validation['num_pages'] > 0:
|
| sample_text = reader.pages[0].extract_text()
|
| if len(sample_text.strip()) > 50:
|
| validation['has_text'] = True
|
| validation['valid'] = True
|
| else:
|
| validation['errors'].append('PDF has no extractable text (may be scanned image)')
|
| else:
|
| validation['errors'].append('PDF has no pages')
|
|
|
| except Exception as e:
|
| validation['errors'].append(f'Read error: {str(e)}')
|
|
|
| return validation
|
|
|
|
|
|
|
|
|
|
|
| def clean_text(text: str) -> str:
|
| """Clean extracted PDF text"""
|
|
|
|
|
| text = re.sub(r'\s+', ' ', text)
|
|
|
|
|
| text = re.sub(r'\n\s*\d+\s*\n', '\n', text)
|
|
|
|
|
| lines = text.split('\n')
|
| cleaned_lines = []
|
|
|
| for line in lines:
|
|
|
| if len(line.strip()) > 20:
|
| cleaned_lines.append(line)
|
|
|
| return '\n'.join(cleaned_lines)
|
|
|
|
|
| def extract_tables(text: str) -> List[str]:
|
| """Try to identify table-like structures in text"""
|
|
|
| tables = []
|
| lines = text.split('\n')
|
|
|
|
|
| table_lines = []
|
|
|
| for line in lines:
|
|
|
| if line.count('\t') >= 3 or len(re.findall(r'\s{3,}', line)) >= 3:
|
| table_lines.append(line)
|
| elif table_lines:
|
|
|
| if len(table_lines) >= 3:
|
| tables.append('\n'.join(table_lines))
|
| table_lines = []
|
|
|
| return tables
|
|
|
|
|
|
|
| def demo_pdf_reader():
|
| """Demo the PDF Reader"""
|
|
|
| print("="*60)
|
| print("π PDF READER DEMO")
|
| print("="*60)
|
| print()
|
|
|
| reader = PDFReader()
|
|
|
|
|
| print("Enter path to a PDF research paper to test:")
|
| pdf_path = input("Path: ").strip()
|
|
|
| if not pdf_path:
|
| print("βοΈ No path provided, exiting demo")
|
| return
|
|
|
| print()
|
|
|
|
|
|
|
| print("π Validating PDF...")
|
| validation = reader.validate_pdf(pdf_path)
|
| print(f"Valid: {validation['valid']}")
|
|
|
| if not validation['valid']:
|
| print(f"β Errors: {validation['errors']}")
|
| return
|
|
|
| print()
|
|
|
|
|
|
|
| print("π Extracting paper info...")
|
| info = reader.get_paper_info(pdf_path)
|
|
|
| print(f"\nMetadata:")
|
| for key, value in info['metadata'].items():
|
| if value:
|
| print(f" {key}: {value}")
|
|
|
| print(f"\nPages: {info['num_pages']}")
|
| print(f"File size: {info['file_size']:,} bytes")
|
|
|
| if info['abstract']:
|
| print(f"\nAbstract (first 200 chars):")
|
| print(f" {info['abstract'][:200]}...")
|
|
|
| if info['sections']:
|
| print(f"\nSections found: {', '.join(info['sections'])}")
|
|
|
| print()
|
|
|
|
|
| print("π Extracting full text (first 5 pages)...")
|
| text = reader.extract_text(pdf_path, max_pages=5)
|
|
|
| print(f"\nExtracted text (first 500 chars):")
|
| print(f" {text[:500]}...")
|
|
|
|
|
| print("\nπ Text statistics:")
|
| stats = reader.get_text_stats(pdf_path)
|
| for key, value in stats.items():
|
| print(f" {key}: {value}")
|
|
|
|
|
| print("\nπ Search test:")
|
| search_term = input("Enter term to search (or Enter to skip): ").strip()
|
|
|
| if search_term:
|
| matches = reader.search_text(pdf_path, search_term)
|
| print(f"\nFound {len(matches)} matches:")
|
| for i, match in enumerate(matches[:3], 1):
|
| print(f"\n {i}. Page {match['page']}:")
|
| print(f" ...{match['context']}...")
|
|
|
| print("\nβ
Demo complete!")
|
|
|
|
|
| if __name__ == "__main__":
|
| demo_pdf_reader |