Spaces:
Runtime error
Runtime error
| import re | |
| from pdfminer.high_level import extract_pages | |
| from pdfminer.layout import LTTextContainer | |
| from research_assistant.app_logging import app_logger | |
# Matches a text block that is nothing but a page number, e.g. "12", "3/10",
# "Page 4" or "Page 4/10".
_PAGE_NUMBER_PATTERN = re.compile(r"^(Page\s+)?\d+(/\d+)?$")


def _page_text_blocks(page_layout):
    """Return the stripped, non-empty text of every text container on a page."""
    blocks = []
    for element in page_layout:
        if not isinstance(element, LTTextContainer):
            continue
        text = element.get_text().strip()
        if text:
            blocks.append(text)
    return blocks


def _recurring_blocks(counter, total_pages):
    """Return the blocks seen on at least two pages and on more than half of all pages."""
    # The "at least two pages" guard matters for single-page documents: there,
    # a count of 1 already exceeds half the page count, which would wrongly
    # classify the page's first and last blocks as header and footer.
    return {
        block
        for block, count in counter.items()
        if count >= 2 and count > total_pages * 0.5
    }


def pdf_parser(pdf_path: str) -> str:
    """
    Extracts text from a PDF file, removing headers, footers, and page numbers.

    A text block is treated as a header (or footer) when it is the first
    (or last) text block of a page, and that exact text recurs in the same
    position on at least two pages and on more than half of all pages.
    Pages with fewer than two text blocks do not contribute to this count.
    Blocks that consist solely of a page number are dropped as well.

    Args:
        pdf_path (str): The file path to the PDF.

    Returns:
        str: The extracted text suitable for LLM input, with newlines
            replaced by spaces. An empty string is returned if parsing
            fails; the failure is logged via ``app_logger``.
    """
    try:
        # Read every page once and keep its text blocks, so the header/footer
        # detection and the cleaning step below share a single parse.
        pages = [_page_text_blocks(layout) for layout in extract_pages(pdf_path)]
        total_pages = len(pages)

        # Count how often each block appears as the first / last block of a page.
        header_counter, footer_counter = {}, {}
        for blocks in pages:
            if len(blocks) >= 2:
                header, footer = blocks[0], blocks[-1]
                header_counter[header] = header_counter.get(header, 0) + 1
                footer_counter[footer] = footer_counter.get(footer, 0) + 1

        boilerplate = _recurring_blocks(
            header_counter, total_pages
        ) | _recurring_blocks(footer_counter, total_pages)

        # Drop boilerplate by exact comparison: a prefix match would also
        # discard body text that merely starts with the header/footer string.
        extracted_text = [
            block
            for blocks in pages
            for block in blocks
            if block not in boilerplate and not _PAGE_NUMBER_PATTERN.match(block)
        ]
        return " ".join(extracted_text).replace("\n", " ").strip()
    except Exception as e:
        # Deliberate best-effort boundary: callers get "" instead of an exception.
        app_logger.error(f"Failed to parse PDF {pdf_path}: {e}")
        return ""