Spaces:
Paused
Paused
| import os | |
| from langchain_core.documents import Document | |
| import pdfplumber | |
| from ingestion.loaders.normalization import normalize_text | |
def load_pdf(file_path: str) -> list:
    """Load a PDF into one LangChain ``Document`` per page.

    For each page the extracted text is passed through ``normalize_text``
    and stripped, then every table found on the page is appended as a
    tab-separated block under a ``=== Table N (Page P) ===`` header.
    Sections are separated by a blank line.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of ``Document`` objects, one per successfully processed page,
        each with ``source`` (the file's base name) and ``page_number``
        (1-based) in its metadata. Pages that raise during extraction are
        skipped. If the file cannot be opened or read at all, an empty list
        is returned.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Fail loudly for a missing path; every later failure is best-effort.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    documents = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                try:
                    text = normalize_text(page.extract_text() or "").strip()
                    tables = page.extract_tables() or []

                    # Collect sections and join once, so a page with tables
                    # but no text does not start with stray blank lines.
                    sections = [text] if text else []
                    for t_idx, table in enumerate(tables, start=1):
                        # Empty/None cells become empty strings so the
                        # column positions stay aligned.
                        table_text = normalize_text(
                            "\n".join(
                                "\t".join(cell or "" for cell in row)
                                for row in table
                            )
                        )
                        sections.append(
                            f"=== Table {t_idx} (Page {page_num}) ===\n{table_text}"
                        )

                    documents.append(
                        Document(
                            page_content="\n\n".join(sections),
                            metadata={
                                "source": os.path.basename(file_path),
                                "page_number": page_num,
                            },
                        )
                    )
                except Exception as e:
                    # Deliberate best-effort: one bad page must not abort
                    # the rest of the document.
                    print(f"Error extracting page {page_num}: {e}")
                    continue
    except Exception as e:
        # Opening/reading the file failed as a whole; report and return
        # nothing rather than crashing the caller.
        print(f"Failed to open or read PDF file: {file_path}")
        print(f"Error: {e}")
        return []

    return documents
def load_pdf_with_pages(file_path: str):
    """Return the text of every page of a PDF, read with PyMuPDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of dicts in page order, each of the form
        ``{"page": <1-based page number>, "text": <page text>}``.
    """
    import fitz  # PyMuPDF

    # Open as a context manager so the document is closed even if a page
    # fails while its text is being extracted.
    with fitz.open(file_path) as doc:
        return [
            {"page": page_num, "text": page.get_text()}
            for page_num, page in enumerate(doc, start=1)
        ]