| from pypdf import PdfReader | |
| from utils import chunk_text | |
| from config import CHUNK_SIZE, CHUNK_OVERLAP | |
def extract_text_from_pdf(file_path) -> str:
    """Return the extracted text of a PDF with a marker before each page.

    Each page whose extracted text is non-empty contributes one section of
    the form ``"\n\n--- Page N ---\n<text>"``, where ``N`` is the 1-based
    position of the page in the document. Pages for which ``extract_text()``
    returns an empty (falsy) value are skipped, so their page numbers simply
    do not appear in the output.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The concatenated page sections, or an empty string when no page
        yields any text.
    """
    reader = PdfReader(file_path)

    # Collect the sections and join once at the end: repeated ``+=`` on a
    # growing string can degrade to quadratic time on large documents.
    sections = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text:
            sections.append(f"\n\n--- Page {page_number} ---\n{text}")
    return "".join(sections)
def process_pdf(file_path):
    """Extract the text of a PDF and split it into chunks.

    The text comes from ``extract_text_from_pdf(file_path)`` and is handed to
    ``chunk_text`` together with the configured ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP``.

    Args:
        file_path: Location of the PDF; forwarded to
            ``extract_text_from_pdf``.

    Returns:
        Whatever ``chunk_text`` returns for the extracted text.
    """
    return chunk_text(
        extract_text_from_pdf(file_path),
        CHUNK_SIZE,
        CHUNK_OVERLAP,
    )