GirishaBuilds01's picture
Create extraction.py
b9fe585 verified
raw
history blame contribute delete
530 Bytes
from pypdf import PdfReader
from utils import chunk_text
from config import CHUNK_SIZE, CHUNK_OVERLAP
def extract_text_from_pdf(file_path) -> str:
    """Return the text of every page of a PDF, labelled with page numbers.

    Each page whose ``extract_text()`` result is non-empty contributes one
    section: two newlines, a ``--- Page N ---`` header line (``N`` is the
    1-based page index), then the page text. Pages that yield no text are
    skipped entirely, so their numbers do not appear in the output.

    Args:
        file_path: Passed unchanged to ``PdfReader``.

    Returns:
        All page sections concatenated in page order, or ``""`` when no
        page yields any text.
    """
    reader = PdfReader(file_path)
    sections = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text:
            sections.append(f"\n\n--- Page {page_number} ---\n{text}")
    # Join once at the end rather than growing a string with ``+=`` per page,
    # which can degrade to quadratic copying on large documents.
    return "".join(sections)
def process_pdf(file_path):
    """Extract a PDF's text and split it into chunks.

    The text comes from ``extract_text_from_pdf(file_path)`` and is handed to
    ``chunk_text`` together with the configured ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP``.

    Args:
        file_path: Forwarded unchanged to ``extract_text_from_pdf``.

    Returns:
        Whatever ``chunk_text`` returns (presumably a sequence of text
        chunks -- confirm against ``utils.chunk_text``).
    """
    document_text = extract_text_from_pdf(file_path)
    return chunk_text(document_text, CHUNK_SIZE, CHUNK_OVERLAP)