File size: 1,084 Bytes
275cb5c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import fitz # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts text from a PDF file using PyMuPDF.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: The text of every page, concatenated in page order with no
            separator inserted between pages.
    """
    # Open the PDF as a context manager so the document is released on exit.
    with fitz.open(file_path) as pdf:
        # Collect each page's text and join once, instead of growing a string
        # with `+=` per page (repeated copying on long documents). The join is
        # evaluated here, while the document is still open.
        return "".join(page.get_text() for page in pdf)
def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list:
    """
    Break text into smaller, overlapping pieces.

    Args:
        text (str): The text to split.
        chunk_size (int): Forwarded to the splitter as ``chunk_size``
            (max characters per chunk).
        chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``
            (overlap between chunks).

    Returns:
        list: The chunks produced by the splitter's ``split_text``.
    """
    # Gather the splitter configuration in one place, then build and use the
    # splitter in a single expression.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)