|
|
import re |
|
|
import fitz |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF with PyMuPDF and normalize it.

    Each page is read with ``page.get_text("text")``. When that yields
    nothing, the page's ``"blocks"`` output is used instead and the string
    payloads of the blocks are joined. No OCR is performed here, so a page
    with no text layer contributes whatever the block output contains,
    which may be nothing.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages combined, with every run of whitespace
        collapsed to a single space and the ends stripped.

    Raises:
        RuntimeError: If opening or reading the PDF fails for any reason.
            The original exception is attached as ``__cause__``.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback for pages where plain-text extraction is empty.
                    # NOTE(review): index 4 is assumed to be the block's text
                    # payload -- confirm against the PyMuPDF block layout.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                page_texts.append(page_text)
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Page separators are whitespace too, so a single pass normalizes it all.
    return re.sub(r"\s+", " ", "\n".join(page_texts)).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _split_oversized(sentence, limit):
    """Yield pieces of ``sentence`` that are each at most ``limit`` characters.

    A piece ends at the last space that keeps it within the limit; a run with
    no usable space is cut at exactly ``limit`` characters.
    """
    remaining = sentence
    while len(remaining) > limit:
        cut = remaining.rfind(" ", 0, limit + 1)
        if cut <= 0:
            # No space to break on: fall back to a hard cut.
            cut = limit
        yield remaining[:cut].rstrip()
        remaining = remaining[cut:].lstrip()
    if remaining:
        yield remaining


def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
    """
    Split text into overlapping, sentence-based chunks of bounded size.

    Whitespace is collapsed, the text is split after ``.``, ``!`` or ``?``,
    and sentences are packed greedily into chunks. Intended for preparing
    text for embedding-based semantic retrieval.

    Every returned chunk is at most ``chunk_size`` characters long:

    * the overlap carried into a new chunk is shortened (down to nothing)
      when the full overlap plus the next sentence would not fit;
    * a sentence longer than ``chunk_size`` is broken into pieces, at spaces
      where possible.

    Args:
        text (str): Input text.
        chunk_size (int): Max characters per chunk (default: 800).
        overlap (int): Characters from the end of the previous chunk that are
            repeated at the start of the next one (default: 150). Values
            below zero are treated as zero. The overlap is character-based,
            so it may begin in the middle of a word.

    Returns:
        list[str]: Chunked text segments; empty when ``text`` holds no
        non-whitespace characters.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    overlap = max(overlap, 0)

    normalized = re.sub(r"\s+", " ", text).strip()
    if not normalized:
        return []

    # Split after sentence-ending punctuation, keeping the punctuation.
    sentences = re.split(r"(?<=[.!?])\s+", normalized)

    chunks = []
    current = ""
    for sentence in sentences:
        for piece in _split_oversized(sentence, chunk_size):
            candidate = f"{current} {piece}" if current else piece
            if len(candidate) <= chunk_size:
                current = candidate
                continue

            # ``current`` is non-empty here: a lone piece always fits.
            chunks.append(current)

            # Carry over only as much overlap as still leaves room for the
            # piece plus the joining space.
            room = min(overlap, chunk_size - len(piece) - 1)
            tail = current[-room:].strip() if room > 0 else ""
            current = f"{tail} {piece}" if tail else piece

    if current:
        chunks.append(current)

    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Small manual demo: chunk a four-sentence sample and print each chunk
    # together with its length.
    sample_text = """
    Artificial Intelligence is transforming industries.
    Machine learning is a key subfield, driving automation and predictive analytics.
    Neural networks power most modern AI applications today.
    This technology is reshaping healthcare, finance, and manufacturing.
    """
    chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
    # The status marker was mis-encoded and had split this f-string across
    # two lines; restored to a single-line literal.
    print(f"✅ Chunks created: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f"\n--- Chunk {i} ({len(c)} chars) ---\n{c}")
|
|
|