Spaces:

Maitreyee22
/

Patent-Summarizer

No application file

File size: 1,497 Bytes

ea68259

import io
from typing import List
from PyPDF2 import PdfReader


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extracts text from a PDF file.

    Args:
        file_bytes: Raw bytes of the uploaded PDF.

    Returns:
        A single string containing all extracted text.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    text = ""
    for page in reader.pages:
        extracted = page.extract_text()
        if extracted:
            text += extracted + "\n"
    return text.strip()


def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str]:
    """
    Splits a long text into overlapping chunks to fit model context and
    make retrieval easier for RAG-style Q&A.

    Args:
        text: Full document text.
        max_chars: Maximum characters per chunk.
        overlap: Overlap between consecutive chunks.

    Returns:
        List of text chunks.
    """
    chunks: List[str] = []
    start = 0
    length = len(text)

    while start < length:
        end = min(start + max_chars, length)
        chunk = text[start:end]

        # Try to end at a sentence boundary when possible
        last_period = chunk.rfind(".")
        if last_period != -1 and end != length:
            end = start + last_period + 1
            chunk = text[start:end]

        chunk = chunk.strip()
        if chunk:
            chunks.append(chunk)

        # Move window forward with overlap
        start = max(0, end - overlap)

    return chunks