Spaces:

Maitreyee22
/

Patent-Summarizer

No application file

Patent-Summarizer / services /document_loader.py

Upload 6 files

ea68259 verified about 1 month ago

1.5 kB

	import io
	from typing import List
	from PyPDF2 import PdfReader


	def extract_text_from_pdf(file_bytes: bytes) -> str:
	    """
	    Read a PDF held in memory and return the text of all its pages.

	    Args:
	        file_bytes: Raw bytes of the uploaded PDF.

	    Returns:
	        The text of every page that yielded any text, each followed by a
	        newline, joined in page order and stripped of leading/trailing
	        whitespace. Pages whose extraction result is empty are skipped.
	    """
	    pdf = PdfReader(io.BytesIO(file_bytes))
	    # Lazily extract page by page, in document order.
	    page_texts = (page.extract_text() for page in pdf.pages)
	    combined = "".join(body + "\n" for body in page_texts if body)
	    return combined.strip()


	def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str]:
	    """
	    Split a long text into overlapping chunks to fit model context and
	    make retrieval easier for RAG-style Q&A.

	    Each window spans at most ``max_chars`` characters. Every window except
	    the last is cut just after the last period it contains (when there is
	    one), so chunks tend to end on a sentence boundary and may be shorter
	    than ``max_chars``. The next window starts ``overlap`` characters before
	    the end of the previous one.

	    Args:
	        text: Full document text.
	        max_chars: Maximum characters per chunk. Must be positive.
	        overlap: Number of characters shared between consecutive chunks.
	            If stepping back by ``overlap`` would not move the window
	            forward (e.g. ``overlap >= max_chars`` or a very short
	            sentence), the next chunk starts right where the previous one
	            ended instead.

	    Returns:
	        List of whitespace-stripped, non-empty text chunks in document order.

	    Raises:
	        ValueError: If ``max_chars`` is not positive.
	    """
	    if max_chars <= 0:
	        raise ValueError("max_chars must be a positive integer")

	    chunks: List[str] = []
	    start = 0
	    length = len(text)

	    while start < length:
	        end = min(start + max_chars, length)

	        # Try to end at a sentence boundary when possible; the final window
	        # always runs to the end of the text so no trailing content is lost.
	        if end < length:
	            last_period = text.rfind(".", start, end)
	            if last_period != -1:
	                end = last_period + 1

	        chunk = text[start:end].strip()
	        if chunk:
	            chunks.append(chunk)

	        # The final window has been emitted; stepping back by `overlap`
	        # here would re-emit the tail of the text forever.
	        if end >= length:
	            break

	        # Move window forward with overlap, but always make progress: if the
	        # overlap would put us at or before the current start, continue
	        # from the end of this chunk without overlap.
	        next_start = end - overlap
	        start = next_start if next_start > start else end

	    return chunks