Spaces:

BrejBala
/

rag-agent-workbench-api

Sleeping

App Files Files Community

rag-agent-workbench-api / backend /app /services /chunking.py

BrejBala

Deploy backend Docker app

e63c592 about 1 month ago

raw

history blame contribute delete

2.84 kB

	from typing import Any, Dict, List, Sequence

	from langchain_core.documents import Document
	from langchain_text_splitters import RecursiveCharacterTextSplitter

	from app.core.config import get_settings


	def chunk_document(
	    document: Document, chunk_size: int = 900, chunk_overlap: int = 120
	) -> List[Document]:
	    """Split one LangChain document into a list of smaller documents.

	    Args:
	        document: The document to split.
	        chunk_size: Passed to the splitter as ``chunk_size``.
	        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

	    Returns:
	        The documents produced by ``RecursiveCharacterTextSplitter`` for
	        the single input document.
	    """
	    return RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    ).split_documents([document])


	# Upper bound on the number of characters kept for a single chunk's text.
	# documents_to_records truncates any longer chunk text to this length as a
	# safety measure before the text is placed in a record.
	MAX_CHARS_PER_CHUNK = 6000


	def documents_to_records(
	    documents: Sequence[Document],
	) -> List[Dict[str, Any]]:
	    """Convert documents into Pinecone records with chunking applied.

	    Each input document is expected to have at least the following metadata:
	    - doc_id
	    - source
	    - title
	    - url (optional)
	    - published (optional)

	    Documents whose ``doc_id`` or ``source`` is missing or empty are skipped.

	    Output records follow the schema (logical representation):

	    {
	        "_id": "<doc_id>:<chunk_index>",
	        "<text_field>": "<chunk>",  # PINECONE_TEXT_FIELD (default: 'chunk_text')
	        "title": "...",
	        "source": "...",
	        "url": "...",
	        "published": "...",
	        "doc_id": "...",
	        "chunk_id": <int>,
	        ... additional metadata fields ...
	    }

	    Additional metadata fields are copied through as-is, but they never
	    replace the structural fields listed above: if a document's metadata
	    happens to contain a key such as ``_id``, ``chunk_id`` or the configured
	    text field, the value computed here takes precedence.

	    Args:
	        documents: Documents to chunk and convert.

	    Returns:
	        One record per chunk, in input order (documents first, then chunk
	        index within each document).
	    """
	    settings = get_settings()
	    text_field = settings.PINECONE_TEXT_FIELD
	    # Metadata keys that are mapped explicitly onto record fields below and
	    # therefore must not be copied again as "additional" metadata.
	    core_keys = {"doc_id", "source", "title", "url", "published"}

	    records: List[Dict[str, Any]] = []
	    for document in documents:
	        metadata = document.metadata or {}
	        doc_id = metadata.get("doc_id")
	        source = metadata.get("source")

	        if not doc_id or not source:
	            # Skip documents missing essential metadata
	            continue

	        title = metadata.get("title", "")
	        url = metadata.get("url", "")
	        published = metadata.get("published", "")

	        extra_metadata: Dict[str, Any] = {
	            key: value for key, value in metadata.items() if key not in core_keys
	        }

	        for idx, chunk in enumerate(chunk_document(document)):
	            # Safety truncation for integrated embedding models like
	            # llama-text-embed-v2; slicing is a no-op for short chunks.
	            chunk_text = (chunk.page_content or "")[:MAX_CHARS_PER_CHUNK]

	            record: Dict[str, Any] = {
	                "_id": f"{doc_id}:{idx}",
	                text_field: chunk_text,
	                "title": title,
	                "source": source,
	                "url": url,
	                "published": published,
	                "doc_id": doc_id,
	                "chunk_id": idx,
	            }
	            # Attach additional metadata fields. setdefault keeps the
	            # structural fields above authoritative, so stray metadata keys
	            # (e.g. "_id" or the text field) cannot clobber the record id or
	            # the chunk content.
	            for key, value in extra_metadata.items():
	                record.setdefault(key, value)
	            records.append(record)

	    return records