Web_Scrapping_Agent / agent /chunker.py
ShivanshCodex's picture
Upload 47 files
f085180 verified
# agent/chunker.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict
class TextChunker:
def __init__(self, chunk_size: int = 2048, chunk_overlap: int = 512):
"""
Initialize text chunker with recursive splitting strategy.
"""
self.splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)
def chunk_text(self, content: str, metadata: Dict[str, str]) -> List[Dict[str, str]]:
"""
Split the text into clean chunks and attach metadata for vector storage.
Args:
content (str): Raw web page text.
metadata (Dict[str, str]): Metadata like URL, title, etc.
Returns:
List[Dict[str, str]]: List of chunks with content + metadata
"""
try:
chunks = self.splitter.split_text(content)
return [{"content": chunk, "metadata": metadata} for chunk in chunks]
except Exception as e:
import logging
logging.error(f"[Chunker] Failed to split content: {e}")
return []