# Final_Assignment_Template/nodes/chunking_handler.py
import json
import tiktoken
from langchain_community.vectorstores import FAISS
from langchain_core.messages import ToolMessage
from langchain_core.messages.base import BaseMessage
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)
from config.settings import config
def parse_mark_down(data: str) -> list:
    """Split Markdown text into sections on H1/H2 headers.

    The matched headers are recorded in each chunk's metadata.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    return markdown_splitter.split_text(data)
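
# Example (hypothetical input, for illustration only): the splitter yields one
# Document per section, with the matched headers recorded in metadata. Roughly:
#   parse_mark_down("# Title\nintro\n## Part A\nbody")
#   -> [Document(page_content="intro", metadata={"Header 1": "Title"}),
#       Document(page_content="body", metadata={"Header 1": "Title", "Header 2": "Part A"})]
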
class OversizedContentHandler:
    """Main handler for content that exceeds context limits"""

    def __init__(self,
                 model_name: str = "gpt-4.1",
                 max_context_tokens: int = 8000,
                 reserved_tokens: int = 2000):
        try:
            self.encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Older tiktoken releases may not know the model name yet;
            # fall back to a general-purpose encoding.
            self.encoding = tiktoken.get_encoding("o200k_base")
        self.max_context_tokens = max_context_tokens
        self.reserved_tokens = reserved_tokens
        # Token budget left for content after reserving room for the reply
        self.max_chunk_tokens = max_context_tokens - reserved_tokens
    def count_tokens(self, text: str) -> int:
        """Return the number of tokens `text` occupies under the model's encoding."""
        return len(self.encoding.encode(text))
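    # For illustration: count_tokens("hello world") is typically 2, since
    # "hello" and " world" each map to one token in common OpenAI encodings;
    # exact counts depend on the active encoding.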
    def extract_relevant_chunks(self, content: str, query: str):
        """Split `content` into chunks and return the ones most relevant to `query`."""
        # Split on Markdown headers first so chunks follow the document structure
        md_chunks = parse_mark_down(content)
        # Further split large sections (sizes are in characters, not tokens)
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=15000, chunk_overlap=500)
        final_chunks = text_splitter.split_documents(md_chunks)
        # Embed the chunks and retrieve the three most similar to the query
        embeddings = OpenAIEmbeddings()
        vector_db = FAISS.from_documents(final_chunks, embeddings)
        relevant_chunks = vector_db.similarity_search(query, k=3)
        # Return the chunk text together with its header metadata, so the
        # caller can see which section each chunk came from. (The splitter
        # never sets a "source" key, so looking one up would always be None.)
        context_with_metadata = [
            {"text": doc.page_content, "source": doc.metadata}
            for doc in relevant_chunks
        ]
        return context_with_metadata
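    # Illustrative return value (shape only, not real data):
    #   [{"text": "...section text...", "source": {"Header 1": "Intro"}}, ...]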
    def process_oversized_message(self, message: BaseMessage, query: str) -> bool:
        chunked = False
        # For now, only tavily_extract tool results are chunked
        json_content = None
        if isinstance(message, ToolMessage) and message.name == "tavily_extract":
            try:
                json_content = json.loads(message.content)
            except (json.JSONDecodeError, TypeError):
                print(f"Cannot parse content of message {message.id} as JSON")
        if json_content:
            # Guard against an empty or missing results list
            results = json_content.get('results') or []
            if results:
                result = results[0]
                raw_content = result.get('raw_content') or ''
                content_size = self.count_tokens(raw_content)
                if content_size > config.max_tokens:
                    print(f"Chunking message {message.id}: {content_size} tokens exceeds the {config.max_tokens}-token limit")
                    chunked = True
                    result['raw_content'] = self.extract_relevant_chunks(raw_content, query=query)
                    message.content = json.dumps(json_content)
        return chunked
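

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline: builds a hypothetical
    # tavily_extract ToolMessage and runs it through the handler. Requires
    # OPENAI_API_KEY for the embedding calls; the payload and query below
    # are made up for illustration.
    handler = OversizedContentHandler()
    demo_payload = {"results": [{"raw_content": "# Guide\nintro\n## Setup\n" + "step " * 10000}]}
    demo_message = ToolMessage(
        name="tavily_extract",
        tool_call_id="demo-call-0",  # placeholder id
        content=json.dumps(demo_payload),
    )
    if handler.process_oversized_message(demo_message, query="How do I set this up?"):
        print("Message content was replaced with the most relevant chunks")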