carolinacon commited on
Commit
2e8bb22
·
1 Parent(s): e511adb

Chunk oversized tavily extracts

Browse files
config/prompts.yaml CHANGED
@@ -57,6 +57,10 @@ prompts:
57
  * **Action:** Use Tavily Web Crawl on the URL of a leading renewable energy industry website, setting `max_depth` to 2.
58
  * **Observation:** Gathered extensive content from multiple articles linked on the site, highlighting new technologies and innovations.
59
  * **Final Answer:** Provide a synthesized summary of findings with citations.
 
 
 
 
60
  type: base_system
61
  variables: ["summary"]
62
  version: 1.0
@@ -79,7 +83,7 @@ prompts:
79
 
80
 
81
  Extend the summary by taking into account the new messages above.
82
- Try to follow this guideline. If the message consists in a tool call add a new bullet point and specify the tool and its action.
83
  If the message consists in a tool call result append a summary of the result to the appropriate bullet point.
84
  After analyzing the tool call result, specify if this has been useful or not.
85
  type: memory_optimization
 
57
  * **Action:** Use Tavily Web Crawl on the URL of a leading renewable energy industry website, setting `max_depth` to 2.
58
  * **Observation:** Gathered extensive content from multiple articles linked on the site, highlighting new technologies and innovations.
59
  * **Final Answer:** Provide a synthesized summary of findings with citations.
60
+
61
+ If the value of chunked_last_tool_call is true, this means that the last tool execution returns a result formed from the concatenation
62
+ of multiple chunks.
63
+ Current value of the chunked_last_tool_call is {{chunked_last_tool_call}}
64
  type: base_system
65
  variables: ["summary"]
66
  version: 1.0
 
83
 
84
 
85
  Extend the summary by taking into account the new messages above.
86
+ Try to follow this guideline. If the message consists of a tool call, add a new bullet point and specify the tool name, its main parameter values, and its action.
87
  If the message consists in a tool call result append a summary of the result to the appropriate bullet point.
88
  After analyzing the tool call result, specify if this has been useful or not.
89
  type: memory_optimization
config/settings.py CHANGED
@@ -12,6 +12,9 @@ class AgentConfig:
12
  # LLM Configuration
13
  self.MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4.1")
14
 
 
 
 
15
 
16
  # File Paths
17
  self.PROJECT_ROOT = Path(__file__).parent.parent
 
12
  # LLM Configuration
13
  self.MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4.1")
14
 
15
+ #Sizing limitations
16
+ self.MAX_CONTEXT_TOKENS = 20000
17
+
18
 
19
  # File Paths
20
  self.PROJECT_ROOT = Path(__file__).parent.parent
core/state.py CHANGED
@@ -5,3 +5,4 @@ class State(MessagesState):
5
  summary: str
6
  question: str
7
  attachment: str
 
 
5
  summary: str
6
  question: str
7
  attachment: str
8
+ chunked_last_tool_call: bool
nodes/chunking_node.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
3
+ from langchain_core.messages.base import BaseMessage
4
+ from langchain_core.messages import ToolMessage
5
+ from langchain_community.embeddings import OpenAIEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+
8
+ from config.settings import config
9
+ import json
10
+ import tiktoken
11
+
12
+
13
def parse_mark_down(data: str) -> list:
    """Split *data* on Markdown H1/H2 headings.

    Returns a list of langchain Document chunks, one per heading
    section, with the matched heading titles stored in each chunk's
    metadata.
    """
    # Only the two top heading levels delimit chunks; deeper headings
    # remain inside their parent section.
    heading_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]

    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=heading_levels)
    return splitter.split_text(data)
22
+
23
+
24
class OversizedContentHandler:
    """Main handler for content that exceeds context limits.

    Oversized ``tavily_extract`` tool results are split into chunks,
    embedded, and reduced in place to the few chunks most relevant to
    the current question.
    """

    def __init__(self,
                 model_name: str = "gpt-4.1",
                 max_context_tokens: int = 8000,
                 reserved_tokens: int = 2000):
        # tiktoken encoding matching the target model, used for sizing.
        self.encoding = tiktoken.encoding_for_model(model_name)
        self.max_context_tokens = max_context_tokens
        self.reserved_tokens = reserved_tokens
        # Budget left for actual content once the reserve is set aside.
        self.max_chunk_tokens = max_context_tokens - reserved_tokens

    def count_tokens(self, text: str) -> int:
        """Return the number of model tokens in *text*."""
        return len(self.encoding.encode(text))

    def extract_relevant_chunks(self, content: str, query: str) -> list:
        """Split *content*, index it, and return the chunks closest to *query*.

        Returns a list of ``{"text": ..., "source": ...}`` dicts for the
        top-3 chunks most similar to *query*.
        """
        # Try to check if the content can be parsed with a Markdown parser
        md_chunks = parse_mark_down(content)
        # Further split large chunks.
        # NOTE(review): chunk_size is measured in characters, not tokens,
        # and ignores self.max_chunk_tokens — confirm the intended budget.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=15000, chunk_overlap=500)
        final_chunks = text_splitter.split_documents(md_chunks)

        embeddings = OpenAIEmbeddings()
        vector_db = FAISS.from_documents(final_chunks, embeddings)

        relevant_chunks = vector_db.similarity_search(query, k=3)
        # Concatenate relevant chunks and hand them back with metadata.
        # NOTE(review): MarkdownHeaderTextSplitter stores header titles in
        # metadata, not "source", so "source" is likely always None — verify.
        context_with_metadata = [
            {"text": doc.page_content, "source": doc.metadata.get("source")}
            for doc in relevant_chunks
        ]
        return context_with_metadata

    def process_oversized_message(self, message: BaseMessage, query: str) -> bool:
        """Chunk *message* in place when it is an oversized tavily_extract result.

        Returns True when the message's raw content was replaced by its
        most relevant chunks, False otherwise (including when the payload
        is malformed or empty).
        """
        chunked = False
        # At this point we are chunking only tavily_extract results messages
        if isinstance(message, ToolMessage) and message.name == "tavily_extract":
            # Be defensive: a non-JSON payload, an empty results list, or a
            # missing/None raw_content is left untouched rather than raising
            # in the middle of the graph run.
            try:
                json_content = json.loads(message.content)
                result = json_content['results'][0]
                raw_content = result['raw_content']
            except (json.JSONDecodeError, KeyError, IndexError, TypeError):
                return False
            if not raw_content:
                return False

            content_size = self.count_tokens(raw_content)
            # NOTE(review): the threshold comes from the global config, not
            # from the max_context_tokens passed to __init__ — confirm which
            # limit is intended.
            if content_size > config.MAX_CONTEXT_TOKENS:
                print(f"Proceed with chunking, evaluated no of tokens {content_size} for message {message.id}")
                chunked = True
                result['raw_content'] = self.extract_relevant_chunks(raw_content, query=query)
                message.content = json.dumps(json_content)
        return chunked
nodes/nodes.py CHANGED
@@ -1,9 +1,10 @@
1
- from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, RemoveMessage
2
  from langchain_openai import ChatOpenAI
3
 
4
  from core.state import State
5
  import time
6
 
 
7
  from tools.tavily_tools import llm_tools
8
  from utils.prompt_manager import prompt_mgmt
9
 
@@ -26,7 +27,7 @@ def orchestrator(state: State):
26
  messages = [HumanMessage(content=message)]
27
  response = response_processing_model.invoke(messages)
28
  if response.content == "YES":
29
- return {"question": question, "attachment": "true", "messages":[response]}
30
  return {"question": question}
31
 
32
 
@@ -41,12 +42,14 @@ def assistant(state: State):
41
  if not question:
42
  question = state["messages"][0].content
43
 
44
- sys_msg = SystemMessage(content=prompt_mgmt.render_template("base_system", {"summary": summary}))
 
45
  try:
46
  response = model.invoke([sys_msg] + state["messages"])
47
  except Exception as e:
48
  if "429" in str(e):
49
  time.sleep(5)
 
50
  response = model.invoke([sys_msg] + state["messages"])
51
  return {"messages": [response]}
52
  raise
@@ -77,9 +80,15 @@ def optimize_memory(state: State):
77
  summary_message = "Create a summary of the conversation above:"
78
 
79
  # Add prompt to our history
80
- messages = state["messages"] + [HumanMessage(content=summary_message)]
81
  response = model.invoke(messages)
82
 
 
83
  # Delete all but the 2 most recent messages and the first one
84
- delete_messages = [RemoveMessage(id=m.id) for m in state["messages"][:-2]]
85
- return {"summary": response.content, "messages": delete_messages}
 
 
 
 
 
 
1
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, RemoveMessage, ToolMessage
2
  from langchain_openai import ChatOpenAI
3
 
4
  from core.state import State
5
  import time
6
 
7
+ from nodes.chunking_node import OversizedContentHandler
8
  from tools.tavily_tools import llm_tools
9
  from utils.prompt_manager import prompt_mgmt
10
 
 
27
  messages = [HumanMessage(content=message)]
28
  response = response_processing_model.invoke(messages)
29
  if response.content == "YES":
30
+ return {"question": question, "attachment": "true", "messages": [response]}
31
  return {"question": question}
32
 
33
 
 
42
  if not question:
43
  question = state["messages"][0].content
44
 
45
+ prompt_params = {"summary": summary, "chunked_last_tool_call": state.get("chunked_last_tool_call", False)}
46
+ sys_msg = SystemMessage(content=prompt_mgmt.render_template("base_system", prompt_params))
47
  try:
48
  response = model.invoke([sys_msg] + state["messages"])
49
  except Exception as e:
50
  if "429" in str(e):
51
  time.sleep(5)
52
+ print("Retrying after receiving 429 error")
53
  response = model.invoke([sys_msg] + state["messages"])
54
  return {"messages": [response]}
55
  raise
 
80
  summary_message = "Create a summary of the conversation above:"
81
 
82
  # Add prompt to our history
83
+ messages = state["messages"][:-2] + [HumanMessage(content=summary_message)]
84
  response = model.invoke(messages)
85
 
86
+ print("&&&" * 50, state["messages"][-1].type)
87
  # Delete all but the 2 most recent messages and the first one
88
+ remaining_messages = [RemoveMessage(id=m.id) for m in state["messages"][:-2]]
89
+
90
+ # If the last message returned from a tool is oversized, chunk it and retrieve only the relevant chunks
91
+ content_handler = OversizedContentHandler()
92
+ chunked = content_handler.process_oversized_message(state["messages"][-1], state.get("question"))
93
+
94
+ return {"summary": response.content, "messages": remaining_messages, "chunked_last_tool_call": chunked}
requirements.txt CHANGED
@@ -3,4 +3,6 @@ requests
3
  langchain_openai
4
  langchain_core
5
  langgraph
6
- langchain-tavily
 
 
 
3
  langchain_openai
4
  langchain_core
5
  langgraph
6
+ langchain-tavily
7
+ langchain-community
8
+ faiss-cpu
tools/tavily_tools.py CHANGED
@@ -6,6 +6,8 @@ from langchain_tavily import TavilyCrawl
6
  tavily_search_tool = TavilySearch(
7
  max_results=10,
8
  topic="general",
 
 
9
  )
10
 
11
  # Define the LangChain extract tool
 
6
  tavily_search_tool = TavilySearch(
7
  max_results=10,
8
  topic="general",
9
+ # Make sure to avoid retrieving the response from a dataset or a space
10
+ exclude_domains =["https://huggingface.co/datasets", "https://huggingface.co/spaces"]
11
  )
12
 
13
  # Define the LangChain extract tool