Charles Grandjean committed on
Commit
9ce8464
·
1 Parent(s): bd87ed7

revamp analysis of docs

Browse files
agent_api.py CHANGED
@@ -37,6 +37,7 @@ from agents.doc_assistant import DocAssistant
37
  from langchain_openai import ChatOpenAI
38
  from langchain_xai import ChatXAI
39
  from langchain_google_genai import ChatGoogleGenerativeAI
 
40
  from mistralai import Mistral
41
  import logging
42
  import traceback
@@ -83,21 +84,20 @@ class LLMConfig:
83
  "X-Cerebras-3rd-Party-Integration": "langgraph"
84
  }
85
  ))
86
-
87
- self.llm = NormalizedLLM(ChatOpenAI(
88
- model=os.getenv("OPENROUTER_MODEL"),
89
  api_key=os.getenv("OPENROUTER_API_KEY"),
90
  base_url=os.getenv("OPENROUTER_URL"),
 
 
 
91
  ))
92
  self.llm = NormalizedLLM(ChatGoogleGenerativeAI(
93
  model=os.getenv("GEMINI_TOOL_MODEL", "gemini-3-flash-preview"),
94
  api_key=os.getenv("GOOGLE_API_KEY"),
95
  thinking_level="medium"
96
  ))
97
- # self.llm = NormalizedLLM(ChatXAI(
98
- # model=os.getenv("XAI_TOOL_MODEL"),
99
- # ))
100
-
101
  # logger.info("✅ LLMConfig initialized with NormalizedLLM wrapper:")
102
  # logger.info(f" - OpenAI LLM: {os.getenv('LLM_MODEL', 'gpt-5-nano-2025-08-07')}")
103
  # logger.info(f" - Gemini LLM: {os.getenv('GEMINI_TOOL_MODEL', 'gemini-3-flash-preview')} (for tool calling)")
@@ -154,7 +154,7 @@ class CyberLegalAPI:
154
 
155
  self.agent_client = CyberLegalAgent(llm=self.llm_config.slm, tools=tools.tools_for_client,tools_facade=tools.tools_for_client_facade)
156
  self.agent_lawyer = CyberLegalAgent(llm=self.llm_config.slm, tools=tools.tools_for_lawyer,tools_facade=tools.tools_for_lawyer_facade)
157
- self.pdf_analyzer = PDFAnalyzerAgent(llm=self.llm_config.slm, mistral_client=mistral_client)
158
  # Initialize doc_editor with tools
159
  self.doc_editor = DocumentEditorAgent(
160
  llm=self.llm_config.slm,
@@ -193,13 +193,13 @@ class CyberLegalAPI:
193
  elif node.type == "file" and node.analysis:
194
  analysis_parts = []
195
  if node.analysis.summary:
196
- summary_preview = node.analysis.summary[:100] + "..." if len(node.analysis.summary) > 100 else node.analysis.summary
197
  analysis_parts.append(f"summary: {summary_preview}")
198
  if node.analysis.actors:
199
- actors_preview = node.analysis.actors[:100] + "..." if len(node.analysis.actors) > 100 else node.analysis.actors
200
  analysis_parts.append(f"actors: {actors_preview}")
201
  if node.analysis.key_details:
202
- details_preview = node.analysis.key_details[:100] + "..." if len(node.analysis.key_details) > 100 else node.analysis.key_details
203
  analysis_parts.append(f"key_details: {details_preview}")
204
 
205
  analysis_text = " | ".join(analysis_parts) if analysis_parts else "No analysis available"
 
37
  from langchain_openai import ChatOpenAI
38
  from langchain_xai import ChatXAI
39
  from langchain_google_genai import ChatGoogleGenerativeAI
40
+ from langchain_openrouter import ChatOpenRouter
41
  from mistralai import Mistral
42
  import logging
43
  import traceback
 
84
  "X-Cerebras-3rd-Party-Integration": "langgraph"
85
  }
86
  ))
87
+ self.utils_llm = NormalizedLLM(ChatOpenRouter(
88
+ model=os.getenv("OPENROUTER_MAIN_MODEL"),
 
89
  api_key=os.getenv("OPENROUTER_API_KEY"),
90
  base_url=os.getenv("OPENROUTER_URL"),
91
+ extra_body={
92
+ "models": json.loads(os.getenv("OPENROUTER_MODELS", "[]"))
93
+ },
94
  ))
95
  self.llm = NormalizedLLM(ChatGoogleGenerativeAI(
96
  model=os.getenv("GEMINI_TOOL_MODEL", "gemini-3-flash-preview"),
97
  api_key=os.getenv("GOOGLE_API_KEY"),
98
  thinking_level="medium"
99
  ))
100
+
 
 
 
101
  # logger.info("✅ LLMConfig initialized with NormalizedLLM wrapper:")
102
  # logger.info(f" - OpenAI LLM: {os.getenv('LLM_MODEL', 'gpt-5-nano-2025-08-07')}")
103
  # logger.info(f" - Gemini LLM: {os.getenv('GEMINI_TOOL_MODEL', 'gemini-3-flash-preview')} (for tool calling)")
 
154
 
155
  self.agent_client = CyberLegalAgent(llm=self.llm_config.slm, tools=tools.tools_for_client,tools_facade=tools.tools_for_client_facade)
156
  self.agent_lawyer = CyberLegalAgent(llm=self.llm_config.slm, tools=tools.tools_for_lawyer,tools_facade=tools.tools_for_lawyer_facade)
157
+ self.pdf_analyzer = PDFAnalyzerAgent(llm=self.llm_config.utils_llm, mistral_client=mistral_client)
158
  # Initialize doc_editor with tools
159
  self.doc_editor = DocumentEditorAgent(
160
  llm=self.llm_config.slm,
 
193
  elif node.type == "file" and node.analysis:
194
  analysis_parts = []
195
  if node.analysis.summary:
196
+ summary_preview = node.analysis.summary
197
  analysis_parts.append(f"summary: {summary_preview}")
198
  if node.analysis.actors:
199
+ actors_preview = node.analysis.actors
200
  analysis_parts.append(f"actors: {actors_preview}")
201
  if node.analysis.key_details:
202
+ details_preview = node.analysis.key_details
203
  analysis_parts.append(f"key_details: {details_preview}")
204
 
205
  analysis_text = " | ".join(analysis_parts) if analysis_parts else "No analysis available"
agent_states/actors_merger.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import TypedDict, List, Dict, Any
3
+
4
+
5
class ActorsMergerState(TypedDict):
    """State dictionary for the actors-merging workflow."""

    # Actors extracted from the document currently being processed.
    new_extractions: List[Dict[str, Any]]
    # Actors already known before this run.
    existing_actors: List[Dict[str, Any]]
    # Conversation history exchanged with the model.
    messages: List[Any]
    # Flag signalling that the merge is finished.
    completed: bool
agent_states/pdf_analyzer_state.py CHANGED
@@ -3,33 +3,26 @@
3
  State management for PDF Analysis Agent
4
  """
5
 
6
- from typing import TypedDict, Optional, List
7
- from langchain_core.messages import BaseMessage
8
 
9
 
10
  class PDFAnalyzerState(TypedDict):
11
- """
12
- State definition for the PDF Analysis Agent workflow
13
- """
14
  # Input
15
  pdf_path: str
16
- pdf_content: Optional[str]
17
-
18
- # Extraction results
19
  extracted_text: Optional[str]
20
-
21
- # OCR tracking
22
- needs_ocr: bool # True if PDF is scanned/image-based
23
- ocr_performed: bool # True if OCR was used
24
- ocr_method: Optional[str] # "mistral" or None
25
-
26
  # Analysis results
27
  summary: Optional[str]
28
  actors: Optional[str]
29
  key_details: Optional[str]
30
-
31
- intermediate_steps: List[BaseMessage]
32
-
33
  # Metadata
34
  document_type: Optional[str]
35
- processing_status: str # "pending", "extracting", "analyzing", "complete"
 
3
  State management for PDF Analysis Agent
4
  """
5
 
6
+ from typing import TypedDict, Optional
 
7
 
8
 
9
class PDFAnalyzerState(TypedDict):
    """State dictionary shared by the steps of the PDF analysis workflow."""

    # --- Input ---
    pdf_path: str

    # --- Extraction ---
    extracted_text: Optional[str]

    # --- OCR ---
    needs_ocr: bool
    ocr_performed: bool
    ocr_method: Optional[str]

    # --- Analysis results ---
    summary: Optional[str]
    actors: Optional[str]
    key_details: Optional[str]

    # --- Metadata ---
    document_type: Optional[str]
    error: Optional[str]
agents/actors_merger.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ import logging
4
+ from typing import TypedDict, List, Dict, Any
5
+ from agent_states.actors_merger_state import ActorResolutionState
6
+ from langgraph.graph import StateGraph, END
7
+ from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
8
+ from prompts.actors_merger import SYSTEM_PROMPT,ACTOR_MERGER_PROMPT
9
+ from utils.tools import tools_for_actors_merger, tools_for_actors_merger_facade
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class ActorsMergerAgent:
    """LangGraph agent that merges newly extracted actors into an existing list.

    The graph alternates between two nodes:

    * ``reason``    - one LLM call; the model is bound with
      ``tool_choice="any"`` in ``__init__``.
    * ``run_tools`` - applies the requested tool calls to the state.

    The loop ends once the model calls
    ``attempt_completion_actor_resolution``, which sets ``state["completed"]``.
    """

    # NOTE(review): this module imports ``ActorResolutionState`` from
    # ``agent_states.actors_merger_state``, while the state file added next to
    # it is ``agent_states/actors_merger.py`` and defines ``ActorsMergerState``.
    # Confirm that the imported module/name really exist. The annotations below
    # are written as strings so they are not evaluated at class-definition time.

    def __init__(self, llm):
        """Bind the facade tools to ``llm`` and compile the workflow graph.

        Args:
            llm: Chat model exposing ``bind_tools``; the bound model must
                support ``ainvoke``.
        """
        self.llm = llm.bind_tools(tools_for_actors_merger_facade, tool_choice="any")
        self.workflow = self._build_workflow()

    def _build_workflow(self):
        """Build and compile the ``reason`` <-> ``run_tools`` loop."""
        workflow = StateGraph(ActorResolutionState)
        workflow.add_node("reason", self._reason)
        workflow.add_node("run_tools", self._run_tools)

        workflow.set_entry_point("reason")
        workflow.add_edge("reason", "run_tools")

        workflow.add_conditional_edges(
            "run_tools",
            self._should_continue,
            {
                "continue": "reason",
                "done": END,
            },
        )

        return workflow.compile()

    def _should_continue(self, state: "ActorResolutionState") -> str:
        """Return ``"done"`` once completion was signalled, else ``"continue"``."""
        return "done" if state["completed"] else "continue"

    @staticmethod
    def _format_actor_payload(
        new_extractions: List[Dict[str, Any]],
        existing_actors: List[Dict[str, Any]],
    ) -> str:
        """Render both actor lists as the JSON section of the first user message.

        ``ensure_ascii=False`` keeps accented names readable for the model,
        matching how tool results are serialised in ``_run_tools``.
        """
        new_json = json.dumps(new_extractions, indent=2, ensure_ascii=False)
        existing_json = json.dumps(existing_actors, indent=2, ensure_ascii=False)
        return f"NEW_ACTORS:\n{new_json}\n\nEXISTING_ACTORS:\n{existing_json}\n"

    @staticmethod
    def _apply_modifications(
        actors: List[Dict[str, Any]],
        modifications: List[Dict[str, Any]],
    ) -> int:
        """Apply ``modify_actors`` requests to ``actors`` in place.

        Each modification targets the first actor whose ``name`` equals its
        ``target_name``. Aliases are merged without duplicates, keeping order;
        ``name``, ``description`` and ``implication`` are overwritten when a
        non-empty value is supplied.

        Returns:
            The number of modifications that matched an actor.
        """
        updated = 0
        for mod in modifications:
            target_name = mod.get("target_name")
            if not target_name:
                continue

            for actor in actors:
                if actor.get("name") != target_name:
                    continue

                if mod.get("name"):
                    actor["name"] = mod["name"]

                if mod.get("aliases"):
                    # dict.fromkeys de-duplicates while preserving order.
                    actor["aliases"] = list(dict.fromkeys(
                        (actor.get("aliases") or []) + (mod.get("aliases") or [])
                    ))

                for field in ("description", "implication"):
                    if mod.get(field):
                        actor[field] = mod[field]

                updated += 1
                break  # only the first matching actor is updated

        return updated

    async def _reason(self, state: "ActorResolutionState") -> "ActorResolutionState":
        """Ask the model for its next tool call(s).

        On the first pass the conversation is seeded with the system prompt
        and a user message containing both actor lists, read from ``state``.
        """
        if not state["messages"]:
            payload = self._format_actor_payload(
                state["new_extractions"],
                state["existing_actors"],
            )
            state["messages"] = [
                SystemMessage(content=SYSTEM_PROMPT),
                HumanMessage(content=f"\n{ACTOR_MERGER_PROMPT}\n\n{payload}"),
            ]

        response = await self.llm.ainvoke(state["messages"])
        state["messages"].append(response)
        return state

    async def _run_tools(self, state: "ActorResolutionState") -> "ActorResolutionState":
        """Execute the tool calls of the last model message against the state.

        One ``ToolMessage`` is appended per tool call, carrying a JSON result.
        Unknown tool names are reported back with ``ok: False``.
        """
        last_message = state["messages"][-1]
        tool_calls = getattr(last_message, "tool_calls", []) or []

        for tool_call in tool_calls:
            name = tool_call["name"]
            args = tool_call.get("args") or {}

            if name == "add_actors":
                actors = args.get("actors") or []
                state["existing_actors"].extend(actors)
                result = {"ok": True, "added": len(actors)}

            elif name == "modify_actors":
                updated = self._apply_modifications(
                    state["existing_actors"],
                    args.get("modifications") or [],
                )
                result = {"ok": True, "updated": updated}

            elif name == "attempt_completion_actor_resolution":
                state["completed"] = True
                result = {"ok": True, "completed": True}

            else:
                result = {"ok": False, "error": f"Unknown tool: {name}"}

            state["messages"].append(
                ToolMessage(
                    content=json.dumps(result, ensure_ascii=False),
                    tool_call_id=tool_call["id"],
                )
            )

        return state

    async def resolve(
        self,
        new_extractions: List[Dict[str, Any]],
        existing_actors: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Run the workflow and return the merged actor list.

        Args:
            new_extractions: Actors extracted from the current document.
            existing_actors: Actors already known; this list and its dicts are
                copied, so the caller's objects are not modified.

        Returns:
            Dict with ``existing_actors`` (the merged list) and ``completed``.
        """
        initial_state = {
            "new_extractions": new_extractions,
            # Copy each actor so in-place edits in _run_tools stay local.
            "existing_actors": [dict(actor) for actor in existing_actors],
            "messages": [],
            "completed": False,
        }

        final_state = await self.workflow.ainvoke(initial_state)

        return {
            "existing_actors": final_state["existing_actors"],
            "completed": final_state["completed"],
        }
agents/pdf_analyzer.py CHANGED
@@ -35,16 +35,37 @@ class PDFAnalyzerAgent:
35
  workflow.add_node("extract_key_details", self._extract_key_details)
36
  workflow.add_node("generate_summary", self._generate_summary)
37
  workflow.set_entry_point("detect_pdf_type")
38
- workflow.add_conditional_edges("detect_pdf_type", self._should_use_ocr, {"ocr": "ocr_pdf", "extract": "extract_content"})
 
 
 
 
 
39
  workflow.add_edge("ocr_pdf", "extract_actors")
 
 
 
40
  workflow.add_edge("extract_content", "extract_actors")
41
- workflow.add_edge("extract_actors", "extract_key_details")
42
- workflow.add_edge("extract_key_details", "generate_summary")
 
 
 
43
  workflow.add_edge("generate_summary", END)
 
44
  return workflow.compile()
45
 
46
  def _should_use_ocr(self, state: PDFAnalyzerState) -> str:
47
  return "ocr" if state.get("needs_ocr", False) else "extract"
 
 
 
 
 
 
 
 
 
48
 
49
  async def _detect_pdf_type(self, state: PDFAnalyzerState) -> PDFAnalyzerState:
50
  import os
@@ -71,7 +92,6 @@ class PDFAnalyzerAgent:
71
  state["document_type"] = "unknown"
72
  logger.warning(f"⚠️ Unknown file format: {file_ext}, will attempt OCR")
73
 
74
- state["processing_status"] = "extracting"
75
  return state
76
 
77
  async def _ocr_pdf(self, state: PDFAnalyzerState) -> PDFAnalyzerState:
@@ -118,17 +138,14 @@ class PDFAnalyzerAgent:
118
 
119
  except Exception as e:
120
  logger.error(f"❌ OCR failed: {e}")
121
- state["processing_status"] = "failed"
122
- state["extracted_text"] = f"Error: OCR processing failed - {str(e)}"
123
  return state
124
 
125
- state["processing_status"] = "analyzing"
126
  return state
127
 
128
  async def _extract_content(self, state: PDFAnalyzerState) -> PDFAnalyzerState:
129
  """Extract text content from PDF file"""
130
- state["processing_status"] = "extracting"
131
-
132
  try:
133
  pdf_path = state["pdf_path"]
134
  logger.info(f"📄 Extracting content from PDF: {pdf_path}")
@@ -138,9 +155,8 @@ class PDFAnalyzerAgent:
138
  num_pages = len(reader.pages)
139
  for page_num in range(num_pages):
140
  page = reader.pages[page_num]
141
- extracted_text += page.extract_text() + "\n\n"
142
  state["extracted_text"] = extracted_text
143
- state["processing_status"] = "analyzing"
144
  logger.info(f"✅ Extracted {num_pages} pages from PDF")
145
 
146
  except Exception as e:
@@ -157,29 +173,16 @@ class PDFAnalyzerAgent:
157
 
158
  logger.info("👥 Extracting actors...")
159
 
160
- # Build conversation history with system message and document content
161
- intermediate_steps = state.get("intermediate_steps", [])
162
-
163
- # Add system message if not present
164
- if not intermediate_steps:
165
- intermediate_steps.append(SystemMessage(content=SYSTEM_PROMPT))
166
- intermediate_steps.append(HumanMessage(content=f"Here is the legal document to analyze:\n\n{state['extracted_text']}"))
167
-
168
- # Add prompt to extract actors
169
- intermediate_steps.append(HumanMessage(content=EXTRACT_ACTORS_PROMPT))
170
-
171
- response = await self.llm.ainvoke(intermediate_steps)
172
- intermediate_steps.append(response)
173
- state["actors"] = response.content
174
- state["intermediate_steps"] = intermediate_steps
175
 
176
  # Log detailed LLM response
177
  logger.info("=" * 80)
178
  logger.info("🤖 LLM RESPONSE (extract_actors)")
179
  logger.info("=" * 80)
180
- logger.info(f"📊 Response length: {len(response.content)} characters")
181
  logger.info(f"📄 Content preview (first 300 chars):")
182
- logger.info(response.content[:300] + ("..." if len(response.content) > 300 else ""))
183
  logger.info("=" * 80)
184
  logger.info("✅ Actors extracted")
185
 
@@ -193,22 +196,16 @@ class PDFAnalyzerAgent:
193
 
194
  logger.info("🔑 Extracting key details...")
195
 
196
- # Continue the conversation
197
- intermediate_steps = state.get("intermediate_steps", [])
198
- intermediate_steps.append(HumanMessage(content=EXTRACT_KEY_DETAILS_PROMPT))
199
-
200
- response = await self.llm.ainvoke(intermediate_steps)
201
- intermediate_steps.append(response)
202
- state["key_details"] = response.content
203
- state["intermediate_steps"] = intermediate_steps
204
 
205
  # Log detailed LLM response
206
  logger.info("=" * 80)
207
  logger.info("🤖 LLM RESPONSE (extract_key_details)")
208
  logger.info("=" * 80)
209
- logger.info(f"📊 Response length: {len(response.content)} characters")
210
  logger.info(f"📄 Content preview (first 300 chars):")
211
- logger.info(response.content[:300] + ("..." if len(response.content) > 300 else ""))
212
  logger.info("=" * 80)
213
  logger.info("✅ Key details extracted")
214
 
@@ -222,23 +219,16 @@ class PDFAnalyzerAgent:
222
 
223
  logger.info("📝 Generating document summary...")
224
 
225
- # Continue the conversation
226
- intermediate_steps = state.get("intermediate_steps", [])
227
- intermediate_steps.append(HumanMessage(content=GENERATE_SUMMARY_PROMPT))
228
-
229
- response = await self.llm.ainvoke(intermediate_steps)
230
- intermediate_steps.append(response)
231
- state["summary"] = response.content
232
- state["intermediate_steps"] = intermediate_steps
233
- state["processing_status"] = "complete"
234
 
235
  # Log detailed LLM response
236
  logger.info("=" * 80)
237
  logger.info("🤖 LLM RESPONSE (generate_summary)")
238
  logger.info("=" * 80)
239
- logger.info(f"📊 Response length: {len(response.content)} characters")
240
  logger.info(f"📄 Content preview (first 300 chars):")
241
- logger.info(response.content[:300] + ("..." if len(response.content) > 300 else ""))
242
  logger.info("=" * 80)
243
  logger.info("✅ Summary generated")
244
 
@@ -260,29 +250,26 @@ class PDFAnalyzerAgent:
260
 
261
  initial_state: PDFAnalyzerState = {
262
  "pdf_path": pdf_path,
263
- "pdf_content": None,
264
  "extracted_text": None,
265
  "summary": None,
266
  "actors": None,
267
  "key_details": None,
268
- "document_type": "image" if file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'] else None,
269
- "processing_status": "pending",
270
- "intermediate_steps": [],
271
  "needs_ocr": False,
272
  "ocr_performed": False,
273
- "ocr_method": None
 
274
  }
275
 
276
  logger.info(f"🚀 Starting PDF analysis for: {pdf_path}")
277
  final_state = await self.workflow.ainvoke(initial_state)
278
 
279
- logger.info(f"✅ PDF analysis complete. Status: {final_state['processing_status']}")
280
 
281
  return {
282
  "summary": final_state.get("summary"),
283
  "actors": final_state.get("actors"),
284
  "key_details": final_state.get("key_details"),
285
- "processing_status": final_state.get("processing_status"),
286
  "ocr_used": final_state.get("ocr_performed", False),
287
  "ocr_method": final_state.get("ocr_method")
288
- }
 
35
  workflow.add_node("extract_key_details", self._extract_key_details)
36
  workflow.add_node("generate_summary", self._generate_summary)
37
  workflow.set_entry_point("detect_pdf_type")
38
+ workflow.add_conditional_edges(
39
+ "detect_pdf_type",
40
+ self._should_use_ocr,
41
+ {"ocr": "ocr_pdf", "extract": "extract_content"}
42
+ )
43
+
44
  workflow.add_edge("ocr_pdf", "extract_actors")
45
+ workflow.add_edge("ocr_pdf", "extract_key_details")
46
+ workflow.add_edge("ocr_pdf", "generate_summary")
47
+
48
  workflow.add_edge("extract_content", "extract_actors")
49
+ workflow.add_edge("extract_content", "extract_key_details")
50
+ workflow.add_edge("extract_content", "generate_summary")
51
+
52
+ workflow.add_edge("extract_actors", END)
53
+ workflow.add_edge("extract_key_details", END)
54
  workflow.add_edge("generate_summary", END)
55
+
56
  return workflow.compile()
57
 
58
  def _should_use_ocr(self, state: PDFAnalyzerState) -> str:
59
  return "ocr" if state.get("needs_ocr", False) else "extract"
60
+
61
+ async def _run_prompt(self, extracted_text: str, task_prompt: str) -> str:
62
+ messages = [
63
+ SystemMessage(content=SYSTEM_PROMPT),
64
+ HumanMessage(content=f"Here is the legal document to analyze:\n\n{extracted_text}"),
65
+ HumanMessage(content=task_prompt),
66
+ ]
67
+ response = await self.llm.ainvoke(messages)
68
+ return response.content
69
 
70
  async def _detect_pdf_type(self, state: PDFAnalyzerState) -> PDFAnalyzerState:
71
  import os
 
92
  state["document_type"] = "unknown"
93
  logger.warning(f"⚠️ Unknown file format: {file_ext}, will attempt OCR")
94
 
 
95
  return state
96
 
97
  async def _ocr_pdf(self, state: PDFAnalyzerState) -> PDFAnalyzerState:
 
138
 
139
  except Exception as e:
140
  logger.error(f"❌ OCR failed: {e}")
141
+ state["error"] = str(e)
142
+ state["extracted_text"] = None
143
  return state
144
 
 
145
  return state
146
 
147
  async def _extract_content(self, state: PDFAnalyzerState) -> PDFAnalyzerState:
148
  """Extract text content from PDF file"""
 
 
149
  try:
150
  pdf_path = state["pdf_path"]
151
  logger.info(f"📄 Extracting content from PDF: {pdf_path}")
 
155
  num_pages = len(reader.pages)
156
  for page_num in range(num_pages):
157
  page = reader.pages[page_num]
158
+ extracted_text += (page.extract_text() or "") + "\n\n"
159
  state["extracted_text"] = extracted_text
 
160
  logger.info(f"✅ Extracted {num_pages} pages from PDF")
161
 
162
  except Exception as e:
 
173
 
174
  logger.info("👥 Extracting actors...")
175
 
176
+ response_content = await self._run_prompt(state["extracted_text"], EXTRACT_ACTORS_PROMPT)
177
+ state["actors"] = response_content
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  # Log detailed LLM response
180
  logger.info("=" * 80)
181
  logger.info("🤖 LLM RESPONSE (extract_actors)")
182
  logger.info("=" * 80)
183
+ logger.info(f"📊 Response length: {len(response_content)} characters")
184
  logger.info(f"📄 Content preview (first 300 chars):")
185
+ logger.info(response_content[:300] + ("..." if len(response_content) > 300 else ""))
186
  logger.info("=" * 80)
187
  logger.info("✅ Actors extracted")
188
 
 
196
 
197
  logger.info("🔑 Extracting key details...")
198
 
199
+ response_content = await self._run_prompt(state["extracted_text"], EXTRACT_KEY_DETAILS_PROMPT)
200
+ state["key_details"] = response_content
 
 
 
 
 
 
201
 
202
  # Log detailed LLM response
203
  logger.info("=" * 80)
204
  logger.info("🤖 LLM RESPONSE (extract_key_details)")
205
  logger.info("=" * 80)
206
+ logger.info(f"📊 Response length: {len(response_content)} characters")
207
  logger.info(f"📄 Content preview (first 300 chars):")
208
+ logger.info(response_content[:300] + ("..." if len(response_content) > 300 else ""))
209
  logger.info("=" * 80)
210
  logger.info("✅ Key details extracted")
211
 
 
219
 
220
  logger.info("📝 Generating document summary...")
221
 
222
+ response_content = await self._run_prompt(state["extracted_text"], GENERATE_SUMMARY_PROMPT)
223
+ state["summary"] = response_content
 
 
 
 
 
 
 
224
 
225
  # Log detailed LLM response
226
  logger.info("=" * 80)
227
  logger.info("🤖 LLM RESPONSE (generate_summary)")
228
  logger.info("=" * 80)
229
+ logger.info(f"📊 Response length: {len(response_content)} characters")
230
  logger.info(f"📄 Content preview (first 300 chars):")
231
+ logger.info(response_content[:300] + ("..." if len(response_content) > 300 else ""))
232
  logger.info("=" * 80)
233
  logger.info("✅ Summary generated")
234
 
 
250
 
251
  initial_state: PDFAnalyzerState = {
252
  "pdf_path": pdf_path,
 
253
  "extracted_text": None,
254
  "summary": None,
255
  "actors": None,
256
  "key_details": None,
257
+ "document_type": None,
 
 
258
  "needs_ocr": False,
259
  "ocr_performed": False,
260
+ "ocr_method": None,
261
+ "error": None,
262
  }
263
 
264
  logger.info(f"🚀 Starting PDF analysis for: {pdf_path}")
265
  final_state = await self.workflow.ainvoke(initial_state)
266
 
267
+ logger.info(f"✅ PDF analysis complete.")
268
 
269
  return {
270
  "summary": final_state.get("summary"),
271
  "actors": final_state.get("actors"),
272
  "key_details": final_state.get("key_details"),
 
273
  "ocr_used": final_state.get("ocr_performed", False),
274
  "ocr_method": final_state.get("ocr_method")
275
+ }
prompts/actors_merger.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
# System prompt for the actors-merging agent.
# The tool names listed here must be the names of the tools bound to the model
# (add_actors, modify_actors, attempt_completion_actor_resolution); naming a
# tool that is not bound would instruct the model to call something that does
# not exist.
SYSTEM_PROMPT = """You are an actor resolution agent.

You receive:
1. Newly extracted actors from a legal document
2. Existing actors already known in the system

Your goal is to integrate all newly extracted actors into the existing actor list.

Each actor has:
- name
- aliases
- description
- implication

You can use only these tools:
- add_actors: create one or more new actors
- modify_actors: update existing actors if a new extraction clearly refers to them
- attempt_completion_actor_resolution: call this only when all extracted actors have been handled

Rules:
- Prefer modifying an existing actor if the new actor clearly matches it
- Prefer adding a new actor if there is uncertainty
- Use name, aliases, description, and implication to decide
- Do not leave any extracted actor untreated
"""
28
+
29
# Task prompt placed at the top of the first user message, ahead of the
# NEW_ACTORS / EXISTING_ACTORS JSON sections.
# Rule 6 must name the completion tool exactly as it is bound to the model
# (attempt_completion_actor_resolution).
ACTOR_MERGER_PROMPT = """
You are integrating newly extracted actors from a legal document into an existing actor registry.

You are given:

NEW_ACTORS:
Actors extracted from the current document.

EXISTING_ACTORS:
Actors already known in the system.

Your goal is to integrate the new actors into the existing registry.

Rules:

1. If a NEW_ACTOR clearly refers to an EXISTING_ACTOR:
   - Use modify_actors
   - Add missing information such as:
     - new aliases
     - additional description
     - additional implications
   - Do NOT duplicate the actor.

2. If a NEW_ACTOR does NOT match any existing actor:
   - Use add_actors to create a new actor entry.

3. Matching should consider:
   - similar names
   - aliases
   - descriptions
   - contextual role in the document.

4. Be conservative with merges.
   If you are unsure whether two actors are the same, create a new actor.

5. Continue until ALL NEW_ACTORS have been handled.

6. When every actor has been processed, call attempt_completion_actor_resolution.

Remember:
- Never ignore a NEW_ACTOR.
- Never duplicate actors unnecessarily.
- Prefer enriching existing actors rather than recreating them.
"""
prompts/doc_assistant.py CHANGED
@@ -3,7 +3,7 @@
3
  System prompts for the doc creator router agent
4
  """
5
 
6
- ROUTER_SYSTEM_PROMPT = """You are a Document Router Agent that decides whether to respond to a user's question or modify their HTML document.
7
 
8
  ## CRITICAL RULES
9
 
 
3
  System prompts for the doc creator router agent
4
  """
5
 
6
+ ROUTER_SYSTEM_PROMPT = """You were created by Hexiagon labs. You are Hexiagon AI, a Document Assistant Agent that decides whether to respond to a user's question or modify their HTML document.
7
 
8
  ## CRITICAL RULES
9
 
requirements.txt CHANGED
@@ -12,6 +12,7 @@ langchain>=0.1.0
12
  langchain-openai>=0.1.0
13
  langchain-community>=0.0.20
14
  langchain-google-genai>=1.0.0
 
15
  mistralai>=1.0.0
16
  langchain-xai==1.2.2
17
  # FastAPI and server dependencies
 
12
  langchain-openai>=0.1.0
13
  langchain-community>=0.0.20
14
  langchain-google-genai>=1.0.0
15
+ langchain-openrouter>=0.1.0
16
  mistralai>=1.0.0
17
  langchain-xai==1.2.2
18
  # FastAPI and server dependencies
utils/tools.py CHANGED
@@ -1077,6 +1077,168 @@ async def _attempt_completion(message: str) -> Dict[str, Any]:
1077
  "message": message
1078
  }
1079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1080
 
1081
  # Export tool sets for different user types
1082
  tools_for_client_facade = [query_knowledge_graph, find_lawyers, message_lawyer, search_web]
@@ -1091,4 +1253,8 @@ tools_for_doc_assistant = [_query_knowledge_graph, _retrieve_lawyer_document, _e
1091
  tools_for_doc_editor_facade = [replace_html, add_html, delete_html, view_current_document, attempt_completion]
1092
  tools_for_doc_editor = [_replace_html, _add_html, _delete_html, _view_current_document, _attempt_completion]
1093
 
 
 
 
 
1094
  tools = tools_for_client
 
1077
  "message": message
1078
  }
1079
 
1080
+ # ============ ACTOR RESOLUTION TOOLS ============
1081
+
1082
@tool
async def add_actors(actors: List[Dict[str, Any]]) -> str:
    """
    Add one or more new actors to the actor registry.

    Each actor should contain:
    - name
    - aliases
    - description
    - implication

    Use this tool when a newly extracted actor does not reasonably match
    an existing actor and should be added as a new entry.
    """
    # The docstring above is kept verbatim: the @tool decorator is expected to
    # expose it to the model as the tool description.
    # Facade only: it performs no work itself. Return a string so the declared
    # `-> str` contract holds instead of silently yielding None.
    return "add_actors is a schema-only facade and performs no action itself."
1097
+
1098
+
1099
@tool
async def modify_actors(modifications: List[Dict[str, Any]]) -> str:
    """
    Modify one or more existing actors.

    Each modification should usually contain:
    - target_name: name of the existing actor to update
    - optionally updated name
    - aliases
    - description
    - implication

    Use this tool when a newly extracted actor appears to refer to an
    already existing actor and you want to enrich or update that actor.
    """
    # The docstring above is kept verbatim: the @tool decorator is expected to
    # expose it to the model as the tool description.
    # Facade only: it performs no work itself. Return a string so the declared
    # `-> str` contract holds instead of silently yielding None.
    return "modify_actors is a schema-only facade and performs no action itself."
1115
+
1116
+
1117
@tool
async def attempt_completion_actor_resolution(message: str) -> Dict[str, Any]:
    """
    Signal that actor resolution is complete.

    Call this only when all newly extracted actors have been handled,
    either by adding them as new actors or modifying existing ones.

    Args:
        message: Short summary of what was resolved

    Returns:
        Dict with 'ok' and 'message'
    """
    # Docstring wording is left unchanged because @tool is expected to expose
    # it to the model as the tool description.
    logger.info(f" ✅ attempt_completion_actor_resolution | {message}")
    acknowledgement: Dict[str, Any] = {"ok": True}
    acknowledgement["message"] = message
    return acknowledgement
1136
+
1137
+
1138
@tool
async def _add_actors(
    existing_actors: List[Dict[str, Any]],
    actors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Real implementation for adding actors.

    Args:
        existing_actors: Current actor registry from state
        actors: New actors to add

    Returns:
        Dict with updated actor list
    """
    # Docstring wording is left unchanged because @tool is expected to expose
    # it as the tool description.
    try:
        # Work on a shallow copy so the caller's list is left untouched.
        merged = existing_actors.copy()
        merged += actors

        added_count = len(actors)
        logger.info(f" ✅ add_actors | added:{added_count}")

        outcome = {"ok": True, "existing_actors": merged}
        outcome["added"] = len(actors)
        return outcome
    except Exception as exc:
        # Any failure is reported to the caller as an error payload.
        failure = {"ok": False}
        failure["error"] = f"Error adding actors: {exc}"
        return failure
1169
+
1170
+
1171
@tool
async def _modify_actors(
    existing_actors: List[Dict[str, Any]],
    modifications: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Real implementation for modifying existing actors.

    Matching is intentionally simple for MVP:
    - match by target_name against actor['name']

    Each modification may contain:
    - target_name
    - name
    - aliases
    - description
    - implication
    """
    # Docstring wording is left unchanged because @tool is expected to expose
    # it as the tool description.
    try:
        # Copy every actor dict so the input registry is not modified.
        registry = [entry.copy() for entry in existing_actors]
        hits = 0

        for change in modifications:
            wanted = change.get("target_name")
            if not wanted:
                continue

            # Only the first actor with a matching name is updated.
            match = next(
                (entry for entry in registry if entry.get("name") == wanted),
                None,
            )
            if match is None:
                continue

            if change.get("name"):
                match["name"] = change["name"]

            if change.get("aliases"):
                current = match.get("aliases", []) or []
                incoming = change.get("aliases", []) or []
                # dict.fromkeys removes duplicates while keeping order.
                match["aliases"] = list(dict.fromkeys(current + incoming))

            if change.get("description"):
                match["description"] = change["description"]

            if change.get("implication"):
                match["implication"] = change["implication"]

            hits += 1

        logger.info(f" ✅ modify_actors | updated:{hits}")

        return {
            "ok": True,
            "existing_actors": registry,
            "updated": hits
        }
    except Exception as exc:
        # Any failure is reported to the caller as an error payload.
        return {
            "ok": False,
            "error": f"Error modifying actors: {exc}"
        }
1229
+
1230
+
1231
@tool
async def _attempt_completion_actor_resolution(message: str) -> Dict[str, Any]:
    """
    Real implementation for actor resolution completion.
    """
    # Docstring wording is left unchanged because @tool is expected to expose
    # it as the tool description.
    logger.info(f" ✅ attempt_completion_actor_resolution | {message}")
    # Echo the summary back together with a success flag.
    acknowledgement: Dict[str, Any] = {"ok": True}
    acknowledgement["message"] = message
    return acknowledgement
1241
+
1242
 
1243
  # Export tool sets for different user types
1244
  tools_for_client_facade = [query_knowledge_graph, find_lawyers, message_lawyer, search_web]
 
1253
  tools_for_doc_editor_facade = [replace_html, add_html, delete_html, view_current_document, attempt_completion]
1254
  tools_for_doc_editor = [_replace_html, _add_html, _delete_html, _view_current_document, _attempt_completion]
1255
 
1256
# Schema-only tools for the actors-merger agent.
tools_for_actors_merger_facade = [
    add_actors,
    modify_actors,
    attempt_completion_actor_resolution,
]

# Matching implementations, listed in the same order as the facade list.
tools_for_actors_merger = [
    _add_actors,
    _modify_actors,
    _attempt_completion_actor_resolution,
]
1259
+
1260
  tools = tools_for_client