Spaces:
Sleeping
Sleeping
Upload 21 files
Browse files- src/__init__.py +9 -0
- src/config/__init__.py +28 -0
- src/config/logging_config.py +290 -0
- src/config/schema.py +87 -0
- src/config/settings.py +81 -0
- src/models/__init__.py +6 -0
- src/models/project.py +163 -0
- src/models/state.py +32 -0
- src/parsers/__init__.py +6 -0
- src/parsers/project_parser.py +319 -0
- src/parsers/smart_chunker.py +197 -0
- src/services/__init__.py +19 -0
- src/services/answerer.py +498 -0
- src/services/builder.py +693 -0
- src/services/cache.py +317 -0
- src/services/cypher_templates.py +1332 -0
- src/services/neo4j_service.py +588 -0
- src/services/reranker.py +254 -0
- src/services/retriever.py +177 -0
- src/ui/__init__.py +5 -0
- src/ui/gradio_app.py +301 -0
src/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GraphRAG Solution Package.
|
| 3 |
+
|
| 4 |
+
A modular GraphRAG (Graph Retrieval-Augmented Generation) application
|
| 5 |
+
for analyzing industrial project-report PDFs using Neo4j and Together AI.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "1.0.0"
|
| 9 |
+
__author__ = "GraphRAG Team"
|
src/config/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration module for GraphRAG application."""
|
| 2 |
+
|
| 3 |
+
from src.config.schema import SchemaPolicy
|
| 4 |
+
from src.config.settings import Settings
|
| 5 |
+
from src.config.logging_config import (
|
| 6 |
+
configure_logging,
|
| 7 |
+
get_logger,
|
| 8 |
+
get_flow_logger,
|
| 9 |
+
trace_step,
|
| 10 |
+
trace_flow,
|
| 11 |
+
trace_context,
|
| 12 |
+
log_step,
|
| 13 |
+
TraceContext,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"SchemaPolicy",
|
| 18 |
+
"Settings",
|
| 19 |
+
# Logging
|
| 20 |
+
"configure_logging",
|
| 21 |
+
"get_logger",
|
| 22 |
+
"get_flow_logger",
|
| 23 |
+
"trace_step",
|
| 24 |
+
"trace_flow",
|
| 25 |
+
"trace_context",
|
| 26 |
+
"log_step",
|
| 27 |
+
"TraceContext",
|
| 28 |
+
]
|
src/config/logging_config.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Centralized logging configuration with flow tracing support."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import functools
|
| 6 |
+
import logging
|
| 7 |
+
import sys
|
| 8 |
+
import threading
|
| 9 |
+
import time
|
| 10 |
+
import uuid
|
| 11 |
+
from contextlib import contextmanager
|
| 12 |
+
from dataclasses import dataclass, field
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from typing import Any, Callable, Dict, List, Optional, TypeVar
|
| 15 |
+
|
| 16 |
+
F = TypeVar('F', bound=Callable[..., Any])
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class GraphRAGFormatter(logging.Formatter):
    """Console formatter with optional ANSI color and trace-aware output.

    Honors three optional attributes injected on the record via ``extra``:
    ``trace_id`` (prefixed, truncated to 8 chars), ``step_type`` (rendered
    as an icon) and ``duration`` (appended as ``(x.xxxs)``).
    """

    # ANSI escape sequences, used only when stdout is a TTY.
    COLORS = {
        'DEBUG': '\033[36m',
        'INFO': '\033[32m',
        'WARNING': '\033[33m',
        'ERROR': '\033[31m',
        'CRITICAL': '\033[35m',
        'RESET': '\033[0m',
        'DIM': '\033[2m',
    }

    # Unicode markers keyed by the record's step_type.
    STEP_ICONS = {
        'start': '▶',
        'end': '✓',
        'error': '✗',
        'info': '•',
        'substep': ' ↳',
    }

    def __init__(self, fmt: Optional[str] = None, datefmt: Optional[str] = None, use_colors: bool = True):
        super().__init__(fmt, datefmt)
        # Colorize only when explicitly enabled AND writing to a terminal.
        self.use_colors = use_colors and sys.stdout.isatty()

    def format(self, record: logging.LogRecord) -> str:
        """Render *record*, including any trace metadata attached to it."""
        trace = getattr(record, 'trace_id', None)
        step = getattr(record, 'step_type', None)
        elapsed = getattr(record, 'duration', None)

        pieces = []
        if trace:
            pieces.append(f"[{trace[:8]}]")
        if step and step in self.STEP_ICONS:
            pieces.append(self.STEP_ICONS[step])
        lead = f"{' '.join(pieces)} " if pieces else ""

        tail = "" if elapsed is None else f" ({elapsed:.3f}s)"

        if not self.use_colors:
            return f"{lead}{super().format(record)}{tail}"

        color = self.COLORS.get(record.levelname, '')
        reset = self.COLORS['RESET']
        dim = self.COLORS['DIM']
        # Millisecond-precision timestamp (strip to 3 fractional digits).
        stamp = datetime.fromtimestamp(record.created).strftime('%H:%M:%S.%f')[:-3]
        return (
            f"{dim}{stamp}{reset} | "
            f"{color}{record.levelname:8}{reset} | "
            f"{dim}{record.name:30}{reset} | "
            f"{lead}{record.getMessage()}{tail}"
        )
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@dataclass
class TraceContext:
    """Accumulates the ordered list of steps executed within one flow."""

    trace_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    steps: List[Dict[str, Any]] = field(default_factory=list)
    start_time: float = field(default_factory=time.time)
    current_step: int = 0

    def add_step(self, name: str, status: str = "completed", duration: Optional[float] = None,
                 details: Optional[Dict[str, Any]] = None) -> None:
        """Record one executed step, auto-numbering it from 1."""
        self.current_step += 1
        entry = {
            "step": self.current_step,
            "name": name,
            "status": status,
            "duration": duration,
            "details": details or {},
            "timestamp": time.time(),
        }
        self.steps.append(entry)

    def get_summary(self) -> Dict[str, Any]:
        """Return a serializable overview of the trace so far."""
        elapsed = time.time() - self.start_time
        return {
            "trace_id": self.trace_id,
            "total_duration": elapsed,
            "step_count": len(self.steps),
            "steps": self.steps,
        }
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# Thread-local slot holding the TraceContext of the currently running flow.
_trace_context = threading.local()


def get_current_trace() -> Optional[TraceContext]:
    """Return the TraceContext bound to the calling thread, or None."""
    return getattr(_trace_context, 'current', None)


def set_current_trace(trace: Optional[TraceContext]) -> None:
    """Bind *trace* to the calling thread (pass None to clear)."""
    _trace_context.current = trace


class FlowLogger:
    """Logger wrapper with flow tracing capabilities.

    Every message goes through the underlying ``logging.Logger`` with three
    extra record attributes (``trace_id``, ``step_type``, ``duration``)
    that ``GraphRAGFormatter`` knows how to render.
    """

    def __init__(self, name: str):
        self.logger = logging.getLogger(name)
        self.name = name

    def _log_with_context(self, level: int, msg: str, step_type: Optional[str] = None,
                          duration: Optional[float] = None, **kwargs) -> None:
        """Emit *msg* at *level*, attaching the current trace metadata."""
        trace = get_current_trace()
        extra = kwargs.pop('extra', {})
        extra['step_type'] = step_type
        extra['duration'] = duration
        extra['trace_id'] = trace.trace_id if trace else None
        self.logger.log(level, msg, extra=extra, **kwargs)

    def step_start(self, step_name: str, details: str = "") -> float:
        """Log the start of a step and return its start timestamp."""
        msg = f"Starting: {step_name}" + (f" - {details}" if details else "")
        self._log_with_context(logging.INFO, msg, step_type='start')
        return time.time()

    def step_end(self, step_name: str, start_time: float, details: str = "") -> None:
        """Log successful completion of a step and record it on the trace."""
        duration = time.time() - start_time
        msg = f"Completed: {step_name}" + (f" - {details}" if details else "")
        self._log_with_context(logging.INFO, msg, step_type='end', duration=duration)
        trace = get_current_trace()
        if trace:
            trace.add_step(step_name, "completed", duration)

    def step_error(self, step_name: str, error: Exception, start_time: Optional[float] = None) -> None:
        """Log a failed step and record the failure on the current trace.

        Fix: compare *start_time* against None explicitly instead of by
        truthiness, so a falsy-but-valid timestamp (0.0) still produces a
        duration rather than being treated as "not provided".
        """
        duration = time.time() - start_time if start_time is not None else None
        msg = f"Failed: {step_name} - {type(error).__name__}: {error}"
        self._log_with_context(logging.ERROR, msg, step_type='error', duration=duration)
        trace = get_current_trace()
        if trace:
            trace.add_step(step_name, "failed", duration, {"error": str(error)})

    def substep(self, msg: str) -> None:
        """Log a nested sub-step at DEBUG level."""
        self._log_with_context(logging.DEBUG, msg, step_type='substep')

    def info(self, msg: str, **kwargs) -> None:
        """INFO-level message tagged with the 'info' step icon."""
        self._log_with_context(logging.INFO, msg, step_type='info', **kwargs)

    def debug(self, msg: str, **kwargs) -> None:
        """DEBUG-level message without a step icon."""
        self._log_with_context(logging.DEBUG, msg, **kwargs)

    def warning(self, msg: str, **kwargs) -> None:
        """WARNING-level message without a step icon."""
        self._log_with_context(logging.WARNING, msg, **kwargs)

    def error(self, msg: str, **kwargs) -> None:
        """ERROR-level message without a step icon."""
        self._log_with_context(logging.ERROR, msg, **kwargs)


def get_flow_logger(name: str) -> FlowLogger:
    """Create a FlowLogger wrapping ``logging.getLogger(name)``."""
    return FlowLogger(name)


def get_logger(name: str) -> FlowLogger:
    """Alias of :func:`get_flow_logger`, mirroring ``logging.getLogger``."""
    return FlowLogger(name)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def trace_step(step_name: Optional[str] = None):
    """Decorator that logs the wrapped callable as one traced step.

    Uses *step_name* when given (and non-empty), otherwise the function's
    own name. Success and failure are logged via the module's FlowLogger.
    """
    def decorator(func: F) -> F:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            label = step_name or func.__name__
            flow_log = get_flow_logger(func.__module__)
            began = flow_log.step_start(label)
            try:
                outcome = func(*args, **kwargs)
                flow_log.step_end(label, began)
                return outcome
            except Exception as exc:
                flow_log.step_error(label, exc, began)
                raise
        return wrapper  # type: ignore
    return decorator
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def trace_flow(flow_name: str):
    """Decorator that runs the wrapped callable inside a fresh TraceContext.

    Improvement: the previously active trace (if any) is restored on exit
    instead of being unconditionally cleared to None, so a flow invoked
    from inside another flow no longer wipes out its parent's context.
    For non-nested calls the behavior is unchanged.
    """
    def decorator(func: F) -> F:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            logger = get_flow_logger(func.__module__)
            previous = get_current_trace()  # restore this on exit
            trace = TraceContext()
            set_current_trace(trace)
            logger.info(f"{'='*60}")
            logger.info(f"FLOW START: {flow_name} [Trace: {trace.trace_id[:8]}]")
            logger.info(f"{'='*60}")
            start = time.time()
            try:
                result = func(*args, **kwargs)
                duration = time.time() - start
                logger.info(f"{'='*60}")
                logger.info(f"FLOW COMPLETE: {flow_name} ({duration:.3f}s)")
                logger.info(f"Steps completed: {len(trace.steps)}")
                logger.info(f"{'='*60}")
                return result
            except Exception as e:
                duration = time.time() - start
                logger.error(f"{'='*60}")
                logger.error(f"FLOW FAILED: {flow_name} ({duration:.3f}s)")
                logger.error(f"Error: {type(e).__name__}: {e}")
                logger.error(f"{'='*60}")
                raise
            finally:
                set_current_trace(previous)
        return wrapper  # type: ignore
    return decorator
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
@contextmanager
def trace_context(flow_name: str):
    """Context manager for tracing a flow; yields the new TraceContext.

    Improvement: restores the previously active trace (if any) on exit
    instead of unconditionally clearing it, so nesting one traced flow
    inside another no longer loses the outer trace. Non-nested use is
    unchanged.
    """
    logger = get_flow_logger(__name__)
    previous = get_current_trace()  # restore this on exit
    trace = TraceContext()
    set_current_trace(trace)
    logger.info(f"{'='*60}")
    logger.info(f"FLOW START: {flow_name} [Trace: {trace.trace_id[:8]}]")
    logger.info(f"{'='*60}")
    start = time.time()
    try:
        yield trace
        duration = time.time() - start
        logger.info(f"{'='*60}")
        logger.info(f"FLOW COMPLETE: {flow_name} ({duration:.3f}s)")
        logger.info(f"{'='*60}")
    except Exception as e:
        duration = time.time() - start
        logger.error(f"{'='*60}")
        logger.error(f"FLOW FAILED: {flow_name} ({duration:.3f}s)")
        logger.error(f"Error: {type(e).__name__}: {e}")
        logger.error(f"{'='*60}")
        raise
    finally:
        set_current_trace(previous)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
@contextmanager
def log_step(logger: FlowLogger, step_name: str, details: str = ""):
    """Context manager that brackets its body with start/end/error logs.

    On normal exit the step is logged as completed; if the body raises,
    the failure is logged and the exception re-raised.
    """
    began = logger.step_start(step_name, details)
    try:
        yield
        logger.step_end(step_name, began)
    except Exception as exc:
        logger.step_error(step_name, exc, began)
        raise
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def configure_logging(level: int = logging.INFO, use_colors: bool = True,
                      log_file: Optional[str] = None, detailed: bool = False) -> None:
    """Configure root logging for the application.

    Installs a (optionally colored) console handler on stdout, an optional
    plain-text file handler, and quiets chatty third-party loggers to
    WARNING. With ``detailed=True`` the langchain loggers are left at the
    configured level.
    """
    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.setLevel(level)

    console = logging.StreamHandler(sys.stdout)
    console.setLevel(level)
    console.setFormatter(GraphRAGFormatter(use_colors=use_colors))
    root_logger.addHandler(console)

    if log_file:
        plain = logging.Formatter(
            "%(asctime)s | %(levelname)-8s | %(name)-30s | %(message)s"
        )
        to_file = logging.FileHandler(log_file)
        to_file.setLevel(level)
        to_file.setFormatter(plain)
        root_logger.addHandler(to_file)

    # Third-party libraries are noisy at INFO/DEBUG; cap them at WARNING.
    noisy = ["httpx", "httpcore", "neo4j", "urllib3"]
    if not detailed:
        noisy = noisy + ["langchain", "langchain_community"]
    for quiet_name in noisy:
        logging.getLogger(quiet_name).setLevel(logging.WARNING)
|
src/config/schema.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Schema policy for LLM-driven graph extraction."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import List
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SchemaPolicy:
    """Defines allowed node labels and relationship types for LLM graph extraction.

    The LLMGraphTransformer benefits from explicit schema constraints. This schema
    is intentionally broad to support diverse project report questions (stakeholders,
    contracts, permitting, schedule, finance, risks, etc.).
    """

    # Node labels the extractor may emit, grouped by theme. Order is part of
    # the public value of get_allowed_nodes() — append, don't reorder.
    ALLOWED_NODES: List[str] = [
        # Document structure
        "Project", "Report", "Document", "Section", "Chunk", "Source", "Evidence",
        # Organizations
        "Organization", "Company", "Owner", "ParentCompany", "Client", "Customer",
        "Partner", "JV", "Consortium", "Contractor", "Subcontractor", "Vendor", "Supplier",
        "Consultant", "EngineeringFirm", "EPC", "EPCM", "Operator",
        "GovernmentAgency", "Regulator", "Stakeholder",
        # People
        "Person", "Role", "Team", "Department",
        # Geography
        "Location", "Address", "City", "State", "Province", "Region", "Country", "County",
        "Zone", "Port", "Site", "Plant",
        # Finance
        "Budget", "Cost", "Capex", "Opex", "Estimate", "Investment", "Funding",
        "Currency", "TIV", "Revenue", "Tariff", "Price",
        # Timeline
        "Timeline", "Schedule", "Milestone", "Phase", "Stage", "Date", "Quarter", "Year",
        "Duration", "StartDate", "EndDate",
        # Technical
        "Industry", "Sector", "Market", "Demand", "Product", "Output", "Capacity",
        "Feedstock", "Fuel", "Technology", "Process", "Equipment", "Unit", "System", "Utility",
        "Specification", "Standard",
        # Contracts
        "Contract", "Agreement", "Tender", "Bid", "RFQ", "Procurement", "Permit",
        "WorkPackage", "Deliverable", "Requirement", "KPI", "Metric",
        # Status
        "Status", "StatusReason", "Decision", "Change", "Assumption", "Dependency",
        "Risk", "Issue", "Challenge", "Constraint", "Delay", "Cancellation",
        # ESG
        "EnvironmentalAspect", "Emissions", "Wastewater", "Water", "Waste", "Safety",
        "Regulation", "Compliance",
    ]

    # Relationship types the extractor may emit, grouped by theme.
    ALLOWED_RELATIONSHIPS: List[str] = [
        # Document structure
        "HAS_REPORT", "HAS_DOCUMENT", "HAS_SECTION", "HAS_CHUNK", "HAS_EVIDENCE",
        "EVIDENCED_BY", "SUPPORTED_BY", "MENTIONS", "ABOUT",
        # Lifecycle
        "HAS_STATUS", "HAS_STATUS_REASON", "HAS_PHASE", "HAS_STAGE",
        "HAS_TIMELINE", "HAS_SCHEDULE", "HAS_MILESTONE",
        "STARTS_AT", "ENDS_AT", "UPDATED_ON", "RELEASED_ON", "COMPLETES_AT",
        # Organizations
        "OWNED_BY", "PARENT_OF", "HAS_PARENT", "MANAGED_BY", "OPERATED_BY",
        "LED_BY", "RESPONSIBLE_FOR", "WORKS_FOR", "HAS_ROLE",
        "PARTNERED_WITH", "CONTRACTED_BY", "DESIGNED_BY", "ENGINEERED_BY",
        "CONSTRUCTED_BY", "PROCURED_BY", "SUPPLIED_BY", "REGULATED_BY",
        # Geography
        "LOCATED_IN", "HAS_ADDRESS", "IN_CITY", "IN_STATE", "IN_COUNTRY", "IN_REGION", "IN_ZONE",
        # Finance
        "HAS_BUDGET", "HAS_COST", "HAS_CAPEX", "HAS_OPEX", "HAS_TIV", "IN_CURRENCY",
        "FUNDED_BY", "ALLOCATED_TO",
        # Technical
        "IN_INDUSTRY", "IN_SECTOR", "IN_MARKET",
        "PRODUCES", "USES_FEEDSTOCK", "USES_FUEL", "USES_TECHNOLOGY", "USES_PROCESS",
        "REQUIRES_EQUIPMENT", "HAS_UNIT", "HAS_SYSTEM", "HAS_UTILITY", "HAS_CAPACITY",
        "MEETS_STANDARD",
        # Governance
        "REQUIRES_PERMIT", "HAS_REQUIREMENT", "HAS_DELIVERABLE",
        "HAS_ENVIRONMENTAL_ASPECT", "HAS_SAFETY_REQUIREMENT",
        # Risks
        "HAS_RISK", "HAS_ISSUE", "HAS_CHALLENGE", "HAS_CONSTRAINT",
        "CAUSED_BY", "RESULTED_IN", "AFFECTED_BY", "DELAYED_BY", "CANCELLED_DUE_TO",
    ]

    @classmethod
    def get_allowed_nodes(cls) -> List[str]:
        """Return a defensive copy so callers can't mutate the class list."""
        return cls.ALLOWED_NODES.copy()

    @classmethod
    def get_allowed_relationships(cls) -> List[str]:
        """Return a defensive copy so callers can't mutate the class list."""
        return cls.ALLOWED_RELATIONSHIPS.copy()
|
src/config/settings.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application settings and configuration management."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class Neo4jConfig:
    """Neo4j database connection configuration."""

    uri: str = ""             # e.g. "neo4j+s://<host>" or "bolt://<host>"
    username: str = "neo4j"
    password: str = ""
    database: str = "neo4j"

    def is_valid(self) -> bool:
        """True when every credential needed to connect is non-empty."""
        return all((self.uri, self.username, self.password))
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
class TogetherAIConfig:
    """Together AI API configuration."""

    api_key: str = ""
    # NOTE(review): Together model ids are case-sensitive in their catalog;
    # confirm this lowercase id resolves (catalog lists
    # "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo").
    chat_model: str = "meta-llama/meta-llama-3.1-8b-instruct-turbo"
    embedding_model: str = "BAAI/bge-base-en-v1.5"

    def is_valid(self) -> bool:
        """True when an API key has been supplied."""
        return len(self.api_key) > 0
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@dataclass
class AppConfig:
    """Application-level configuration (HTTP bind address for the UI)."""

    port: int = 7860        # Gradio default port
    host: str = "0.0.0.0"   # bind on all interfaces (container-friendly)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
class Settings:
    """Centralized application settings, composed of the per-service configs."""

    neo4j: Neo4jConfig = field(default_factory=Neo4jConfig)
    together_ai: TogetherAIConfig = field(default_factory=TogetherAIConfig)
    app: AppConfig = field(default_factory=AppConfig)

    @classmethod
    def from_env(cls, dotenv_path: Optional[str] = None) -> "Settings":
        """Load settings from environment variables (and an optional .env file)."""
        load_dotenv(dotenv_path)
        env = os.getenv
        return cls(
            neo4j=Neo4jConfig(
                uri=env("NEO4J_URI", ""),
                username=env("NEO4J_USERNAME", "neo4j"),
                password=env("NEO4J_PASSWORD", ""),
                database=env("NEO4J_DATABASE", "neo4j"),
            ),
            together_ai=TogetherAIConfig(
                api_key=env("TOGETHER_API_KEY", ""),
                chat_model=env("TOGETHER_CHAT_MODEL", "meta-llama/meta-llama-3.1-8b-instruct-turbo"),
                embedding_model=env("TOGETHER_EMBED_MODEL", "BAAI/bge-base-en-v1.5"),
            ),
            app=AppConfig(
                port=int(env("PORT", "7860")),
                host=env("HOST", "0.0.0.0"),
            ),
        )

    def apply_to_env(self) -> None:
        """Propagate the Together API key into os.environ for downstream SDKs."""
        if self.together_ai.api_key:
            os.environ["TOGETHER_API_KEY"] = self.together_ai.api_key
|
src/models/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for GraphRAG application."""
|
| 2 |
+
|
| 3 |
+
from src.models.project import ProjectRecord
|
| 4 |
+
from src.models.state import AppState
|
| 5 |
+
|
| 6 |
+
__all__ = ["ProjectRecord", "AppState"]
|
src/models/project.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Project record data model for structured extraction from PDF reports."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Any, Dict, Optional
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
class ProjectRecord:
    """Canonical structured fields parsed from a single PDF project report.

    All fields except ``source`` are optional; a parser fills in whatever
    it could extract.
    """

    # Identification
    source: str
    project_id: Optional[str] = None
    project_name: Optional[str] = None

    # Classification
    industry_code: Optional[str] = None
    project_type: Optional[str] = None
    sector: Optional[str] = None
    sic_code: Optional[str] = None
    sic_product: Optional[str] = None

    # Financial
    tiv_amount: Optional[float] = None
    tiv_currency: Optional[str] = None

    # Status
    status: Optional[str] = None
    status_reason: Optional[str] = None
    project_probability: Optional[str] = None

    # Timeline
    last_update: Optional[str] = None
    initial_release: Optional[str] = None
    pec_timing: Optional[str] = None
    pec_activity: Optional[str] = None

    # Location
    address: Optional[str] = None
    city_state_line: Optional[str] = None
    zone_county: Optional[str] = None

    # Plant Info
    plant_owner: Optional[str] = None
    plant_parent: Optional[str] = None
    plant_name: Optional[str] = None
    plant_id: Optional[str] = None
    unit_name: Optional[str] = None

    # Contacts
    project_manager: Optional[str] = None
    project_manager_company: Optional[str] = None
    project_manager_title: Optional[str] = None
    project_manager_email: Optional[str] = None
    project_manager_phone: Optional[str] = None
    engineer_company: Optional[str] = None
    ec_firm: Optional[str] = None
    phone: Optional[str] = None

    # Technical
    scope_text: Optional[str] = None
    project_capacity: Optional[str] = None
    environmental: Optional[str] = None
    construction_labor: Optional[str] = None
    operations_labor: Optional[str] = None
    fuel_type: Optional[str] = None

    # Derived text sections
    schedule_text: Optional[str] = None
    details_text: Optional[str] = None

    @property
    def owner_company(self) -> Optional[str]:
        """Alias for plant_owner (backward compatibility)."""
        return self.plant_owner

    def get_unique_key(self) -> str:
        """Best-available stable identifier: id, else name, else source path."""
        return self.project_id or self.project_name or self.source

    def has_budget_info(self) -> bool:
        """True when both a TIV amount and its currency were parsed."""
        return self.tiv_amount is not None and self.tiv_currency is not None

    def has_location_info(self) -> bool:
        """True when at least one location field is non-empty."""
        return any([self.address, self.city_state_line, self.zone_county])

    def has_timeline_info(self) -> bool:
        """True when schedule text was extracted."""
        return bool(self.schedule_text)

    def to_dict(self) -> Dict[str, Any]:
        """Convert record to dictionary with non-None fields only.

        Improvement: iterate the dataclass fields instead of the previous
        41-entry hand-maintained mapping, so newly added fields are picked
        up automatically. Output is identical: keys follow declaration
        order and the ``owner_company`` property is (as before) excluded.
        """
        # Local import: the file-level import only pulls in `dataclass`.
        from dataclasses import fields
        result: Dict[str, Any] = {}
        for f in fields(self):
            value = getattr(self, f.name)
            if value is not None:
                result[f.name] = value
        return result
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
@dataclass
class Milestone:
    """A project milestone extracted from schedule text."""

    name: str
    date_text: str = ""   # free-form date phrase as found in the text
    sentence: str = ""    # sentence the milestone was extracted from
    source: str = ""      # originating document (not serialized)

    def to_dict(self) -> Dict[str, str]:
        """Serialize with camelCase keys; ``source`` is intentionally omitted."""
        return {
            "name": self.name,
            "dateText": self.date_text,
            "sentence": self.sentence,
        }
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@dataclass
class GeoComponents:
    """Parsed geographic components from a city/state line."""

    city: Optional[str] = None
    state: Optional[str] = None
    postal: Optional[str] = None
    country: Optional[str] = None

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Serialize all four components; None values are preserved."""
        keys = ("city", "state", "postal", "country")
        return {k: getattr(self, k) for k in keys}
|
src/models/state.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application state container for runtime handles."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import TYPE_CHECKING, Any, Optional
|
| 7 |
+
|
| 8 |
+
if TYPE_CHECKING:
|
| 9 |
+
from src.services.neo4j_service import Neo4jService
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class AppState:
    """Runtime handles required for query-time execution after ingestion."""

    neo4j: Optional[Any] = None     # Neo4j service wrapper (exposes .graph / .close())
    vector: Optional[Any] = None    # vector-store handle
    qa_chain: Optional[Any] = None  # question-answering chain
    llm: Optional[Any] = None       # language-model handle

    def is_ready(self) -> bool:
        """Return True once every runtime handle holds a truthy value."""
        handles = (self.neo4j, self.vector, self.qa_chain, self.llm)
        return all(handles)

    def get_graph(self) -> Optional[Any]:
        """Return the underlying graph object, or None when Neo4j is absent."""
        if not self.neo4j:
            return None
        return self.neo4j.graph

    def close(self) -> None:
        """Best-effort shutdown of the Neo4j connection; errors are swallowed."""
        if not self.neo4j:
            return
        try:
            self.neo4j.close()
        except Exception:
            # Deliberate: close() must never raise during app teardown.
            pass
|
src/parsers/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Parsing utilities for document extraction."""
|
| 2 |
+
|
| 3 |
+
from src.parsers.project_parser import ProjectReportParser
|
| 4 |
+
from src.parsers.smart_chunker import SemanticChunker, get_chunker
|
| 5 |
+
|
| 6 |
+
__all__ = ["ProjectReportParser", "SemanticChunker", "get_chunker"]
|
src/parsers/project_parser.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Project report parser for semi-structured PDF documents."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from typing import Dict, List, Optional, Tuple
|
| 7 |
+
|
| 8 |
+
from src.models.project import GeoComponents, Milestone, ProjectRecord
|
| 9 |
+
from src.config import get_logger
|
| 10 |
+
|
| 11 |
+
logger = get_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ProjectReportParser:
    """Comprehensive parser for semi-structured project report PDFs.

    All extraction is regex-driven over a whitespace-normalized copy of the
    document text.  Compiled patterns are memoized per (pattern, flags) pair.
    """

    # Identification patterns
    PATTERN_PROJECT_ID = r"Project ID:\s*([0-9]+)"
    PATTERN_PROJECT_NAME = r"Project Name\s+(.+?)\s+PEC Activity Diagram"

    # Classification patterns
    PATTERN_INDUSTRY_CODE = r"Industry Code\s+([0-9]+\s+[A-Za-z\s&\(\)]+?)(?:\s+Project Type)"
    PATTERN_PROJECT_TYPE = r"Project Type\s+([A-Za-z]+)"
    PATTERN_SECTOR = r"Sector\s+([A-Za-z\s]+?)(?:\s+SIC Product|\s+Status)"
    PATTERN_SIC_CODE = r"SIC Code\s+([0-9]+\s+[A-Za-z\s&,\[\]]+?)(?:\s+Sector)"
    PATTERN_SIC_PRODUCT = r"SIC Product\s+([0-9\*]+\s+[A-Za-z\s,\(\)\-]+?)(?:\s+Status)"

    # Financial patterns
    PATTERN_TIV_USD = r"TIV \(USD\)\s*([0-9,]+)"
    PATTERN_TIV_CNY = r"TIV \(CNY\)\s*([0-9,]+)"

    # Status patterns
    PATTERN_STATUS = r"Status\s+([A-Za-z]+)\s+Last Update"
    PATTERN_STATUS_REASON = r"Status Reason\s+(.+?)\s+Environmental"
    PATTERN_PROJECT_PROBABILITY = r"Project Probability\s+([A-Za-z]+\s*\([0-9\-]+%\))"

    # Timeline patterns
    PATTERN_LAST_UPDATE = r"Last Update\s+([0-9]{2}-[A-Za-z]{3}-[0-9]{4})"
    PATTERN_INITIAL_RELEASE = r"Initial Release\s+([0-9]{2}-[A-Za-z]{3}-[0-9]{4})"
    PATTERN_PEC_TIMING = r"PEC.\s*Timing\s+([A-Z][0-9])"
    PATTERN_PEC_ACTIVITY = r"PEC.\s*Activity\s+([A-Za-z\s\-]+?)(?:\s+Project Probability)"

    # Location patterns
    PATTERN_LOCATION = r"Location\s+(.+?)\s+Phone"
    PATTERN_CITY_STATE = r"City/State\s+(.+?)\s+Zone/County"
    PATTERN_ZONE_COUNTY = r"Zone/County\s+(.+?)\s+Project Responsibility"
    PATTERN_PHONE = r"Phone\s+(\+?[0-9\s\-]+)"

    # Plant info patterns
    PATTERN_PLANT_OWNER = r"Plant Owner\s+([A-Za-z\s&,\.]+?)(?:\s+Plant Parent)"
    PATTERN_PLANT_PARENT = r"Plant Parent\s+([A-Za-z\s&,\.]+?)(?:\s+Plant Name|\s+Unit Name)"
    PATTERN_PLANT_NAME = r"Plant Name\s+([A-Za-z\s&,\.]+?)(?:\s+Unit Name|\s+Plant ID)"
    PATTERN_PLANT_ID = r"Plant ID\s+([0-9]+)"
    PATTERN_UNIT_NAME = r"Unit Name\s+([A-Za-z0-9\s&]+?)(?:\s+Plant ID|\s+Location)"

    # Contact patterns
    PATTERN_PROJECT_MANAGER = r"Project Manager\s+([A-Za-z\s&,\.]+?)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:\d|No\.|[A-Z][a-z]+\s+(?:Road|Street|Drive|Ave|Suite|Manager))"
    PATTERN_ENGINEER = r"Eng\s+([A-Za-z\s&,\.]+?)\s+(?:[A-Z][a-z]+\s+[A-Z][a-z]+|[0-9])"
    PATTERN_EC_FIRM = r"E&C\s+([A-Za-z\s&,\.]+?)\s+(?:[A-Z][a-z]+\s+[A-Z][a-z]+|[0-9])"
    PATTERN_EMAIL = r"\[E-Mail\]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"

    # Technical patterns
    PATTERN_SCOPE = r"Scope\s+(.+?)\s+Schedule\s+"
    PATTERN_PROJECT_CAPACITY = r"Project Capacity\s+(?:Planned\s+)?([0-9,]+\s*(?:MW|BBL|Megawatts)[^\n]*)"
    PATTERN_ENVIRONMENTAL = r"Environmental\s+(Air\s*\([A-Z]\)[^C]*?)(?:\s+Construction Labor)"
    PATTERN_CONSTRUCTION_LABOR = r"Construction Labor Preference\s+([A-Za-z\-]+)"
    PATTERN_OPERATIONS_LABOR = r"Operations Labor Preference\s+([A-Za-z\-]+)"
    PATTERN_FUEL_TYPE = r"Project Fuel Type\s+([A-Za-z]+)"

    # Schedule/details patterns
    PATTERN_SCHEDULE = r"Schedule\s+(.+?)\bDetails\b"
    PATTERN_SCHEDULE_FALLBACK = r"Schedule\s+(.+?)\s+Engineering\s+(?:Civil|Contracting|Electrical)"
    PATTERN_DETAILS = r"Details\s+(.+?)\s+Engineering\s+(?:Civil|Contracting)"

    # Milestone pattern: "<label> <quarter/year/Mon-year>" with optional parenthetical
    PATTERN_MILESTONE = (
        r"(?P<name>[A-Za-z0-9\-\s&/]+?)\s+"
        r"(?P<date>(?:[1-4]Q\d{2,4}|\d{4}|[A-Za-z]{3}-\d{4})(?:\s*\([^\)]*\))?)"
    )

    # Keywords whose presence in schedule text suggests commercial gating.
    CHALLENGE_KEYWORDS = r"funding|partners|agreement|RFQ|bid|cancelled|delay|escalat"
    # "City, State 12345 Country" layout.
    PATTERN_GEO = r"^(?P<city>[^,]+),\s*(?P<state>[^\d]+?)\s+(?P<postal>\d+)\s+(?P<country>.+)$"

    def __init__(self) -> None:
        # Memoized compiled regexes, keyed by "pattern:flags".
        self._compiled_patterns: Dict[str, re.Pattern] = {}

    def _get_pattern(self, pattern: str, flags: int = 0) -> re.Pattern:
        """Compile *pattern* once per (pattern, flags) pair and cache it."""
        cache_key = f"{pattern}:{flags}"
        cached = self._compiled_patterns.get(cache_key)
        if cached is None:
            cached = re.compile(pattern, flags)
            self._compiled_patterns[cache_key] = cached
        return cached

    def _find_match(self, text: str, pattern: str, flags: int = 0) -> Optional[str]:
        """Return group(1) of the first match, stripped, or None when absent."""
        hit = self._get_pattern(pattern, flags).search(text)
        if hit is None:
            return None
        return hit.group(1).strip()

    def _find_all_matches(self, text: str, pattern: str, flags: int = 0) -> List[str]:
        """Return group(1) (stripped) for every match of *pattern* in *text*."""
        regex = self._get_pattern(pattern, flags)
        return [hit.group(1).strip() for hit in regex.finditer(text)]

    @staticmethod
    def _money_to_float(value: str) -> Optional[float]:
        """Convert a comma-grouped amount like "1,234" to float; None on failure."""
        try:
            return float(value.replace(",", ""))
        except (ValueError, AttributeError):
            return None

    def _extract_project_manager(self, text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Extract project manager name, company, and email (any may be None)."""
        hit = self._get_pattern(self.PATTERN_PROJECT_MANAGER, re.IGNORECASE).search(text)
        if hit is None:
            return None, None, None

        company = hit.group(1).strip()
        name = hit.group(2).strip()
        email: Optional[str] = None
        # Look for an e-mail address in a short window after the PM match.
        window = text[hit.start():hit.start() + 500]
        email_hit = re.search(self.PATTERN_EMAIL, window)
        if email_hit:
            email = email_hit.group(1)
        logger.info(f"Found Project Manager: {name} ({company})")
        return name, company, email

    def parse(self, text: str, source_name: str) -> ProjectRecord:
        """Parse a report into a ProjectRecord with comprehensive field extraction."""
        # Collapse all whitespace so patterns can span original line breaks.
        flat = re.sub(r"\s+", " ", text)
        find = self._find_match
        IC = re.IGNORECASE
        IC_DOT = re.IGNORECASE | re.DOTALL

        # Identification
        project_id = find(flat, self.PATTERN_PROJECT_ID)
        project_name = find(flat, self.PATTERN_PROJECT_NAME, IC)

        # Classification
        industry_code = find(flat, self.PATTERN_INDUSTRY_CODE, IC)
        project_type = find(flat, self.PATTERN_PROJECT_TYPE, IC)
        sector = find(flat, self.PATTERN_SECTOR, IC)
        sic_code = find(flat, self.PATTERN_SIC_CODE, IC)
        sic_product = find(flat, self.PATTERN_SIC_PRODUCT, IC)

        # Financial: prefer USD, fall back to CNY.
        tiv_usd = find(flat, self.PATTERN_TIV_USD)
        tiv_cny = find(flat, self.PATTERN_TIV_CNY)
        tiv_amount: Optional[float] = None
        tiv_currency: Optional[str] = None
        if tiv_usd:
            tiv_amount, tiv_currency = self._money_to_float(tiv_usd), "USD"
        elif tiv_cny:
            tiv_amount, tiv_currency = self._money_to_float(tiv_cny), "CNY"

        # Status
        status = find(flat, self.PATTERN_STATUS, IC)
        status_reason = find(flat, self.PATTERN_STATUS_REASON, IC)
        project_probability = find(flat, self.PATTERN_PROJECT_PROBABILITY, IC)

        # Timeline
        last_update = find(flat, self.PATTERN_LAST_UPDATE)
        initial_release = find(flat, self.PATTERN_INITIAL_RELEASE)
        pec_timing = find(flat, self.PATTERN_PEC_TIMING, IC)
        pec_activity = find(flat, self.PATTERN_PEC_ACTIVITY, IC)

        # Location
        address = find(flat, self.PATTERN_LOCATION, IC)
        city_state_line = find(flat, self.PATTERN_CITY_STATE, IC)
        zone_county = find(flat, self.PATTERN_ZONE_COUNTY, IC)
        phone = find(flat, self.PATTERN_PHONE)

        # Plant info
        plant_owner = find(flat, self.PATTERN_PLANT_OWNER, IC)
        plant_parent = find(flat, self.PATTERN_PLANT_PARENT, IC)
        plant_name = find(flat, self.PATTERN_PLANT_NAME, IC)
        plant_id = find(flat, self.PATTERN_PLANT_ID)
        unit_name = find(flat, self.PATTERN_UNIT_NAME, IC)

        # Contacts
        project_manager, project_manager_company, project_manager_email = self._extract_project_manager(flat)
        engineer_company = find(flat, self.PATTERN_ENGINEER, IC)
        ec_firm = find(flat, self.PATTERN_EC_FIRM, IC)

        # Technical
        scope_text = find(flat, self.PATTERN_SCOPE, IC_DOT)
        project_capacity = find(flat, self.PATTERN_PROJECT_CAPACITY, IC)
        environmental = find(flat, self.PATTERN_ENVIRONMENTAL, IC)
        construction_labor = find(flat, self.PATTERN_CONSTRUCTION_LABOR, IC)
        operations_labor = find(flat, self.PATTERN_OPERATIONS_LABOR, IC)
        fuel_type = find(flat, self.PATTERN_FUEL_TYPE, IC)

        # Schedule/details (primary pattern first, then looser fallback)
        schedule_text = find(flat, self.PATTERN_SCHEDULE, IC_DOT)
        if not schedule_text:
            schedule_text = find(flat, self.PATTERN_SCHEDULE_FALLBACK, IC_DOT)
        details_text = find(flat, self.PATTERN_DETAILS, IC_DOT)

        # Extraction quality metric over the 13 highest-value fields.
        key_fields = (
            project_id, project_name, industry_code, project_type, sector,
            tiv_amount, status, plant_owner, project_manager, scope_text,
            schedule_text, pec_timing, pec_activity,
        )
        extracted_count = sum(v is not None for v in key_fields)
        logger.info(f"Extracted {extracted_count}/13 key fields from {source_name}")

        return ProjectRecord(
            source=source_name,
            project_id=project_id,
            project_name=project_name,
            industry_code=industry_code,
            project_type=project_type,
            sector=sector,
            sic_code=sic_code,
            sic_product=sic_product,
            tiv_amount=tiv_amount,
            tiv_currency=tiv_currency,
            status=status,
            status_reason=status_reason,
            project_probability=project_probability,
            last_update=last_update,
            initial_release=initial_release,
            pec_timing=pec_timing,
            pec_activity=pec_activity,
            address=address,
            city_state_line=city_state_line,
            zone_county=zone_county,
            plant_owner=plant_owner,
            plant_parent=plant_parent,
            plant_name=plant_name,
            plant_id=plant_id,
            unit_name=unit_name,
            project_manager=project_manager,
            project_manager_company=project_manager_company,
            project_manager_email=project_manager_email,
            engineer_company=engineer_company,
            ec_firm=ec_firm,
            phone=phone,
            scope_text=scope_text,
            project_capacity=project_capacity,
            environmental=environmental,
            construction_labor=construction_labor,
            operations_labor=operations_labor,
            fuel_type=fuel_type,
            schedule_text=schedule_text,
            details_text=details_text,
        )

    def extract_milestones(self, schedule_text: Optional[str]) -> List[Milestone]:
        """Extract milestone-like statements from schedule text."""
        if not schedule_text:
            return []

        found: List[Milestone] = []
        stopwords = ("the", "and", "for", "with")
        for hit in self._get_pattern(self.PATTERN_MILESTONE).finditer(schedule_text):
            label = hit.group("name").strip()
            when = hit.group("date").strip()
            # Skip trivially short labels and bare stopwords.
            if len(label) < 3 or label.lower() in stopwords:
                continue
            context = schedule_text[max(0, hit.start() - 50):hit.end() + 20].strip()
            found.append(Milestone(name=label, date_text=when, sentence=context))

        if not found and schedule_text.strip():
            # Fall back to one catch-all milestone so the schedule isn't lost.
            found.append(Milestone(name="Schedule", date_text="", sentence=schedule_text.strip()[:200]))

        return found

    def derive_challenges(self, record: ProjectRecord) -> List[str]:
        """Derive candidate challenges/constraints from record fields."""
        candidates: List[str] = []
        if record.status_reason:
            candidates.append(f"Status reason: {record.status_reason}")
        if record.details_text:
            candidates.append(record.details_text)
        if record.schedule_text and re.search(self.CHALLENGE_KEYWORDS, record.schedule_text, re.IGNORECASE):
            candidates.append("Dependencies / commercial gating mentioned in schedule (funding, partners, RFQs/bids).")
        if record.status and record.status.lower() == "cancelled":
            candidates.append("Project status is Cancelled.")

        # De-duplicate while preserving first-seen order; drop empty strings.
        stripped = (candidate.strip() for candidate in candidates)
        return list(dict.fromkeys(candidate for candidate in stripped if candidate))

    def parse_city_state_country(self, city_state_line: Optional[str]) -> GeoComponents:
        """Parse City/State line into structured components."""
        if not city_state_line:
            return GeoComponents()

        line = city_state_line.strip()
        hit = self._get_pattern(self.PATTERN_GEO).match(line)
        if hit is None:
            # Unrecognized layout: keep the whole line as the city.
            return GeoComponents(city=line)

        parts = {key: hit.group(key).strip() for key in ("city", "state", "postal", "country")}
        return GeoComponents(**parts)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
# Lazily-created process-wide parser instance.
_default_parser: Optional[ProjectReportParser] = None


def get_parser() -> ProjectReportParser:
    """Get the default parser instance (singleton)."""
    global _default_parser
    parser = _default_parser
    if parser is None:
        parser = ProjectReportParser()
        _default_parser = parser
    return parser
|
src/parsers/smart_chunker.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smart chunking for semi-structured project reports."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from typing import Any, Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from langchain.schema import Document
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class SemanticChunker:
    """Section-aware chunking that respects document structure.

    The chunker detects section headers (Budget, Schedule, ...), then splits
    each section on sentence boundaries with a small sentence overlap, sizing
    chunks adaptively by information density (money amounts, dates, acronyms,
    capacity figures).
    """

    # Regexes (applied MULTILINE) that mark the start of a logical section.
    SECTION_PATTERNS = [
        r"^(?:Project\s+)?(?:ID|Name|Summary|Overview)",
        r"^(?:Budget|TIV|Investment|Cost)",
        r"^(?:Schedule|Timeline|Milestones?)",
        r"^(?:Location|Site|Address)",
        r"^(?:Status|Progress|Update)",
        r"^(?:Details?|Description|Scope)",
        r"^(?:Challenge|Risk|Issue|Constraint)",
        r"^(?:Engineering|Construction|Procurement)",
        r"^(?:Environmental|Regulatory|Permit)",
    ]

    # Patterns whose frequency indicates information-dense text.
    DENSE_INDICATORS = [
        r'\$[\d,]+',
        r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
        r'\b[A-Z]{2,}\b',
        r'\d+\s*(?:MW|GW|tons?|MT|units?|km|miles?)',
    ]

    def __init__(
        self,
        max_chunk_size: int = 1200,
        min_chunk_size: int = 200,
        overlap_sentences: int = 2,
    ) -> None:
        """Configure chunk sizing.

        Args:
            max_chunk_size: Upper bound (chars) for a chunk of sparse text.
            min_chunk_size: Trailing chunks shorter than this are dropped
                unless they are the only chunk of a section.
            overlap_sentences: Sentences carried over between adjacent chunks.
        """
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.overlap_sentences = overlap_sentences
        # Single alternation over all section patterns, compiled once.
        self._section_pattern = re.compile(
            "|".join(f"({p})" for p in self.SECTION_PATTERNS),
            re.IGNORECASE | re.MULTILINE
        )

    def _make_doc(self, header: str, chunk_text: str, source: str) -> Document:
        """Build a chunk Document with the standard metadata payload.

        Factored out because this construction was previously duplicated in
        three places in ``_split_section``.
        """
        return Document(
            page_content=f"[{header}] {chunk_text}",
            metadata={
                "source": source,
                "section": header,
                "chunk_size": len(chunk_text),
                "density": self._calculate_density(chunk_text),
            }
        )

    def _detect_sections(self, text: str) -> List[Dict[str, Any]]:
        """Identify section boundaries in document.

        Returns a list of dicts with ``header``/``start``/``end``/``content``;
        when no known header matches, the whole text becomes one "Document"
        section.
        """
        sections: List[Dict[str, Any]] = []
        matches = list(self._section_pattern.finditer(text))

        for i, match in enumerate(matches):
            start = match.start()
            # Each section runs until the next header (or end of text).
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            sections.append({
                "header": match.group().strip(),
                "start": start,
                "end": end,
                "content": text[start:end].strip()
            })

        if not sections:
            sections.append({
                "header": "Document",
                "start": 0,
                "end": len(text),
                "content": text.strip()
            })

        return sections

    def _calculate_density(self, text: str) -> float:
        """Calculate information density of text (indicator matches per 100 chars)."""
        total_matches = sum(len(re.findall(p, text)) for p in self.DENSE_INDICATORS)
        return (total_matches / max(len(text), 1)) * 100

    def _optimal_chunk_size(self, text: str) -> int:
        """Determine optimal chunk size based on content density.

        Denser text gets smaller chunks so each stays focused.
        """
        density = self._calculate_density(text)
        if density > 5:
            return 600
        elif density > 2:
            return 900
        return 1200

    def _split_section(
        self,
        section: Dict[str, Any],
        source: str,
        chunk_size: Optional[int] = None
    ) -> List[Document]:
        """Split a section into appropriately sized chunks.

        Splits on sentence boundaries, carrying ``overlap_sentences`` trailing
        sentences into the next chunk for context continuity.
        """
        content = section["content"]
        header = section["header"]
        effective_chunk_size = chunk_size or self.max_chunk_size

        # Short sections become a single chunk.
        if len(content) <= effective_chunk_size:
            return [self._make_doc(header, content, source)]

        sentences = re.split(r'(?<=[.!?])\s+', content)
        chunks: List[Document] = []
        current_chunk: List[str] = []
        current_length = 0

        for sentence in sentences:
            sentence_len = len(sentence)

            if current_length + sentence_len > effective_chunk_size and current_chunk:
                chunks.append(self._make_doc(header, " ".join(current_chunk), source))
                # Carry the last few sentences forward as overlap.
                current_chunk = current_chunk[-self.overlap_sentences:]
                current_length = sum(len(s) for s in current_chunk)

            current_chunk.append(sentence)
            current_length += sentence_len

        if current_chunk:
            chunk_text = " ".join(current_chunk)
            # Keep the tail unless it is tiny and we already emitted chunks.
            if len(chunk_text) >= self.min_chunk_size or not chunks:
                chunks.append(self._make_doc(header, chunk_text, source))

        return chunks

    def chunk_document(self, text: str, source: str, adaptive: bool = True) -> List[Document]:
        """Chunk document respecting section boundaries.

        Args:
            text: Full document text.
            source: Source name recorded in chunk metadata.
            adaptive: When True, chunk size shrinks for dense sections.
        """
        sections = self._detect_sections(text)
        all_chunks: List[Document] = []

        for section in sections:
            chunk_size = self._optimal_chunk_size(section["content"]) if adaptive else self.max_chunk_size
            chunks = self._split_section(section, source, chunk_size)
            all_chunks.extend(chunks)

        return all_chunks

    def chunk_pages(self, pages: List[Document], adaptive: bool = True) -> List[Document]:
        """Chunk a list of page Documents, annotating each chunk with a page number."""
        if not pages:
            return []

        source = pages[0].metadata.get("source", "document.pdf")
        full_text = ""
        page_boundaries: List[int] = []

        for page in pages:
            page_boundaries.append(len(full_text))
            full_text += page.page_content + "\n\n"

        chunks = self.chunk_document(full_text, source, adaptive)

        for chunk in chunks:
            # Heuristic: locate the chunk's first 50 chars (header stripped)
            # in the concatenated text to map it back to a page.
            # NOTE(review): a repeated prefix could match an earlier page.
            chunk_start = full_text.find(
                chunk.page_content.replace(f"[{chunk.metadata.get('section', '')}] ", "")[:50]
            )
            if chunk_start >= 0:
                page_num = 1
                for i, boundary in enumerate(page_boundaries):
                    if chunk_start >= boundary:
                        page_num = i + 1
                chunk.metadata["page"] = page_num

        return chunks
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# Lazily-created process-wide chunker instance.
_default_chunker: Optional[SemanticChunker] = None


def get_chunker() -> SemanticChunker:
    """Get the default chunker instance (singleton)."""
    global _default_chunker
    chunker = _default_chunker
    if chunker is None:
        chunker = SemanticChunker()
        _default_chunker = chunker
    return chunker
|
src/services/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core services for GraphRAG application."""
|
| 2 |
+
|
| 3 |
+
from src.services.neo4j_service import Neo4jService, Neo4jConnectionError
|
| 4 |
+
from src.services.builder import GraphRAGBuilder
|
| 5 |
+
from src.services.answerer import QueryAnswerer
|
| 6 |
+
from src.services.retriever import OptimizedRetriever
|
| 7 |
+
from src.services.cache import QueryCache, AnswerCache, get_query_cache, get_answer_cache
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"Neo4jService",
|
| 11 |
+
"Neo4jConnectionError",
|
| 12 |
+
"GraphRAGBuilder",
|
| 13 |
+
"QueryAnswerer",
|
| 14 |
+
"OptimizedRetriever",
|
| 15 |
+
"QueryCache",
|
| 16 |
+
"AnswerCache",
|
| 17 |
+
"get_query_cache",
|
| 18 |
+
"get_answer_cache",
|
| 19 |
+
]
|
src/services/answerer.py
ADDED
|
@@ -0,0 +1,498 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Query answering service with hybrid strategy."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
| 6 |
+
|
| 7 |
+
from langchain.schema import Document
|
| 8 |
+
|
| 9 |
+
from src.config import get_logger, trace_flow, log_step
|
| 10 |
+
from src.models.state import AppState
|
| 11 |
+
from src.services.retriever import OptimizedRetriever
|
| 12 |
+
from src.services.cache import AnswerCache, get_answer_cache
|
| 13 |
+
from src.services.cypher_templates import (
|
| 14 |
+
CypherTemplateRouter,
|
| 15 |
+
TemplateResultFormatter,
|
| 16 |
+
QueryIntent,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# Module logger
|
| 20 |
+
logger = get_logger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class QueryAnswerer:
    """Answers user questions using an optimized hybrid strategy.

    Strategy:
    1) Template-first routing: Pattern matching classifies intent and
       executes pre-validated Cypher templates for most queries.
       This is deterministic, fast, and reliable.

    2) For general queries: GraphRAG with optimized retrieval:
       - Pattern-based query expansion (no LLM)
       - Cross-encoder reranking (faster than LLM)
       - Single LLM call for synthesis only
    """

    # Default retrieval settings
    DEFAULT_K = 6

    # NOTE(review): the three CYPHER_* constants below were referenced by
    # _budget_location/_timelines/_challenges but were never defined on this
    # class, so those methods raised AttributeError. They are reconstructed
    # here from the graph schema used elsewhere in this package
    # (Project -[:HAS_BUDGET]-> Budget, -[:LOCATED_IN]-> Location,
    # -[:HAS_MILESTONE]-> Milestone, -[:HAS_CHALLENGE]-> Challenge) --
    # verify against the actual graph schema before relying on them.

    # One row per project: budget figure plus location components.
    CYPHER_BUDGET_LOCATION = """
MATCH (p:Project)
OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
RETURN p.name AS project,
       b.amount AS budget,
       b.currency AS currency,
       l.address AS address,
       l.city AS city,
       l.state AS state,
       l.postal AS postal,
       l.country AS country
ORDER BY p.name
""".strip()

    # One row per project with its milestones collected into maps
    # (name/dateText/sentence keys are what _timelines consumes).
    CYPHER_TIMELINES = """
MATCH (p:Project)
OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
RETURN p.name AS project,
       collect({name: m.name, dateText: m.dateText, sentence: m.sentence}) AS milestones
ORDER BY p.name
""".strip()

    # One row per project with its challenge texts collected.
    CYPHER_CHALLENGES = """
MATCH (p:Project)
OPTIONAL MATCH (p)-[:HAS_CHALLENGE]->(c:Challenge)
RETURN p.name AS project,
       collect(c.text) AS challenges
ORDER BY p.name
""".strip()

    # Optimized synthesis prompt (simpler, more focused)
    SYNTHESIS_PROMPT = """You are an expert analyst for industrial project reports.

## Question
{question}

## Retrieved Document Excerpts
{context}

## Graph Database Context
{graph_context}

## Instructions
1. Answer directly and concisely based on the evidence
2. If information is incomplete, acknowledge what's missing
3. For comparison questions, structure answer by project
4. Use citations like [1], [2] to reference sources
5. For challenges/risks, consider: cancellation reasons, delays, funding issues, permitting

Answer:""".strip()
def __init__(
|
| 62 |
+
self,
|
| 63 |
+
k: int = DEFAULT_K,
|
| 64 |
+
use_optimized_retrieval: bool = True,
|
| 65 |
+
use_caching: bool = True,
|
| 66 |
+
cache_ttl: float = 3600,
|
| 67 |
+
use_reranking: bool = True,
|
| 68 |
+
) -> None:
|
| 69 |
+
"""Initialize query answerer.
|
| 70 |
+
|
| 71 |
+
Args:
|
| 72 |
+
k: Number of chunks to retrieve for similarity search.
|
| 73 |
+
use_optimized_retrieval: If True, uses fast pattern-based expansion
|
| 74 |
+
and cross-encoder reranking. If False, uses original LLM-based.
|
| 75 |
+
use_caching: If True, caches answers for repeated queries.
|
| 76 |
+
cache_ttl: Cache time-to-live.
|
| 77 |
+
use_reranking: If True, uses cross-encoder reranking.
|
| 78 |
+
"""
|
| 79 |
+
self.k = k
|
| 80 |
+
self.use_optimized_retrieval = use_optimized_retrieval
|
| 81 |
+
self.use_caching = use_caching
|
| 82 |
+
self.use_reranking = use_reranking
|
| 83 |
+
self._retriever: Optional[OptimizedRetriever] = None
|
| 84 |
+
self._cache: Optional[AnswerCache] = None
|
| 85 |
+
|
| 86 |
+
# Initialize template router for fast intent classification
|
| 87 |
+
self._template_router = CypherTemplateRouter()
|
| 88 |
+
|
| 89 |
+
if use_caching:
|
| 90 |
+
self._cache = get_answer_cache(default_ttl=cache_ttl)
|
| 91 |
+
|
| 92 |
+
def _format_citations(self, docs: List[Document]) -> str:
|
| 93 |
+
"""Format unique citations from retrieved chunk documents.
|
| 94 |
+
|
| 95 |
+
Args:
|
| 96 |
+
docs: List of retrieved documents.
|
| 97 |
+
|
| 98 |
+
Returns:
|
| 99 |
+
Formatted citation string.
|
| 100 |
+
"""
|
| 101 |
+
seen: Set[Tuple[str, Optional[int]]] = set()
|
| 102 |
+
lines: List[str] = []
|
| 103 |
+
|
| 104 |
+
for doc in docs:
|
| 105 |
+
src = doc.metadata.get("source", "")
|
| 106 |
+
page = doc.metadata.get("page", None)
|
| 107 |
+
key = (src, page)
|
| 108 |
+
|
| 109 |
+
if key in seen:
|
| 110 |
+
continue
|
| 111 |
+
seen.add(key)
|
| 112 |
+
|
| 113 |
+
if page is not None:
|
| 114 |
+
lines.append(f"- {src} p.{page}")
|
| 115 |
+
else:
|
| 116 |
+
lines.append(f"- {src}")
|
| 117 |
+
|
| 118 |
+
return "\n".join(lines)
|
| 119 |
+
|
| 120 |
+
def _format_budget_value(
|
| 121 |
+
self,
|
| 122 |
+
budget: Optional[Any],
|
| 123 |
+
currency: Optional[str]
|
| 124 |
+
) -> str:
|
| 125 |
+
"""Format budget value for display.
|
| 126 |
+
|
| 127 |
+
Args:
|
| 128 |
+
budget: Budget amount (may be None or numeric).
|
| 129 |
+
currency: Currency code.
|
| 130 |
+
|
| 131 |
+
Returns:
|
| 132 |
+
Formatted budget string.
|
| 133 |
+
"""
|
| 134 |
+
if isinstance(budget, (int, float)) and currency:
|
| 135 |
+
return f"{budget:,.0f} {currency}"
|
| 136 |
+
elif budget:
|
| 137 |
+
return str(budget)
|
| 138 |
+
return "—"
|
| 139 |
+
|
| 140 |
+
def _format_location(self, row: Dict[str, Any]) -> str:
|
| 141 |
+
"""Format location components into a string.
|
| 142 |
+
|
| 143 |
+
Args:
|
| 144 |
+
row: Query result row with location fields.
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
Formatted location string.
|
| 148 |
+
"""
|
| 149 |
+
loc_parts = [
|
| 150 |
+
x for x in [
|
| 151 |
+
row.get("address"),
|
| 152 |
+
row.get("city"),
|
| 153 |
+
row.get("state"),
|
| 154 |
+
row.get("postal"),
|
| 155 |
+
row.get("country"),
|
| 156 |
+
] if x
|
| 157 |
+
]
|
| 158 |
+
return ", ".join(loc_parts) if loc_parts else "—"
|
| 159 |
+
|
| 160 |
+
def _budget_location(self, graph: Any) -> str:
|
| 161 |
+
"""Deterministic answer for budget allocation and location.
|
| 162 |
+
|
| 163 |
+
Args:
|
| 164 |
+
graph: Neo4jGraph instance.
|
| 165 |
+
|
| 166 |
+
Returns:
|
| 167 |
+
Formatted budget and location answer.
|
| 168 |
+
"""
|
| 169 |
+
rows = graph.query(self.CYPHER_BUDGET_LOCATION)
|
| 170 |
+
|
| 171 |
+
if not rows:
|
| 172 |
+
return "No structured budget/location data found in the graph yet."
|
| 173 |
+
|
| 174 |
+
out = ["**Budget allocation (TIV) and location**"]
|
| 175 |
+
for row in rows:
|
| 176 |
+
budget_str = self._format_budget_value(
|
| 177 |
+
row.get("budget"),
|
| 178 |
+
row.get("currency"),
|
| 179 |
+
)
|
| 180 |
+
loc = self._format_location(row)
|
| 181 |
+
out.append(f"- **{row.get('project')}**: {budget_str}; {loc}")
|
| 182 |
+
|
| 183 |
+
return "\n".join(out)
|
| 184 |
+
|
| 185 |
+
def _timelines(self, graph: Any) -> str:
|
| 186 |
+
"""Deterministic timeline comparison using extracted milestones.
|
| 187 |
+
|
| 188 |
+
Args:
|
| 189 |
+
graph: Neo4jGraph instance.
|
| 190 |
+
|
| 191 |
+
Returns:
|
| 192 |
+
Formatted timeline answer.
|
| 193 |
+
"""
|
| 194 |
+
rows = graph.query(self.CYPHER_TIMELINES)
|
| 195 |
+
logger.info(f"Timeline query returned {len(rows) if rows else 0} rows")
|
| 196 |
+
|
| 197 |
+
if not rows:
|
| 198 |
+
return "No structured timeline data found in the graph yet."
|
| 199 |
+
|
| 200 |
+
out = ["**Timelines (milestones extracted from Schedule)**"]
|
| 201 |
+
for row in rows:
|
| 202 |
+
project_name = row.get('project') or 'Unknown Project'
|
| 203 |
+
out.append(f"\n### {project_name}")
|
| 204 |
+
milestones = row.get("milestones") or []
|
| 205 |
+
logger.info(f"Project '{project_name}': {len(milestones)} milestones raw")
|
| 206 |
+
|
| 207 |
+
# Filter out null milestones (from OPTIONAL MATCH returning nulls)
|
| 208 |
+
valid_milestones = [m for m in milestones if m and m.get("name")]
|
| 209 |
+
logger.info(f"Project '{project_name}': {len(valid_milestones)} valid milestones")
|
| 210 |
+
|
| 211 |
+
if not valid_milestones:
|
| 212 |
+
out.append("- No milestones extracted")
|
| 213 |
+
else:
|
| 214 |
+
for m in valid_milestones[:14]: # Limit display
|
| 215 |
+
dt = (m.get("dateText") or "").strip()
|
| 216 |
+
nm = (m.get("name") or "Milestone").strip()
|
| 217 |
+
if dt:
|
| 218 |
+
out.append(f"- {nm}: {dt}")
|
| 219 |
+
else:
|
| 220 |
+
sent = m.get('sentence') or ''
|
| 221 |
+
out.append(f"- {nm}: {sent[:100]}")
|
| 222 |
+
|
| 223 |
+
result = "\n".join(out)
|
| 224 |
+
logger.info(f"Timeline result: {len(result)} chars")
|
| 225 |
+
return result
|
| 226 |
+
|
| 227 |
+
def _challenges(self, graph: Any) -> str:
|
| 228 |
+
"""Deterministic challenges listing from structured Challenge nodes.
|
| 229 |
+
|
| 230 |
+
Args:
|
| 231 |
+
graph: Neo4jGraph instance.
|
| 232 |
+
|
| 233 |
+
Returns:
|
| 234 |
+
Formatted challenges answer.
|
| 235 |
+
"""
|
| 236 |
+
rows = graph.query(self.CYPHER_CHALLENGES)
|
| 237 |
+
|
| 238 |
+
if not rows:
|
| 239 |
+
return "No structured challenges found yet."
|
| 240 |
+
|
| 241 |
+
out = [
|
| 242 |
+
"**Potential challenges / constraints "
|
| 243 |
+
"(from Status reason + Details + schedule heuristics)**"
|
| 244 |
+
]
|
| 245 |
+
for row in rows:
|
| 246 |
+
out.append(f"\n### {row['project']}")
|
| 247 |
+
challenges = [x for x in (row.get("challenges") or []) if x]
|
| 248 |
+
|
| 249 |
+
if not challenges:
|
| 250 |
+
out.append("- —")
|
| 251 |
+
else:
|
| 252 |
+
for ch in challenges[:14]: # Limit display
|
| 253 |
+
out.append(f"- {ch}")
|
| 254 |
+
|
| 255 |
+
return "\n".join(out)
|
| 256 |
+
|
| 257 |
+
def _get_graph_context(self, question: str, graph: Any) -> str:
|
| 258 |
+
"""Get relevant graph context without LLM Cypher generation.
|
| 259 |
+
|
| 260 |
+
Uses simple pattern matching to find related entities.
|
| 261 |
+
|
| 262 |
+
Args:
|
| 263 |
+
question: User question
|
| 264 |
+
graph: Neo4j graph instance
|
| 265 |
+
|
| 266 |
+
Returns:
|
| 267 |
+
Formatted graph context string
|
| 268 |
+
"""
|
| 269 |
+
import re
|
| 270 |
+
|
| 271 |
+
# Extract potential project names from question
|
| 272 |
+
potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)
|
| 273 |
+
|
| 274 |
+
if not potential_names:
|
| 275 |
+
return ""
|
| 276 |
+
|
| 277 |
+
context_parts = []
|
| 278 |
+
|
| 279 |
+
for name in potential_names[:2]:
|
| 280 |
+
try:
|
| 281 |
+
results = graph.query("""
|
| 282 |
+
MATCH (p:Project)
|
| 283 |
+
WHERE toLower(p.name) CONTAINS toLower($name)
|
| 284 |
+
OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
|
| 285 |
+
OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
|
| 286 |
+
RETURN p.name AS project,
|
| 287 |
+
p.status AS status,
|
| 288 |
+
b.amount AS budget,
|
| 289 |
+
b.currency AS currency,
|
| 290 |
+
l.city AS city,
|
| 291 |
+
l.country AS country
|
| 292 |
+
LIMIT 3
|
| 293 |
+
""", {"name": name.lower()})
|
| 294 |
+
|
| 295 |
+
for r in results:
|
| 296 |
+
parts = [f"**{r['project']}**"]
|
| 297 |
+
if r.get('status'):
|
| 298 |
+
parts.append(f"Status: {r['status']}")
|
| 299 |
+
if r.get('budget'):
|
| 300 |
+
parts.append(f"Budget: {r['budget']:,.0f} {r.get('currency', '')}")
|
| 301 |
+
if r.get('city'):
|
| 302 |
+
parts.append(f"Location: {r['city']}, {r.get('country', '')}")
|
| 303 |
+
context_parts.append(" | ".join(parts))
|
| 304 |
+
|
| 305 |
+
except Exception:
|
| 306 |
+
pass
|
| 307 |
+
|
| 308 |
+
return "\n".join(context_parts) if context_parts else ""
|
| 309 |
+
|
| 310 |
+
def _get_retriever(self, state: AppState) -> OptimizedRetriever:
|
| 311 |
+
"""Get or create the optimized retriever.
|
| 312 |
+
|
| 313 |
+
Args:
|
| 314 |
+
state: Application state with vector store.
|
| 315 |
+
|
| 316 |
+
Returns:
|
| 317 |
+
OptimizedRetriever instance (fast pattern-based + cross-encoder).
|
| 318 |
+
"""
|
| 319 |
+
if self._retriever is None:
|
| 320 |
+
self._retriever = OptimizedRetriever(
|
| 321 |
+
vector_store=state.vector,
|
| 322 |
+
k_initial=self.k * 2, # Retrieve more initially for reranking
|
| 323 |
+
k_final=self.k,
|
| 324 |
+
use_expansion=True,
|
| 325 |
+
use_reranking=self.use_reranking,
|
| 326 |
+
use_cache=True,
|
| 327 |
+
)
|
| 328 |
+
return self._retriever
|
| 329 |
+
|
| 330 |
+
def _format_context(self, docs: List[Document]) -> str:
|
| 331 |
+
"""Format retrieved documents into context string.
|
| 332 |
+
|
| 333 |
+
Args:
|
| 334 |
+
docs: List of retrieved documents.
|
| 335 |
+
|
| 336 |
+
Returns:
|
| 337 |
+
Formatted context string with source attribution.
|
| 338 |
+
"""
|
| 339 |
+
context_parts = []
|
| 340 |
+
for i, doc in enumerate(docs, 1):
|
| 341 |
+
source = doc.metadata.get('source', 'Unknown')
|
| 342 |
+
page = doc.metadata.get('page', '?')
|
| 343 |
+
section = doc.metadata.get('section', '')
|
| 344 |
+
|
| 345 |
+
header = f"[{i}] Source: {source}, Page {page}"
|
| 346 |
+
if section:
|
| 347 |
+
header += f", Section: {section}"
|
| 348 |
+
|
| 349 |
+
context_parts.append(f"{header}\n{doc.page_content}")
|
| 350 |
+
|
| 351 |
+
return "\n\n---\n\n".join(context_parts)
|
| 352 |
+
|
| 353 |
+
def _graphrag_answer(
|
| 354 |
+
self,
|
| 355 |
+
question: str,
|
| 356 |
+
state: AppState,
|
| 357 |
+
) -> str:
|
| 358 |
+
"""Generate answer using optimized GraphRAG approach.
|
| 359 |
+
|
| 360 |
+
Optimized flow:
|
| 361 |
+
1. Retrieve with optimized retriever (pattern expansion + cross-encoder)
|
| 362 |
+
2. Get graph context (no LLM Cypher generation)
|
| 363 |
+
3. Single LLM call for synthesis
|
| 364 |
+
|
| 365 |
+
Args:
|
| 366 |
+
question: User question.
|
| 367 |
+
state: Application state.
|
| 368 |
+
|
| 369 |
+
Returns:
|
| 370 |
+
Synthesized answer with citations.
|
| 371 |
+
"""
|
| 372 |
+
with log_step(logger, "GraphRAG answer generation"):
|
| 373 |
+
# Retrieve relevant chunks with optimized retriever
|
| 374 |
+
with log_step(logger, "Retrieve relevant chunks"):
|
| 375 |
+
if self.use_optimized_retrieval:
|
| 376 |
+
logger.substep("Using optimized retrieval (pattern expansion + cross-encoder)")
|
| 377 |
+
retriever = self._get_retriever(state)
|
| 378 |
+
docs = retriever.retrieve(question)
|
| 379 |
+
else:
|
| 380 |
+
logger.substep("Using simple similarity search")
|
| 381 |
+
docs = state.vector.similarity_search(question, k=self.k)
|
| 382 |
+
logger.info(f"Retrieved {len(docs)} chunks")
|
| 383 |
+
|
| 384 |
+
# Get graph context (fast, no LLM)
|
| 385 |
+
with log_step(logger, "Get graph context"):
|
| 386 |
+
graph = state.get_graph()
|
| 387 |
+
graph_context = self._get_graph_context(question, graph)
|
| 388 |
+
if graph_context:
|
| 389 |
+
logger.substep(f"Found graph context")
|
| 390 |
+
else:
|
| 391 |
+
logger.substep("No direct graph context found")
|
| 392 |
+
|
| 393 |
+
# Format context
|
| 394 |
+
context = self._format_context(docs)
|
| 395 |
+
|
| 396 |
+
# Single LLM call for synthesis
|
| 397 |
+
with log_step(logger, "Synthesize answer"):
|
| 398 |
+
logger.substep("Invoking LLM for synthesis")
|
| 399 |
+
synthesis_prompt = self.SYNTHESIS_PROMPT.format(
|
| 400 |
+
question=question,
|
| 401 |
+
context=context,
|
| 402 |
+
graph_context=graph_context if graph_context else "(No structured data found)",
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
+
resp = state.llm.invoke(synthesis_prompt)
|
| 406 |
+
answer = getattr(resp, "content", str(resp))
|
| 407 |
+
|
| 408 |
+
# Cache the answer
|
| 409 |
+
if self._cache and self.use_caching:
|
| 410 |
+
logger.substep("Caching answer")
|
| 411 |
+
self._cache.set_answer(
|
| 412 |
+
query=question,
|
| 413 |
+
answer=answer,
|
| 414 |
+
documents=docs,
|
| 415 |
+
cypher_result=graph_context,
|
| 416 |
+
)
|
| 417 |
+
|
| 418 |
+
return answer
|
| 419 |
+
|
| 420 |
+
def clear_cache(self) -> int:
|
| 421 |
+
"""Clear the answer cache.
|
| 422 |
+
|
| 423 |
+
Returns:
|
| 424 |
+
Number of cached entries cleared.
|
| 425 |
+
"""
|
| 426 |
+
if self._cache:
|
| 427 |
+
return self._cache.invalidate_all()
|
| 428 |
+
return 0
|
| 429 |
+
|
| 430 |
+
def get_cache_stats(self) -> Dict[str, Any]:
|
| 431 |
+
"""Get cache statistics.
|
| 432 |
+
|
| 433 |
+
Returns:
|
| 434 |
+
Dictionary with cache metrics.
|
| 435 |
+
"""
|
| 436 |
+
if self._cache:
|
| 437 |
+
return self._cache.get_stats()
|
| 438 |
+
return {"caching_enabled": False}
|
| 439 |
+
|
| 440 |
+
@trace_flow("Query Processing")
|
| 441 |
+
def answer(self, question: str, state: AppState) -> str:
|
| 442 |
+
"""Answer a user question using optimized hybrid approach.
|
| 443 |
+
|
| 444 |
+
Flow:
|
| 445 |
+
1. Check answer cache
|
| 446 |
+
2. Template routing with pattern classification
|
| 447 |
+
3. For structured queries: Execute template + format
|
| 448 |
+
4. For general queries: Vector search + rerank + synthesis
|
| 449 |
+
|
| 450 |
+
Args:
|
| 451 |
+
question: Natural language user query.
|
| 452 |
+
state: AppState initialized after successful ingestion.
|
| 453 |
+
|
| 454 |
+
Returns:
|
| 455 |
+
Markdown response suitable for display.
|
| 456 |
+
"""
|
| 457 |
+
logger.info(f"Processing question: {question[:80]}...")
|
| 458 |
+
|
| 459 |
+
if not state or not state.is_ready():
|
| 460 |
+
logger.warning("State not ready - PDFs not ingested")
|
| 461 |
+
return "Please ingest PDFs first."
|
| 462 |
+
|
| 463 |
+
# Check cache first
|
| 464 |
+
if self._cache and self.use_caching:
|
| 465 |
+
with log_step(logger, "Check cache"):
|
| 466 |
+
cached = self._cache.get_answer(question)
|
| 467 |
+
if cached:
|
| 468 |
+
logger.info("Cache hit")
|
| 469 |
+
return cached.answer
|
| 470 |
+
|
| 471 |
+
graph = state.get_graph()
|
| 472 |
+
|
| 473 |
+
# Try template routing first (handles 70-80% of queries)
|
| 474 |
+
with log_step(logger, "Template routing"):
|
| 475 |
+
results, intent = self._template_router.route_query(question, graph)
|
| 476 |
+
|
| 477 |
+
if intent != QueryIntent.GENERAL and results is not None:
|
| 478 |
+
# Format template results (no LLM needed)
|
| 479 |
+
answer = TemplateResultFormatter.format(results, intent)
|
| 480 |
+
|
| 481 |
+
# Cache the answer
|
| 482 |
+
if self._cache and self.use_caching:
|
| 483 |
+
self._cache.set_answer(
|
| 484 |
+
query=question,
|
| 485 |
+
answer=answer,
|
| 486 |
+
documents=[],
|
| 487 |
+
cypher_result=str(results[:3]) if results else "",
|
| 488 |
+
)
|
| 489 |
+
|
| 490 |
+
logger.info(f"Template answer (intent: {intent.value})")
|
| 491 |
+
return answer
|
| 492 |
+
|
| 493 |
+
logger.info(f"Intent: {intent.value} - using RAG fallback")
|
| 494 |
+
|
| 495 |
+
# GraphRAG fallback for general queries
|
| 496 |
+
answer = self._graphrag_answer(question, state)
|
| 497 |
+
logger.info("RAG answer generated")
|
| 498 |
+
return answer
|
src/services/builder.py
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GraphRAG builder for PDF ingestion."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 8 |
+
from typing import Any, Dict, Generator, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
from langchain.schema import Document
|
| 11 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 12 |
+
from langchain.prompts import PromptTemplate
|
| 13 |
+
|
| 14 |
+
from src.config import get_logger, trace_flow, log_step
|
| 15 |
+
|
| 16 |
+
# LangChain imports with compatibility handling
|
| 17 |
+
try:
|
| 18 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 19 |
+
from langchain_community.vectorstores import Neo4jVector
|
| 20 |
+
except ImportError:
|
| 21 |
+
from langchain.document_loaders import PyPDFLoader
|
| 22 |
+
from langchain.vectorstores import Neo4jVector
|
| 23 |
+
|
| 24 |
+
from langchain_experimental.graph_transformers import LLMGraphTransformer
|
| 25 |
+
from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain
|
| 26 |
+
from langchain_together import ChatTogether, TogetherEmbeddings
|
| 27 |
+
|
| 28 |
+
from src.config.schema import SchemaPolicy
|
| 29 |
+
from src.config.settings import Neo4jConfig, TogetherAIConfig
|
| 30 |
+
from src.models.state import AppState
|
| 31 |
+
from src.parsers.project_parser import ProjectReportParser
|
| 32 |
+
from src.parsers.smart_chunker import SemanticChunker
|
| 33 |
+
from src.services.neo4j_service import Neo4jService, Neo4jConnectionError
|
| 34 |
+
|
| 35 |
+
# Module logger
|
| 36 |
+
logger = get_logger(__name__)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class GraphRAGBuilder:
    """Builds and populates Neo4j-backed GraphRAG resources from uploaded PDFs.

    Responsibilities:
    - Configure Together AI chat + embeddings models.
    - Parse PDFs into pages and chunks with provenance metadata.
    - Upsert deterministic structured graph nodes for stable Q/A.
    - Run LLMGraphTransformer for broader entity/relationship extraction.
    - Create/refresh Neo4jVector hybrid indexes.
    - Create GraphCypherQAChain for graph-native Q/A.

    This class is intentionally stateless across runs; it returns AppState
    for query-time usage.

    Attributes:
        llm: Chat model instance.
        embeddings: Embeddings model instance.

    Example:
        >>> builder = GraphRAGBuilder(
        ...     together_config=TogetherAIConfig(api_key="key")
        ... )
        >>> message, state = builder.ingest(pdf_files, neo4j_config)
    """

    # Chunking defaults (characters).
    DEFAULT_CHUNK_SIZE = 900
    DEFAULT_CHUNK_OVERLAP = 150

    # Parallel extraction tuning (batch size and worker count were raised
    # from 5/3 for throughput).
    EXTRACTION_BATCH_SIZE = 8
    MAX_EXTRACTION_WORKERS = 5

    # Names of the hybrid vector/keyword indexes and the chunk node label.
    INDEX_NAME = "project_chunks_vector"
    KEYWORD_INDEX_NAME = "project_chunks_keyword"
    NODE_LABEL = "Chunk"

    # Cypher-generation prompt with worked pattern examples.
    CYPHER_PROMPT_TEMPLATE = """You are a Neo4j Cypher expert. Generate a Cypher query to answer the question.

## Schema
{schema}

## Key Patterns

1. **Project with Budget and Location:**
```cypher
MATCH (p:Project)
OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
RETURN p.name, b.amount, b.currency, l.city, l.country
```

2. **Project Milestones/Timeline:**
```cypher
MATCH (p:Project)-[:HAS_MILESTONE]->(m:Milestone)
RETURN p.name, m.name AS milestone, m.dateText
ORDER BY p.name, m.dateText
```

3. **Challenges and Risks:**
```cypher
MATCH (p:Project)-[:HAS_CHALLENGE]->(c:Challenge)
RETURN p.name, collect(c.text) AS challenges
```

4. **Cross-Project Comparison:**
```cypher
MATCH (p:Project)
OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
WITH p, b, collect(m) AS milestones
RETURN p.name, b.amount, size(milestones) AS milestone_count
ORDER BY b.amount DESC
```

5. **Entity Relationships:**
```cypher
MATCH (p:Project)-[r]->(related)
WHERE NOT related:Chunk
RETURN p.name, type(r) AS relationship, labels(related)[0] AS entity_type,
       coalesce(related.name, related.text, related.amount) AS value
LIMIT 50
```

## Rules
- Use OPTIONAL MATCH when relationships may not exist
- Always include ORDER BY for consistent results
- Use collect() to aggregate multiple related nodes
- Limit results if the query could return many rows
- Return human-readable names, not IDs
- For comparisons across projects, ensure all projects are included

## Question
{question}

Return ONLY the Cypher query, no explanation.""".strip()
def __init__(
|
| 139 |
+
self,
|
| 140 |
+
together_config: Optional[TogetherAIConfig] = None,
|
| 141 |
+
together_api_key: Optional[str] = None,
|
| 142 |
+
chat_model: str = "deepseek-ai/DeepSeek-V3",
|
| 143 |
+
embedding_model: str = "togethercomputer/m2-bert-80M-8k-retrieval",
|
| 144 |
+
) -> None:
|
| 145 |
+
"""Initialize GraphRAG builder.
|
| 146 |
+
|
| 147 |
+
Args:
|
| 148 |
+
together_config: Together AI configuration object.
|
| 149 |
+
together_api_key: API key (alternative to config object).
|
| 150 |
+
chat_model: Chat model identifier.
|
| 151 |
+
embedding_model: Embedding model identifier.
|
| 152 |
+
|
| 153 |
+
Raises:
|
| 154 |
+
ValueError: If no API key is provided.
|
| 155 |
+
"""
|
| 156 |
+
# Handle configuration
|
| 157 |
+
if together_config:
|
| 158 |
+
api_key = together_config.api_key
|
| 159 |
+
chat_model = together_config.chat_model or chat_model
|
| 160 |
+
embedding_model = together_config.embedding_model or embedding_model
|
| 161 |
+
else:
|
| 162 |
+
api_key = together_api_key
|
| 163 |
+
|
| 164 |
+
if not api_key:
|
| 165 |
+
raise ValueError("Together API key is required.")
|
| 166 |
+
|
| 167 |
+
# Set environment variable for SDK
|
| 168 |
+
os.environ["TOGETHER_API_KEY"] = api_key
|
| 169 |
+
|
| 170 |
+
# Initialize models
|
| 171 |
+
self.llm = ChatTogether(model=chat_model, temperature=0)
|
| 172 |
+
self.embeddings = TogetherEmbeddings(model=embedding_model)
|
| 173 |
+
|
| 174 |
+
# Initialize parsers and chunkers
|
| 175 |
+
self._parser = ProjectReportParser()
|
| 176 |
+
self._chunker = SemanticChunker(
|
| 177 |
+
max_chunk_size=self.DEFAULT_CHUNK_SIZE + 300, # Slightly larger for semantic chunks
|
| 178 |
+
min_chunk_size=200,
|
| 179 |
+
overlap_sentences=2,
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
def _load_pdf_pages(
|
| 183 |
+
self,
|
| 184 |
+
pdf_files: List[Any]
|
| 185 |
+
) -> Tuple[List[Document], List[Tuple[str, str]]]:
|
| 186 |
+
"""Load PDF files and extract pages with metadata.
|
| 187 |
+
|
| 188 |
+
Args:
|
| 189 |
+
pdf_files: List of gradio-uploaded file handles.
|
| 190 |
+
|
| 191 |
+
Returns:
|
| 192 |
+
Tuple of (all pages as Documents, list of (source_name, full_text)).
|
| 193 |
+
"""
|
| 194 |
+
all_pages: List[Document] = []
|
| 195 |
+
raw_texts: List[Tuple[str, str]] = []
|
| 196 |
+
|
| 197 |
+
with log_step(logger, "Load PDF files", f"{len(pdf_files)} file(s)"):
|
| 198 |
+
for f in pdf_files:
|
| 199 |
+
src_name = (
|
| 200 |
+
getattr(f, "name", None) or
|
| 201 |
+
getattr(f, "orig_name", None) or
|
| 202 |
+
"uploaded.pdf"
|
| 203 |
+
)
|
| 204 |
+
logger.substep(f"Loading: {os.path.basename(src_name)}")
|
| 205 |
+
loader = PyPDFLoader(f.name)
|
| 206 |
+
pages = loader.load()
|
| 207 |
+
all_pages.extend(pages)
|
| 208 |
+
logger.substep(f"Extracted {len(pages)} pages")
|
| 209 |
+
|
| 210 |
+
joined = "\n".join([p.page_content for p in pages])
|
| 211 |
+
raw_texts.append((os.path.basename(src_name), joined))
|
| 212 |
+
|
| 213 |
+
logger.info(f"Total pages loaded: {len(all_pages)}")
|
| 214 |
+
return all_pages, raw_texts
|
| 215 |
+
|
| 216 |
+
    def _create_chunks(
        self,
        pages: List[Document],
        use_semantic_chunking: bool = True,
    ) -> List[Document]:
        """Split pages into chunks with normalized metadata.

        Args:
            pages: List of page Documents.
            use_semantic_chunking: If True, uses section-aware chunking.

        Returns:
            List of chunk Documents with metadata (``source`` basename and a
            normalized ``page`` number), with newlines flattened to spaces.
        """
        chunking_type = "semantic" if use_semantic_chunking else "character-based"
        with log_step(logger, "Create document chunks", chunking_type):
            if use_semantic_chunking:
                # Use semantic chunker that respects document structure
                logger.substep("Using section-aware semantic chunking")
                chunks = self._chunker.chunk_pages(pages, adaptive=True)
            else:
                # Fallback to simple character-based splitting
                logger.substep("Using RecursiveCharacterTextSplitter")
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=self.DEFAULT_CHUNK_SIZE,
                    chunk_overlap=self.DEFAULT_CHUNK_OVERLAP,
                )
                chunks = splitter.split_documents(pages)

            logger.substep(f"Raw chunks created: {len(chunks)}")

            processed_chunks: List[Document] = []
            for chunk in chunks:
                # Copy so the original chunk's metadata is never mutated.
                meta = dict(chunk.metadata or {})
                meta["source"] = os.path.basename(meta.get("source", "")) or "uploaded.pdf"

                # Normalize page numbers (PyPDFLoader uses 0-index)
                # NOTE(review): with semantic chunking only page 0 is shifted to 1,
                # so a 0-indexed page 1 would collide with it. This only works if
                # SemanticChunker.chunk_pages already emits 1-based page numbers
                # for everything except page 0 — TODO confirm against
                # SemanticChunker before relying on page citations.
                if "page" in meta and isinstance(meta["page"], int):
                    if meta["page"] == 0 or (not use_semantic_chunking):
                        meta["page"] = int(meta["page"]) + 1

                # Flatten newlines: embeddings and BM25 both behave better on
                # single-line text.
                processed_chunks.append(Document(
                    page_content=chunk.page_content.replace("\n", " "),
                    metadata=meta,
                ))

            logger.info(f"Final chunks: {len(processed_chunks)}")
            return processed_chunks
|
| 264 |
+
|
| 265 |
+
def _extract_structured_data(
|
| 266 |
+
self,
|
| 267 |
+
neo4j: Neo4jService,
|
| 268 |
+
raw_texts: List[Tuple[str, str]],
|
| 269 |
+
) -> List[Dict[str, Any]]:
|
| 270 |
+
"""Extract and upsert structured project data.
|
| 271 |
+
|
| 272 |
+
Args:
|
| 273 |
+
neo4j: Neo4j service instance.
|
| 274 |
+
raw_texts: List of (source_name, full_text) tuples.
|
| 275 |
+
|
| 276 |
+
Returns:
|
| 277 |
+
List of project dictionaries with results/warnings.
|
| 278 |
+
"""
|
| 279 |
+
projects_created: List[Dict[str, Any]] = []
|
| 280 |
+
|
| 281 |
+
with log_step(logger, "Extract structured data", f"{len(raw_texts)} document(s)"):
|
| 282 |
+
for source, full_text in raw_texts:
|
| 283 |
+
logger.substep(f"Parsing: {source}")
|
| 284 |
+
record = self._parser.parse(full_text, source)
|
| 285 |
+
try:
|
| 286 |
+
proj = neo4j.upsert_structured_project(record)
|
| 287 |
+
projects_created.append(proj)
|
| 288 |
+
logger.substep(f"Created project: {proj.get('name', source)}")
|
| 289 |
+
except Exception as e:
|
| 290 |
+
logger.warning(f"Failed to create project {source}: {e}")
|
| 291 |
+
projects_created.append({
|
| 292 |
+
"projectId": record.project_id or source,
|
| 293 |
+
"name": record.project_name or source,
|
| 294 |
+
"warning": str(e),
|
| 295 |
+
})
|
| 296 |
+
|
| 297 |
+
logger.info(f"Structured extraction complete: {len(projects_created)} project(s)")
|
| 298 |
+
return projects_created
|
| 299 |
+
|
| 300 |
+
def _extract_llm_graph(
|
| 301 |
+
self,
|
| 302 |
+
neo4j: Neo4jService,
|
| 303 |
+
chunks: List[Document],
|
| 304 |
+
parallel: bool = True,
|
| 305 |
+
) -> None:
|
| 306 |
+
"""Extract entities/relationships using LLM and add to graph.
|
| 307 |
+
|
| 308 |
+
Args:
|
| 309 |
+
neo4j: Neo4j service instance.
|
| 310 |
+
chunks: Document chunks for extraction.
|
| 311 |
+
parallel: If True, uses parallel batch processing.
|
| 312 |
+
"""
|
| 313 |
+
mode = "parallel" if parallel else "sequential"
|
| 314 |
+
with log_step(logger, "LLM graph extraction", f"{len(chunks)} chunks, {mode}"):
|
| 315 |
+
logger.substep("Initializing LLMGraphTransformer")
|
| 316 |
+
transformer = LLMGraphTransformer(
|
| 317 |
+
llm=self.llm,
|
| 318 |
+
allowed_nodes=SchemaPolicy.ALLOWED_NODES,
|
| 319 |
+
allowed_relationships=SchemaPolicy.ALLOWED_RELATIONSHIPS,
|
| 320 |
+
node_properties=True, # Enable property extraction for richer graph
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
if not parallel or len(chunks) <= self.EXTRACTION_BATCH_SIZE:
|
| 324 |
+
# Sequential extraction for small chunk sets
|
| 325 |
+
logger.substep("Using sequential extraction (small chunk set)")
|
| 326 |
+
graph_documents = transformer.convert_to_graph_documents(chunks)
|
| 327 |
+
neo4j.graph.add_graph_documents(graph_documents, include_source=True)
|
| 328 |
+
logger.info(f"Added {len(graph_documents)} graph documents")
|
| 329 |
+
return
|
| 330 |
+
|
| 331 |
+
# Parallel extraction for larger chunk sets
|
| 332 |
+
def process_batch(batch: List[Document]) -> List:
|
| 333 |
+
"""Process a batch of chunks."""
|
| 334 |
+
try:
|
| 335 |
+
return transformer.convert_to_graph_documents(batch)
|
| 336 |
+
except Exception:
|
| 337 |
+
return []
|
| 338 |
+
|
| 339 |
+
# Split into batches
|
| 340 |
+
batches = [
|
| 341 |
+
chunks[i:i + self.EXTRACTION_BATCH_SIZE]
|
| 342 |
+
for i in range(0, len(chunks), self.EXTRACTION_BATCH_SIZE)
|
| 343 |
+
]
|
| 344 |
+
logger.substep(f"Split into {len(batches)} batches ({self.EXTRACTION_BATCH_SIZE} chunks each)")
|
| 345 |
+
|
| 346 |
+
all_graph_docs = []
|
| 347 |
+
failed_batches = 0
|
| 348 |
+
|
| 349 |
+
# Process batches with thread pool for IO-bound LLM calls
|
| 350 |
+
logger.substep(f"Starting parallel extraction with {self.MAX_EXTRACTION_WORKERS} workers")
|
| 351 |
+
with ThreadPoolExecutor(max_workers=self.MAX_EXTRACTION_WORKERS) as executor:
|
| 352 |
+
futures = {
|
| 353 |
+
executor.submit(process_batch, batch): i
|
| 354 |
+
for i, batch in enumerate(batches)
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
for future in as_completed(futures):
|
| 358 |
+
batch_idx = futures[future]
|
| 359 |
+
try:
|
| 360 |
+
result = future.result(timeout=120)
|
| 361 |
+
all_graph_docs.extend(result)
|
| 362 |
+
logger.substep(f"Batch {batch_idx + 1}/{len(batches)} complete")
|
| 363 |
+
except Exception as e:
|
| 364 |
+
failed_batches += 1
|
| 365 |
+
logger.warning(f"Batch {batch_idx + 1} failed: {e}")
|
| 366 |
+
|
| 367 |
+
# Bulk add to graph
|
| 368 |
+
if all_graph_docs:
|
| 369 |
+
logger.substep(f"Adding {len(all_graph_docs)} graph documents to Neo4j")
|
| 370 |
+
neo4j.graph.add_graph_documents(all_graph_docs, include_source=True)
|
| 371 |
+
|
| 372 |
+
if failed_batches > 0:
|
| 373 |
+
logger.warning(f"{failed_batches} batch(es) failed during extraction")
|
| 374 |
+
|
| 375 |
+
logger.info(f"LLM extraction complete: {len(all_graph_docs)} graph documents")
|
| 376 |
+
|
| 377 |
+
def _create_vector_index(
|
| 378 |
+
self,
|
| 379 |
+
chunks: List[Document],
|
| 380 |
+
neo4j_config: Neo4jConfig,
|
| 381 |
+
) -> Neo4jVector:
|
| 382 |
+
"""Create or refresh vector index for chunks.
|
| 383 |
+
|
| 384 |
+
Args:
|
| 385 |
+
chunks: Document chunks to index.
|
| 386 |
+
neo4j_config: Neo4j connection configuration.
|
| 387 |
+
|
| 388 |
+
Returns:
|
| 389 |
+
Neo4jVector index instance.
|
| 390 |
+
"""
|
| 391 |
+
with log_step(logger, "Create vector index", f"{len(chunks)} chunks"):
|
| 392 |
+
logger.substep(f"Index name: {self.INDEX_NAME}")
|
| 393 |
+
logger.substep(f"Keyword index: {self.KEYWORD_INDEX_NAME}")
|
| 394 |
+
logger.substep("Creating hybrid search index (dense + BM25)")
|
| 395 |
+
|
| 396 |
+
vector = Neo4jVector.from_documents(
|
| 397 |
+
documents=chunks,
|
| 398 |
+
embedding=self.embeddings,
|
| 399 |
+
url=neo4j_config.uri,
|
| 400 |
+
username=neo4j_config.username,
|
| 401 |
+
password=neo4j_config.password,
|
| 402 |
+
database=neo4j_config.database or "neo4j",
|
| 403 |
+
index_name=self.INDEX_NAME,
|
| 404 |
+
keyword_index_name=self.KEYWORD_INDEX_NAME,
|
| 405 |
+
node_label=self.NODE_LABEL,
|
| 406 |
+
embedding_node_property="embedding",
|
| 407 |
+
search_type="hybrid",
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
logger.info("Vector index created successfully")
|
| 411 |
+
return vector
|
| 412 |
+
|
| 413 |
+
def _create_qa_chain(self, neo4j: Neo4jService) -> GraphCypherQAChain:
|
| 414 |
+
"""Create Cypher QA chain for graph querying.
|
| 415 |
+
|
| 416 |
+
Args:
|
| 417 |
+
neo4j: Neo4j service instance.
|
| 418 |
+
|
| 419 |
+
Returns:
|
| 420 |
+
GraphCypherQAChain instance.
|
| 421 |
+
"""
|
| 422 |
+
with log_step(logger, "Create Cypher QA chain"):
|
| 423 |
+
logger.substep("Configuring enhanced Cypher prompt template")
|
| 424 |
+
cypher_prompt = PromptTemplate(
|
| 425 |
+
template=self.CYPHER_PROMPT_TEMPLATE,
|
| 426 |
+
input_variables=["schema", "question"],
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
logger.substep("Initializing GraphCypherQAChain")
|
| 430 |
+
chain = GraphCypherQAChain.from_llm(
|
| 431 |
+
llm=self.llm,
|
| 432 |
+
graph=neo4j.graph,
|
| 433 |
+
cypher_prompt=cypher_prompt,
|
| 434 |
+
verbose=False,
|
| 435 |
+
allow_dangerous_requests=True,
|
| 436 |
+
)
|
| 437 |
+
|
| 438 |
+
logger.info("Cypher QA chain ready")
|
| 439 |
+
return chain
|
| 440 |
+
|
| 441 |
+
    @trace_flow("PDF Ingestion Pipeline")
    def ingest(
        self,
        pdf_files: List[Any],
        neo4j_config: Optional[Neo4jConfig] = None,
        neo4j_uri: Optional[str] = None,
        neo4j_user: Optional[str] = None,
        neo4j_password: Optional[str] = None,
        neo4j_database: str = "neo4j",
        clear_db: bool = True,
    ) -> Tuple[str, AppState]:
        """Ingest one or more PDF reports into Neo4j and build GraphRAG indices.

        Pipeline: load pages -> deterministic structured extraction ->
        chunking -> LLM graph extraction -> hybrid vector index -> Cypher QA
        chain. Validation failures return early with an empty AppState.

        Args:
            pdf_files: List of gradio-uploaded file handles.
            neo4j_config: Neo4j configuration object (preferred).
            neo4j_uri: Neo4j connection URI (alternative).
            neo4j_user: Username (alternative).
            neo4j_password: Password (alternative).
            neo4j_database: Database name.
            clear_db: If True, deletes all existing nodes prior to ingestion.

        Returns:
            Tuple of (human-readable status message, AppState).

        Notes:
            - The ingestion process can be compute-heavy due to LLM graph extraction.
            - Even if the deterministic parser yields partial results, chunk retrieval
              still works.
        """
        # Validate inputs
        if not pdf_files:
            logger.warning("No PDF files provided")
            return "Please upload at least one PDF.", AppState()

        logger.info(f"Starting ingestion of {len(pdf_files)} PDF file(s)")

        # Build config from parameters if not provided (legacy call style).
        if neo4j_config is None:
            neo4j_config = Neo4jConfig(
                uri=neo4j_uri or "",
                username=neo4j_user or "neo4j",
                password=neo4j_password or "",
                database=neo4j_database,
            )

        if not neo4j_config.is_valid():
            logger.error("Invalid Neo4j configuration")
            return "Please provide Neo4j connection details.", AppState()

        # Connect to Neo4j
        with log_step(logger, "Connect to Neo4j"):
            try:
                neo4j = Neo4jService(
                    uri=neo4j_config.uri,
                    user=neo4j_config.username,
                    password=neo4j_config.password,
                    database=neo4j_config.database,
                )
                logger.substep(f"Connected to {neo4j_config.uri}")
            except Neo4jConnectionError as e:
                # Connection problems are returned as a message, not raised,
                # so the UI can display them.
                logger.error(f"Neo4j connection failed: {e}")
                return (
                    f"Neo4j connection failed. For Aura, use the exact URI shown in the "
                    f"console (typically starts with neo4j+s://...). Error: {e}",
                    AppState(),
                )

        # Ensure constraints
        with log_step(logger, "Ensure database constraints"):
            neo4j.ensure_constraints()

        # Clear database if requested
        if clear_db:
            with log_step(logger, "Clear existing data"):
                neo4j.clear()

        # 1) Load PDF pages
        all_pages, raw_texts = self._load_pdf_pages(pdf_files)

        # 2) Structured extraction (high precision)
        projects_created = self._extract_structured_data(neo4j, raw_texts)

        # 3) Create chunks
        chunks = self._create_chunks(all_pages)

        # 4) LLM-based KG extraction (high recall)
        self._extract_llm_graph(neo4j, chunks)

        # 5) Vector index
        vector = self._create_vector_index(chunks, neo4j_config)

        # 6) Cypher QA chain
        qa_chain = self._create_qa_chain(neo4j)

        # Build status message; projects that failed structured upsert carry
        # a "warning" key (see _extract_structured_data).
        proj_lines = []
        for p in projects_created:
            warn = f" (warning: {p.get('warning')})" if "warning" in p else ""
            proj_lines.append(f"- {p.get('name')} [{p.get('projectId')}]{warn}")

        msg = (
            "Ingestion complete.\n\n"
            f"Neo4j database: `{neo4j_config.database}`\n\n"
            "Projects found:\n" + "\n".join(proj_lines)
        )

        logger.info(f"Ingestion complete: {len(projects_created)} project(s), {len(chunks)} chunks")

        return msg, AppState(
            neo4j=neo4j,
            vector=vector,
            qa_chain=qa_chain,
            llm=self.llm,
        )
|
| 556 |
+
|
| 557 |
+
def ingest_with_progress(
|
| 558 |
+
self,
|
| 559 |
+
pdf_files: List[Any],
|
| 560 |
+
neo4j_config: Optional[Neo4jConfig] = None,
|
| 561 |
+
neo4j_uri: Optional[str] = None,
|
| 562 |
+
neo4j_user: Optional[str] = None,
|
| 563 |
+
neo4j_password: Optional[str] = None,
|
| 564 |
+
neo4j_database: str = "neo4j",
|
| 565 |
+
clear_db: bool = True,
|
| 566 |
+
skip_llm_extraction: bool = True, # Skip LLM extraction for faster ingestion
|
| 567 |
+
) -> Generator[Tuple[str, float, Optional[AppState]], None, None]:
|
| 568 |
+
"""Ingest PDFs with progress updates for UI.
|
| 569 |
+
|
| 570 |
+
This generator yields progress updates during ingestion, allowing
|
| 571 |
+
the UI to display a progress bar with status messages.
|
| 572 |
+
|
| 573 |
+
Args:
|
| 574 |
+
pdf_files: List of gradio-uploaded file handles.
|
| 575 |
+
neo4j_config: Neo4j configuration object (preferred).
|
| 576 |
+
neo4j_uri: Neo4j connection URI (alternative).
|
| 577 |
+
neo4j_user: Username (alternative).
|
| 578 |
+
neo4j_password: Password (alternative).
|
| 579 |
+
neo4j_database: Database name.
|
| 580 |
+
clear_db: If True, deletes all existing nodes prior to ingestion.
|
| 581 |
+
skip_llm_extraction: If True, skips LLM graph extraction for faster ingestion.
|
| 582 |
+
|
| 583 |
+
Yields:
|
| 584 |
+
Tuple of (status_message, progress_fraction, optional_state)
|
| 585 |
+
- progress_fraction is 0.0 to 1.0
|
| 586 |
+
- optional_state is None until final yield, then contains AppState
|
| 587 |
+
|
| 588 |
+
Example:
|
| 589 |
+
>>> for status, progress, state in builder.ingest_with_progress(files, config):
|
| 590 |
+
... print(f"{progress*100:.0f}%: {status}")
|
| 591 |
+
... if state:
|
| 592 |
+
... print("Done!")
|
| 593 |
+
"""
|
| 594 |
+
start_time = time.time()
|
| 595 |
+
|
| 596 |
+
# Validate inputs
|
| 597 |
+
if not pdf_files:
|
| 598 |
+
yield "❌ Please upload at least one PDF file.", 0.0, None
|
| 599 |
+
return
|
| 600 |
+
|
| 601 |
+
# Build config from parameters if not provided
|
| 602 |
+
if neo4j_config is None:
|
| 603 |
+
neo4j_config = Neo4jConfig(
|
| 604 |
+
uri=neo4j_uri or "",
|
| 605 |
+
username=neo4j_user or "neo4j",
|
| 606 |
+
password=neo4j_password or "",
|
| 607 |
+
database=neo4j_database,
|
| 608 |
+
)
|
| 609 |
+
|
| 610 |
+
if not neo4j_config.is_valid():
|
| 611 |
+
yield "❌ Please provide Neo4j connection details.", 0.0, None
|
| 612 |
+
return
|
| 613 |
+
|
| 614 |
+
# Step 1: Connect to Neo4j (5%)
|
| 615 |
+
yield "🔌 Connecting to Neo4j...", 0.05, None
|
| 616 |
+
try:
|
| 617 |
+
neo4j = Neo4jService(
|
| 618 |
+
uri=neo4j_config.uri,
|
| 619 |
+
user=neo4j_config.username,
|
| 620 |
+
password=neo4j_config.password,
|
| 621 |
+
database=neo4j_config.database,
|
| 622 |
+
)
|
| 623 |
+
except Neo4jConnectionError as e:
|
| 624 |
+
yield f"❌ Neo4j connection failed: {e}", 0.05, None
|
| 625 |
+
return
|
| 626 |
+
|
| 627 |
+
# Step 2: Ensure constraints (10%)
|
| 628 |
+
yield "📋 Setting up database constraints...", 0.10, None
|
| 629 |
+
neo4j.ensure_constraints()
|
| 630 |
+
|
| 631 |
+
# Step 3: Clear database if requested (15%)
|
| 632 |
+
if clear_db:
|
| 633 |
+
yield "🗑️ Clearing existing data...", 0.15, None
|
| 634 |
+
neo4j.clear()
|
| 635 |
+
|
| 636 |
+
# Step 4: Load PDF pages (25%)
|
| 637 |
+
yield f"📄 Loading {len(pdf_files)} PDF file(s)...", 0.20, None
|
| 638 |
+
all_pages, raw_texts = self._load_pdf_pages(pdf_files)
|
| 639 |
+
yield f"📄 Loaded {len(all_pages)} pages from PDFs", 0.25, None
|
| 640 |
+
|
| 641 |
+
# Step 5: Structured extraction (35%)
|
| 642 |
+
yield "🔍 Extracting structured project data...", 0.30, None
|
| 643 |
+
projects_created = self._extract_structured_data(neo4j, raw_texts)
|
| 644 |
+
project_names = [p.get('name', 'Unknown') for p in projects_created]
|
| 645 |
+
yield f"✅ Found {len(projects_created)} project(s): {', '.join(project_names)}", 0.35, None
|
| 646 |
+
|
| 647 |
+
# Step 6: Create chunks (45%)
|
| 648 |
+
yield "✂️ Creating document chunks...", 0.40, None
|
| 649 |
+
chunks = self._create_chunks(all_pages)
|
| 650 |
+
yield f"✅ Created {len(chunks)} chunks", 0.45, None
|
| 651 |
+
|
| 652 |
+
# Step 7: LLM Graph Extraction (optional) (45-70%)
|
| 653 |
+
if not skip_llm_extraction:
|
| 654 |
+
yield f"🧠 Extracting entities with LLM ({len(chunks)} chunks)...", 0.50, None
|
| 655 |
+
# This is the slowest step - show batch progress
|
| 656 |
+
total_batches = (len(chunks) + self.EXTRACTION_BATCH_SIZE - 1) // self.EXTRACTION_BATCH_SIZE
|
| 657 |
+
for batch_num in range(total_batches):
|
| 658 |
+
progress = 0.50 + (0.20 * (batch_num + 1) / total_batches)
|
| 659 |
+
yield f"🧠 LLM extraction: batch {batch_num + 1}/{total_batches}...", progress, None
|
| 660 |
+
self._extract_llm_graph(neo4j, chunks)
|
| 661 |
+
yield "✅ LLM graph extraction complete", 0.70, None
|
| 662 |
+
else:
|
| 663 |
+
yield "⏩ Skipping LLM extraction (using fast mode)", 0.70, None
|
| 664 |
+
|
| 665 |
+
# Step 8: Create vector index (90%)
|
| 666 |
+
yield f"📊 Creating vector index ({len(chunks)} chunks)...", 0.75, None
|
| 667 |
+
vector = self._create_vector_index(chunks, neo4j_config)
|
| 668 |
+
yield "✅ Vector index created", 0.90, None
|
| 669 |
+
|
| 670 |
+
# Step 9: Create QA chain (95%)
|
| 671 |
+
yield "⚙️ Initializing QA chain...", 0.95, None
|
| 672 |
+
qa_chain = self._create_qa_chain(neo4j)
|
| 673 |
+
|
| 674 |
+
# Final step: Complete (100%)
|
| 675 |
+
elapsed = time.time() - start_time
|
| 676 |
+
proj_lines = []
|
| 677 |
+
for p in projects_created:
|
| 678 |
+
warn = f" ⚠️ {p.get('warning')}" if "warning" in p else ""
|
| 679 |
+
proj_lines.append(f"- **{p.get('name')}** [{p.get('projectId')}]{warn}")
|
| 680 |
+
|
| 681 |
+
final_msg = (
|
| 682 |
+
f"## ✅ Ingestion Complete ({elapsed:.1f}s)\n\n"
|
| 683 |
+
f"**Database:** `{neo4j_config.database}`\n\n"
|
| 684 |
+
f"**Projects found:**\n" + "\n".join(proj_lines) + "\n\n"
|
| 685 |
+
f"**Stats:** {len(chunks)} chunks indexed"
|
| 686 |
+
)
|
| 687 |
+
|
| 688 |
+
yield final_msg, 1.0, AppState(
|
| 689 |
+
neo4j=neo4j,
|
| 690 |
+
vector=vector,
|
| 691 |
+
qa_chain=qa_chain,
|
| 692 |
+
llm=self.llm,
|
| 693 |
+
)
|
src/services/cache.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Query result caching for improved performance.
|
| 3 |
+
|
| 4 |
+
Provides in-memory caching with TTL for query results,
|
| 5 |
+
reducing latency and API costs for repeated queries.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import hashlib
|
| 11 |
+
import threading
|
| 12 |
+
import time
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from typing import Any, Dict, List, Optional
|
| 15 |
+
|
| 16 |
+
from langchain.schema import Document
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class CacheEntry:
    """A single cache entry with TTL support."""
    value: Any
    timestamp: float
    ttl: float
    hits: int = 0

    def is_expired(self) -> bool:
        """Report whether this entry has outlived its TTL.

        Returns:
            True if entry is past its TTL.
        """
        age = time.time() - self.timestamp
        return age > self.ttl

    def access(self) -> Any:
        """Record a hit and hand back the stored value.

        Returns:
            Cached value.
        """
        self.hits = self.hits + 1
        return self.value
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class QueryCache:
    """Thread-safe in-memory cache for query results with TTL."""

    def __init__(
        self,
        default_ttl: float = 3600,
        max_size: int = 1000,
    ) -> None:
        self._cache: Dict[str, CacheEntry] = {}
        self._lock = threading.RLock()
        self.default_ttl = default_ttl
        self.max_size = max_size
        self._total_hits = 0
        self._total_misses = 0

    def _make_key(self, query: str, context_hash: str = "") -> str:
        """Derive a deterministic cache key from query text and context.

        Args:
            query: Query string.
            context_hash: Optional context identifier.

        Returns:
            MD5 hash key.
        """
        normalized = query.lower().strip()
        return hashlib.md5(f"{normalized}:{context_hash}".encode()).hexdigest()

    def _evict_if_needed(self) -> None:
        """Make room when at capacity: drop expired entries, then oldest 10%."""
        if len(self._cache) < self.max_size:
            return

        # Reclaim expired entries before resorting to eviction.
        self.cleanup_expired()

        if len(self._cache) >= self.max_size:
            by_age = sorted(self._cache, key=lambda k: self._cache[k].timestamp)
            # Drop the oldest tenth (at least one entry).
            drop_count = max(1, len(by_age) // 10)
            for stale_key in by_age[:drop_count]:
                del self._cache[stale_key]

    def get(
        self,
        query: str,
        context_hash: str = ""
    ) -> Optional[Any]:
        """Return the cached value, or None on a miss or expiry.

        Args:
            query: Query string.
            context_hash: Optional context identifier.

        Returns:
            Cached value or None if not found/expired.
        """
        key = self._make_key(query, context_hash)

        with self._lock:
            entry = self._cache.get(key)

            if entry is None or entry.is_expired():
                if entry is not None:
                    # Lazily drop the expired entry.
                    del self._cache[key]
                self._total_misses += 1
                return None

            self._total_hits += 1
            return entry.access()

    def set(
        self,
        query: str,
        context_hash: str,
        value: Any,
        ttl: Optional[float] = None,
    ) -> None:
        """Store a result under (query, context_hash).

        Args:
            query: Query string.
            context_hash: Context identifier.
            value: Value to cache.
            ttl: Optional TTL override.
        """
        key = self._make_key(query, context_hash)

        with self._lock:
            self._evict_if_needed()
            self._cache[key] = CacheEntry(
                value=value,
                timestamp=time.time(),
                ttl=ttl or self.default_ttl,
            )

    def invalidate(self, query: str, context_hash: str = "") -> bool:
        """Remove one cache entry if present.

        Args:
            query: Query string.
            context_hash: Context identifier.

        Returns:
            True if entry was found and removed.
        """
        key = self._make_key(query, context_hash)

        with self._lock:
            return self._cache.pop(key, None) is not None

    def invalidate_all(self) -> int:
        """Empty the cache entirely.

        Returns:
            Number of entries cleared.
        """
        with self._lock:
            dropped = len(self._cache)
            self._cache.clear()
            return dropped

    def cleanup_expired(self) -> int:
        """Purge every expired entry.

        Returns:
            Number of entries removed.
        """
        with self._lock:
            stale_keys = [k for k, v in self._cache.items() if v.is_expired()]
            for stale_key in stale_keys:
                del self._cache[stale_key]
            return len(stale_keys)

    def get_stats(self) -> Dict[str, Any]:
        """Snapshot of cache metrics.

        Returns:
            Dictionary with cache metrics.
        """
        with self._lock:
            requests = self._total_hits + self._total_misses
            rate = self._total_hits / requests if requests > 0 else 0.0

            return {
                "size": len(self._cache),
                "max_size": self.max_size,
                "total_hits": self._total_hits,
                "total_misses": self._total_misses,
                "hit_rate": round(rate, 3),
                "default_ttl": self.default_ttl,
            }
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
class AnswerCache(QueryCache):
    """Specialized cache for GraphRAG answers.

    Extends QueryCache with answer-specific functionality like
    caching both the answer and supporting documents.
    """

    @dataclass
    class AnswerEntry:
        """Cached answer with supporting documents."""
        answer: str
        documents: List[Document] = field(default_factory=list)
        cypher_result: str = ""
        metadata: Dict[str, Any] = field(default_factory=dict)

    def set_answer(
        self,
        query: str,
        answer: str,
        documents: Optional[List[Document]] = None,
        cypher_result: str = "",
        context_hash: str = "",
        ttl: Optional[float] = None,
    ) -> None:
        """Store a complete answer plus its supporting material.

        Args:
            query: User query.
            answer: Generated answer.
            documents: Supporting documents.
            cypher_result: Cypher query result if any.
            context_hash: Context identifier.
            ttl: Optional TTL override.
        """
        payload = self.AnswerEntry(
            answer=answer,
            documents=documents or [],
            cypher_result=cypher_result,
            metadata={"cached_at": time.time()},
        )
        self.set(query, context_hash, payload, ttl)

    def get_answer(
        self,
        query: str,
        context_hash: str = ""
    ) -> Optional[AnswerEntry]:
        """Look up a previously cached answer.

        Args:
            query: User query.
            context_hash: Context identifier.

        Returns:
            AnswerEntry or None if not found.
        """
        cached = self.get(query, context_hash)
        # Anything that isn't an AnswerEntry (e.g. a raw value cached via the
        # base class) is treated as a miss.
        return cached if isinstance(cached, self.AnswerEntry) else None
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# Global cache instances
|
| 278 |
+
# Global cache instances (lazy singletons)
_query_cache: Optional[QueryCache] = None
_answer_cache: Optional[AnswerCache] = None


def get_query_cache(
    default_ttl: float = 3600,
    max_size: int = 1000,
) -> QueryCache:
    """Get or create the global query cache.

    Note: ``default_ttl`` and ``max_size`` only take effect on the first
    call; subsequent calls return the already-built instance unchanged.

    Args:
        default_ttl: Default TTL for entries.
        max_size: Maximum cache size.

    Returns:
        QueryCache singleton instance.
    """
    global _query_cache
    if _query_cache is None:
        _query_cache = QueryCache(default_ttl=default_ttl, max_size=max_size)
    return _query_cache
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def get_answer_cache(
    default_ttl: float = 3600,
    max_size: int = 500,
) -> AnswerCache:
    """Get or create the global answer cache.

    Note: ``default_ttl`` and ``max_size`` only take effect on the first
    call; subsequent calls return the already-built instance unchanged.

    Args:
        default_ttl: Default TTL for entries.
        max_size: Maximum cache size.

    Returns:
        AnswerCache singleton instance.
    """
    global _answer_cache
    if _answer_cache is None:
        _answer_cache = AnswerCache(default_ttl=default_ttl, max_size=max_size)
    return _answer_cache
|
src/services/cypher_templates.py
ADDED
|
@@ -0,0 +1,1332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pre-validated Cypher query templates for deterministic query routing."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import hashlib
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from enum import Enum
|
| 10 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# =============================================================================
|
| 17 |
+
# LLM-BASED INTENT CLASSIFIER
|
| 18 |
+
# =============================================================================
|
| 19 |
+
|
| 20 |
+
class LLMIntentClassifier:
    """Classify query intent using a lightweight LLM.

    Uses a small, cheap model from Together AI so classification adds
    minimal latency and cost. Handles synonyms naturally without
    hardcoding patterns, memoizes results per normalized query, and falls
    back to keyword pattern matching if the LLM is unavailable or fails.
    """

    # Cheap, fast model for classification -- only a category name is needed.
    DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct-Turbo"

    # Classification prompt - kept concise for speed. {query} is filled in
    # by classify(); the model is instructed to answer with a bare category
    # name, which is then validated against the known set.
    CLASSIFICATION_PROMPT = """Classify this query into exactly ONE category. For compound queries, pick the combined category.

Categories:
- TIMELINE_LOCATION: Questions about BOTH timeline/schedule AND location/place
- TIMELINE_BUDGET: Questions about BOTH timeline/schedule AND budget/cost
- BUDGET_LOCATION: Questions about BOTH cost/money AND location/place
- CONTACTS: Questions about project manager, owner, engineer, contractor, lead, head, E&C firm, personnel, who is responsible
- TIMELINE: Questions ONLY about schedule, dates, milestones, deadlines, duration, when things happen
- CHALLENGES: Questions about problems, risks, issues, obstacles, delays, failures, difficulties, constraints
- BUDGET: Questions ONLY about cost, money, investment, funding, expenses, price, TIV, financial aspects, spend
- LOCATION: Questions ONLY about where, place, site, city, country, address, geography, region
- TECHNICAL: Questions about capacity, scope, technical details, specifications, requirements, fuel type, labor
- COMPARISON: Generic comparison of ALL aspects of projects (budget, timeline, location, challenges, contacts)
- STATUS: Questions about current state, progress, whether active/cancelled, probability
- OVERVIEW: Questions asking for summary, description, general information, tell me about
- GENERAL: Questions that don't fit above categories or need detailed analysis

Query: "{query}"

Respond with ONLY the category name, nothing else."""

    def __init__(
        self,
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        use_cache: bool = True,
        fallback_to_patterns: bool = True,
    ) -> None:
        """Initialize LLM intent classifier.

        Args:
            model: Together AI model ID. Defaults to Llama-3.2-3B.
            api_key: Together AI API key. Falls back to the TOGETHER_API_KEY
                environment variable if not provided.
            use_cache: Whether to cache classification results in memory.
            fallback_to_patterns: Whether to use pattern matching as a
                fallback when the LLM is unavailable or errors out.
        """
        self.model = model or self.DEFAULT_MODEL
        self.api_key = api_key or os.environ.get("TOGETHER_API_KEY")
        self.use_cache = use_cache
        self.fallback_to_patterns = fallback_to_patterns
        # Maps md5(normalized query) -> intent name. Unbounded -- assumes
        # query volume stays small; TODO confirm for long-running processes.
        self._cache: Dict[str, str] = {}
        # Together client; created lazily by _get_client().
        self._client = None

    def _get_client(self):
        """Lazy-load the Together AI client; return None if unavailable.

        Import/initialization failures are logged and swallowed so callers
        can fall back to pattern matching instead of crashing.
        """
        if self._client is None:
            try:
                from together import Together
                self._client = Together(api_key=self.api_key)
            except ImportError:
                logger.warning("together package not installed")
                return None
            except Exception as e:
                logger.warning(f"Failed to initialize Together client: {e}")
                return None
        return self._client

    def _cache_key(self, query: str) -> str:
        """Generate a cache key for the query.

        md5 of the lowercased, stripped text -- used purely as a cheap
        fingerprint, not for any security purpose.
        """
        return hashlib.md5(query.lower().strip().encode()).hexdigest()

    def classify(self, query: str) -> str:
        """Classify query intent using LLM.

        Args:
            query: User query string

        Returns:
            Intent category name (e.g., "TIMELINE", "BUDGET"); "GENERAL"
            when nothing more specific can be determined.
        """
        # Check cache first (keyed on the normalized query text).
        if self.use_cache:
            cache_key = self._cache_key(query)
            if cache_key in self._cache:
                logger.debug(f"Intent cache hit: {self._cache[cache_key]}")
                return self._cache[cache_key]

        # Try LLM classification
        client = self._get_client()
        if client:
            try:
                response = client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "user", "content": self.CLASSIFICATION_PROMPT.format(query=query)}
                    ],
                    max_tokens=20,  # Only need category name
                    temperature=0,  # Deterministic
                )

                intent = response.choices[0].message.content.strip().upper()

                # Validate intent is a known category
                valid_intents = {
                    "BUDGET_LOCATION", "TIMELINE_LOCATION", "TIMELINE_BUDGET",
                    "TIMELINE", "CHALLENGES", "BUDGET", "LOCATION",
                    "CONTACTS", "TECHNICAL", "COMPARISON", "STATUS",
                    "OVERVIEW", "GENERAL"
                }

                # Handle variations in response - check longer names first so
                # e.g. "TIMELINE_BUDGET" is not swallowed by plain "TIMELINE".
                matched = False
                for valid in sorted(valid_intents, key=len, reverse=True):
                    if valid in intent:
                        intent = valid
                        matched = True
                        break

                if not matched:
                    intent = "GENERAL"

                # Cache result
                if self.use_cache:
                    self._cache[cache_key] = intent

                logger.info(f"LLM classified query as: {intent}")
                return intent

            except Exception as e:
                logger.warning(f"LLM classification failed: {e}")

        # Fallback to pattern matching
        if self.fallback_to_patterns:
            return self._pattern_fallback(query)

        return "GENERAL"

    def _pattern_fallback(self, query: str) -> str:
        """Simple keyword-based fallback used when the LLM is unavailable."""
        q = query.lower()

        # Check for keywords - expanded synonym sets. Substring matching is
        # deliberate (so "funding" matches "fund") but can over-trigger,
        # e.g. "who" in has_contacts -- NOTE(review): confirm acceptable.
        has_timeline = any(w in q for w in [
            "timeline", "schedule", "milestone", "deadline", "when", "date",
            "duration", "start", "finish", "complete", "begin", "end"
        ])
        has_budget = any(w in q for w in [
            "budget", "cost", "investment", "money", "spend", "fund", "price",
            "expense", "tiv", "financial", "dollar", "amount", "funding"
        ])
        has_location = any(w in q for w in [
            "location", "where", "site", "city", "country", "place", "address",
            "region", "state", "area", "geography", "situated"
        ])
        has_challenge = any(w in q for w in [
            "challenge", "risk", "issue", "problem", "obstacle", "delay",
            "difficult", "constraint", "failure", "cancelled", "cancel"
        ])
        has_contacts = any(w in q for w in [
            "manager", "owner", "engineer", "contractor", "lead", "head",
            "contact", "personnel", "responsible", "e&c", "firm", "who"
        ])
        has_technical = any(w in q for w in [
            "capacity", "scope", "technical", "specification", "requirement",
            "fuel", "labor", "megawatt", "mw", "barrel", "bbl", "unit"
        ])

        # Check for compound intents first (most specific)
        if has_timeline and has_location:
            return "TIMELINE_LOCATION"
        if has_timeline and has_budget:
            return "TIMELINE_BUDGET"
        if has_budget and has_location:
            return "BUDGET_LOCATION"

        # Single intents - prioritize more specific ones
        if has_contacts:
            return "CONTACTS"
        if has_technical:
            return "TECHNICAL"
        if has_timeline:
            return "TIMELINE"
        if has_challenge:
            return "CHALLENGES"
        if has_budget:
            return "BUDGET"
        if has_location:
            return "LOCATION"

        # Generic intents
        if any(w in q for w in ["compare", "comparison", "versus", "vs", "differ", "difference"]):
            return "COMPARISON"
        if any(w in q for w in ["status", "progress", "state", "active", "probability"]):
            return "STATUS"
        if any(w in q for w in ["overview", "summary", "describe", "explain", "tell me", "about"]):
            return "OVERVIEW"

        return "GENERAL"

    def clear_cache(self) -> int:
        """Clear the classification cache.

        Returns:
            Number of entries that were removed.
        """
        count = len(self._cache)
        self._cache.clear()
        return count
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
class QueryIntent(Enum):
    """Detected query intents for template routing.

    Every member except GENERAL has a matching entry in
    CypherTemplateRouter.TEMPLATES; GENERAL signals that no pre-validated
    template applies and the query needs the RAG fallback path.
    """
    BUDGET = "budget"
    LOCATION = "location"
    BUDGET_LOCATION = "budget_location"  # Combined: budget + location
    TIMELINE = "timeline"
    TIMELINE_LOCATION = "timeline_location"  # Combined: timeline + location
    TIMELINE_BUDGET = "timeline_budget"  # Combined: timeline + budget
    CHALLENGES = "challenges"
    CONTACTS = "contacts"  # Project manager, owner, engineer
    TECHNICAL = "technical"  # Capacity, scope, specifications
    COMPARISON = "comparison"  # Full comparison with all data
    PROJECT_OVERVIEW = "overview"
    PROJECT_STATUS = "status"
    GENERAL = "general"  # Requires RAG fallback
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
@dataclass
class CypherTemplate:
    """Pre-validated Cypher query template.

    Attributes:
        intent: The query intent this template handles
        cypher: The Cypher query string
        description: Human-readable description
        required_params: List of required parameter names (if any)
    """
    intent: QueryIntent
    cypher: str
    description: str
    required_params: List[str] = field(default_factory=list)

    def execute(self, graph: Any, params: Optional[Dict[str, Any]] = None) -> List[Dict]:
        """Execute template against the graph.

        Args:
            graph: Neo4j graph instance (LangChain Neo4jGraph)
            params: Optional query parameters; an empty dict is passed when
                omitted so the driver always receives a mapping.

        Returns:
            List of result dictionaries; an empty list when execution
            raises (the failure is logged, never propagated).
        """
        try:
            return graph.query(self.cypher, params or {})
        except Exception as e:
            # Lazy %-style args: the message is only built if the warning
            # is actually emitted (logging best practice vs. the f-string).
            logger.warning("Template execution failed: %s", e)
            return []
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
class CypherTemplateRouter:
|
| 278 |
+
"""Routes queries to pre-validated Cypher templates.
|
| 279 |
+
|
| 280 |
+
This eliminates LLM Cypher generation for ~70-80% of queries,
|
| 281 |
+
providing deterministic, fast, and reliable results.
|
| 282 |
+
|
| 283 |
+
Example:
|
| 284 |
+
>>> router = CypherTemplateRouter()
|
| 285 |
+
>>> results, intent = router.route_query("What is the budget?", graph)
|
| 286 |
+
>>> if results is not None:
|
| 287 |
+
... print(f"Used template for {intent.value}")
|
| 288 |
+
"""
|
| 289 |
+
|
| 290 |
+
    # =====================================================================
    # PRE-VALIDATED CYPHER TEMPLATES
    # =====================================================================
    # These queries have been tested against the actual graph schema and
    # are guaranteed to work correctly.  Every template matches all
    # Project nodes and uses OPTIONAL MATCH for related nodes, so projects
    # missing a relationship still appear (with null fields).

    TEMPLATES = {
        QueryIntent.BUDGET_LOCATION: CypherTemplate(
            intent=QueryIntent.BUDGET_LOCATION,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       b.amount AS budget,
                       b.currency AS currency,
                       l.address AS address,
                       l.city AS city,
                       l.state AS state,
                       l.postal AS postal,
                       l.country AS country,
                       l.zoneCounty AS zoneCounty
                ORDER BY p.name
            """,
            description="Get budget (TIV) and location for all projects",
        ),

        QueryIntent.BUDGET: CypherTemplate(
            intent=QueryIntent.BUDGET,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       b.amount AS budget,
                       b.currency AS currency,
                       b.kind AS budgetType
                ORDER BY b.amount DESC
            """,
            description="Get budget/investment information for all projects",
        ),

        QueryIntent.LOCATION: CypherTemplate(
            intent=QueryIntent.LOCATION,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       l.address AS address,
                       l.city AS city,
                       l.state AS state,
                       l.postal AS postal,
                       l.country AS country,
                       l.zoneCounty AS zone
                ORDER BY p.name
            """,
            description="Get location information for all projects",
        ),

        # NOTE: collect() of maps built from an OPTIONAL MATCH yields a
        # single all-null map for projects without milestones -- consumers
        # should tolerate that.
        QueryIntent.TIMELINE: CypherTemplate(
            intent=QueryIntent.TIMELINE,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                WITH p, m
                ORDER BY p.name, m.dateText
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       collect({
                           name: m.name,
                           date: m.dateText,
                           detail: m.sentence
                       }) AS milestones
                ORDER BY p.name
            """,
            description="Get timeline and milestones for all projects",
        ),

        QueryIntent.CHALLENGES: CypherTemplate(
            intent=QueryIntent.CHALLENGES,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_CHALLENGE]->(c:Challenge)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       collect(DISTINCT c.text) AS challenges
                ORDER BY p.name
            """,
            description="Get challenges, constraints, and risks for all projects",
        ),

        QueryIntent.TIMELINE_LOCATION: CypherTemplate(
            intent=QueryIntent.TIMELINE_LOCATION,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                WITH p, l, m
                ORDER BY p.name, m.dateText
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       l.city AS city,
                       l.state AS state,
                       l.country AS country,
                       l.address AS address,
                       collect({
                           name: m.name,
                           date: m.dateText,
                           detail: m.sentence
                       }) AS milestones
                ORDER BY p.name
            """,
            description="Get timeline milestones AND location for all projects",
        ),

        QueryIntent.TIMELINE_BUDGET: CypherTemplate(
            intent=QueryIntent.TIMELINE_BUDGET,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                WITH p, b, m
                ORDER BY p.name, m.dateText
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       b.amount AS budget,
                       b.currency AS currency,
                       collect({
                           name: m.name,
                           date: m.dateText,
                           detail: m.sentence
                       }) AS milestones
                ORDER BY p.name
            """,
            description="Get timeline milestones AND budget for all projects",
        ),

        # Contact data lives directly on the Project node, so no
        # relationship traversal is needed here.
        QueryIntent.CONTACTS: CypherTemplate(
            intent=QueryIntent.CONTACTS,
            cypher="""
                MATCH (p:Project)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.projectManager AS projectManager,
                       p.projectManagerCompany AS projectManagerCompany,
                       p.projectManagerTitle AS projectManagerTitle,
                       p.projectManagerEmail AS projectManagerEmail,
                       p.projectManagerPhone AS projectManagerPhone,
                       p.plantOwner AS plantOwner,
                       p.plantParent AS plantParent,
                       p.plantName AS plantName,
                       p.engineerCompany AS engineerCompany,
                       p.ecFirm AS ecFirm,
                       p.phone AS phone
                ORDER BY p.name
            """,
            description="Get project manager, owner, engineer, and contact information",
        ),

        QueryIntent.TECHNICAL: CypherTemplate(
            intent=QueryIntent.TECHNICAL,
            cypher="""
                MATCH (p:Project)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.industryCode AS industryCode,
                       p.projectType AS projectType,
                       p.sector AS sector,
                       p.sicCode AS sicCode,
                       p.sicProduct AS sicProduct,
                       p.pecTiming AS pecTiming,
                       p.pecActivity AS pecActivity,
                       p.projectCapacity AS projectCapacity,
                       p.scopeText AS scopeText,
                       p.environmental AS environmental,
                       p.constructionLabor AS constructionLabor,
                       p.operationsLabor AS operationsLabor,
                       p.fuelType AS fuelType,
                       p.unitName AS unitName
                ORDER BY p.name
            """,
            description="Get technical details including capacity, scope, and specifications",
        ),

        QueryIntent.COMPARISON: CypherTemplate(
            intent=QueryIntent.COMPARISON,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                OPTIONAL MATCH (p)-[:HAS_CHALLENGE]->(c:Challenge)
                WITH p, b, l, m, c
                ORDER BY p.name, m.dateText
                WITH p, b, l,
                     collect(DISTINCT {name: m.name, date: m.dateText}) AS milestones,
                     collect(DISTINCT c.text) AS challenges
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       p.projectProbability AS projectProbability,
                       p.projectManager AS projectManager,
                       p.projectManagerCompany AS projectManagerCompany,
                       p.projectManagerTitle AS projectManagerTitle,
                       p.plantOwner AS plantOwner,
                       p.plantParent AS plantParent,
                       p.plantName AS plantName,
                       p.engineerCompany AS engineerCompany,
                       p.ecFirm AS ecFirm,
                       p.industryCode AS industryCode,
                       p.projectType AS projectType,
                       p.sector AS sector,
                       p.sicCode AS sicCode,
                       p.pecTiming AS pecTiming,
                       p.pecActivity AS pecActivity,
                       p.projectCapacity AS projectCapacity,
                       p.scopeText AS scopeText,
                       b.amount AS budget,
                       b.currency AS currency,
                       l.city AS city,
                       l.state AS state,
                       l.country AS country,
                       l.address AS address,
                       milestones,
                       challenges
                ORDER BY b.amount DESC
            """,
            description="Compare all projects with full details (budget, location, timeline, challenges, contacts, technical)",
        ),

        QueryIntent.PROJECT_OVERVIEW: CypherTemplate(
            intent=QueryIntent.PROJECT_OVERVIEW,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                OPTIONAL MATCH (p)-[:HAS_REPORT]->(r:Report)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       p.projectProbability AS projectProbability,
                       p.projectManager AS projectManager,
                       p.projectManagerCompany AS projectManagerCompany,
                       p.projectManagerTitle AS projectManagerTitle,
                       p.plantOwner AS plantOwner,
                       p.plantParent AS plantParent,
                       p.plantName AS plantName,
                       p.engineerCompany AS engineerCompany,
                       p.ecFirm AS ecFirm,
                       p.industryCode AS industryCode,
                       p.projectType AS projectType,
                       p.sector AS sector,
                       p.sicCode AS sicCode,
                       p.pecTiming AS pecTiming,
                       p.pecActivity AS pecActivity,
                       p.projectCapacity AS projectCapacity,
                       p.constructionLabor AS constructionLabor,
                       p.operationsLabor AS operationsLabor,
                       p.fuelType AS fuelType,
                       p.unitName AS unitName,
                       b.amount AS budget,
                       b.currency AS currency,
                       l.city AS city,
                       l.state AS state,
                       l.country AS country,
                       l.address AS address,
                       r.lastUpdate AS lastUpdate,
                       r.initialRelease AS initialRelease
                ORDER BY p.name
            """,
            description="Get comprehensive overview of all projects with all attributes",
        ),

        QueryIntent.PROJECT_STATUS: CypherTemplate(
            intent=QueryIntent.PROJECT_STATUS,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_REPORT]->(r:Report)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       r.lastUpdate AS lastUpdate
                ORDER BY p.name
            """,
            description="Get project status information",
        ),
    }
|
| 591 |
+
|
| 592 |
+
def __init__(self, use_llm: bool = True) -> None:
|
| 593 |
+
"""Initialize the template router.
|
| 594 |
+
|
| 595 |
+
Args:
|
| 596 |
+
use_llm: If True, uses LLM for intent classification (handles synonyms).
|
| 597 |
+
If False, uses simple pattern matching (faster but limited).
|
| 598 |
+
"""
|
| 599 |
+
self.use_llm = use_llm
|
| 600 |
+
self._llm_classifier: Optional[LLMIntentClassifier] = None
|
| 601 |
+
|
| 602 |
+
def _get_classifier(self) -> LLMIntentClassifier:
|
| 603 |
+
"""Lazy-load the LLM classifier."""
|
| 604 |
+
if self._llm_classifier is None:
|
| 605 |
+
self._llm_classifier = LLMIntentClassifier(
|
| 606 |
+
use_cache=True,
|
| 607 |
+
fallback_to_patterns=True,
|
| 608 |
+
)
|
| 609 |
+
return self._llm_classifier
|
| 610 |
+
|
| 611 |
+
def classify_intent(self, query: str) -> QueryIntent:
|
| 612 |
+
"""Classify query intent using LLM or pattern matching.
|
| 613 |
+
|
| 614 |
+
Args:
|
| 615 |
+
query: User query string
|
| 616 |
+
|
| 617 |
+
Returns:
|
| 618 |
+
Detected QueryIntent
|
| 619 |
+
"""
|
| 620 |
+
if self.use_llm:
|
| 621 |
+
classifier = self._get_classifier()
|
| 622 |
+
intent_str = classifier.classify(query)
|
| 623 |
+
else:
|
| 624 |
+
# Fallback to simple pattern matching
|
| 625 |
+
intent_str = self._simple_pattern_match(query)
|
| 626 |
+
|
| 627 |
+
# Map string to QueryIntent enum
|
| 628 |
+
intent_map = {
|
| 629 |
+
"BUDGET_LOCATION": QueryIntent.BUDGET_LOCATION,
|
| 630 |
+
"TIMELINE_LOCATION": QueryIntent.TIMELINE_LOCATION,
|
| 631 |
+
"TIMELINE_BUDGET": QueryIntent.TIMELINE_BUDGET,
|
| 632 |
+
"TIMELINE": QueryIntent.TIMELINE,
|
| 633 |
+
"CHALLENGES": QueryIntent.CHALLENGES,
|
| 634 |
+
"CONTACTS": QueryIntent.CONTACTS,
|
| 635 |
+
"TECHNICAL": QueryIntent.TECHNICAL,
|
| 636 |
+
"BUDGET": QueryIntent.BUDGET,
|
| 637 |
+
"LOCATION": QueryIntent.LOCATION,
|
| 638 |
+
"COMPARISON": QueryIntent.COMPARISON,
|
| 639 |
+
"STATUS": QueryIntent.PROJECT_STATUS,
|
| 640 |
+
"OVERVIEW": QueryIntent.PROJECT_OVERVIEW,
|
| 641 |
+
"GENERAL": QueryIntent.GENERAL,
|
| 642 |
+
}
|
| 643 |
+
|
| 644 |
+
return intent_map.get(intent_str, QueryIntent.GENERAL)
|
| 645 |
+
|
| 646 |
+
def _simple_pattern_match(self, query: str) -> str:
|
| 647 |
+
"""Simple pattern matching fallback (no LLM)."""
|
| 648 |
+
q = query.lower()
|
| 649 |
+
|
| 650 |
+
# Check for combined intents first
|
| 651 |
+
if any(w in q for w in ["budget", "cost", "money"]) and any(w in q for w in ["location", "where", "site"]):
|
| 652 |
+
return "BUDGET_LOCATION"
|
| 653 |
+
|
| 654 |
+
# Single intents - check domain keywords
|
| 655 |
+
if any(w in q for w in ["timeline", "schedule", "milestone", "deadline", "when", "duration"]):
|
| 656 |
+
return "TIMELINE"
|
| 657 |
+
if any(w in q for w in ["challenge", "risk", "issue", "problem", "obstacle", "delay"]):
|
| 658 |
+
return "CHALLENGES"
|
| 659 |
+
if any(w in q for w in ["budget", "cost", "investment", "money", "spend", "fund", "price"]):
|
| 660 |
+
return "BUDGET"
|
| 661 |
+
if any(w in q for w in ["location", "where", "site", "city", "country", "place"]):
|
| 662 |
+
return "LOCATION"
|
| 663 |
+
if any(w in q for w in ["compare", "comparison", "versus", "differ"]):
|
| 664 |
+
return "COMPARISON"
|
| 665 |
+
if any(w in q for w in ["status", "progress", "state"]):
|
| 666 |
+
return "STATUS"
|
| 667 |
+
if any(w in q for w in ["overview", "summary", "describe", "explain"]):
|
| 668 |
+
return "OVERVIEW"
|
| 669 |
+
|
| 670 |
+
return "GENERAL"
|
| 671 |
+
|
| 672 |
+
def get_template(self, intent: QueryIntent) -> Optional[CypherTemplate]:
|
| 673 |
+
"""Get template for a given intent.
|
| 674 |
+
|
| 675 |
+
Args:
|
| 676 |
+
intent: Query intent
|
| 677 |
+
|
| 678 |
+
Returns:
|
| 679 |
+
CypherTemplate or None if no template for intent
|
| 680 |
+
"""
|
| 681 |
+
return self.TEMPLATES.get(intent)
|
| 682 |
+
|
| 683 |
+
def route_query(
|
| 684 |
+
self,
|
| 685 |
+
query: str,
|
| 686 |
+
graph: Any,
|
| 687 |
+
) -> Tuple[Optional[List[Dict]], QueryIntent]:
|
| 688 |
+
"""Route query to template or indicate fallback needed.
|
| 689 |
+
|
| 690 |
+
Args:
|
| 691 |
+
query: User query string
|
| 692 |
+
graph: Neo4j graph instance
|
| 693 |
+
|
| 694 |
+
Returns:
|
| 695 |
+
Tuple of (results or None, detected intent)
|
| 696 |
+
Results is None if intent is GENERAL or template execution failed
|
| 697 |
+
"""
|
| 698 |
+
intent = self.classify_intent(query)
|
| 699 |
+
logger.info(f"Query classified as: {intent.value}")
|
| 700 |
+
|
| 701 |
+
if intent == QueryIntent.GENERAL:
|
| 702 |
+
return None, intent
|
| 703 |
+
|
| 704 |
+
template = self.get_template(intent)
|
| 705 |
+
if template is None:
|
| 706 |
+
logger.warning(f"No template found for intent: {intent.value}")
|
| 707 |
+
return None, intent
|
| 708 |
+
|
| 709 |
+
try:
|
| 710 |
+
results = template.execute(graph)
|
| 711 |
+
if results:
|
| 712 |
+
logger.info(f"Template returned {len(results)} results")
|
| 713 |
+
return results, intent
|
| 714 |
+
else:
|
| 715 |
+
logger.warning("Template returned empty results")
|
| 716 |
+
return [], intent
|
| 717 |
+
except Exception as e:
|
| 718 |
+
logger.warning(f"Template execution error: {e}")
|
| 719 |
+
return None, intent
|
| 720 |
+
|
| 721 |
+
def get_all_intents(self) -> List[QueryIntent]:
|
| 722 |
+
"""Get list of all supported intents (excluding GENERAL)."""
|
| 723 |
+
return [intent for intent in QueryIntent if intent != QueryIntent.GENERAL]
|
| 724 |
+
|
| 725 |
+
def get_template_description(self, intent: QueryIntent) -> str:
|
| 726 |
+
"""Get human-readable description of what a template does."""
|
| 727 |
+
template = self.get_template(intent)
|
| 728 |
+
if template:
|
| 729 |
+
return template.description
|
| 730 |
+
return f"No template available for {intent.value}"
|
| 731 |
+
|
| 732 |
+
|
| 733 |
+
# =========================================================================
|
| 734 |
+
# RESULT FORMATTERS
|
| 735 |
+
# =========================================================================
|
| 736 |
+
# These functions format Cypher results into human-readable markdown
|
| 737 |
+
# without requiring LLM synthesis.
|
| 738 |
+
|
| 739 |
+
class TemplateResultFormatter:
|
| 740 |
+
"""Formats template results into markdown without LLM."""
|
| 741 |
+
|
| 742 |
+
# Standard message for missing information
|
| 743 |
+
NOT_FOUND_MSG = "I couldn't find this information in the provided documents."
|
| 744 |
+
|
| 745 |
+
@staticmethod
|
| 746 |
+
def format_budget(results: List[Dict]) -> str:
|
| 747 |
+
"""Format budget results."""
|
| 748 |
+
if not results:
|
| 749 |
+
return "I couldn't find any budget information in the provided documents."
|
| 750 |
+
|
| 751 |
+
lines = ["## Budget Information\n"]
|
| 752 |
+
for r in results:
|
| 753 |
+
project = r.get('project') or 'Unknown Project'
|
| 754 |
+
budget = r.get('budget')
|
| 755 |
+
currency = r.get('currency') or ''
|
| 756 |
+
status = r.get('status') or ''
|
| 757 |
+
|
| 758 |
+
if budget is not None:
|
| 759 |
+
if isinstance(budget, (int, float)):
|
| 760 |
+
budget_str = f"{budget:,.0f} {currency}".strip()
|
| 761 |
+
else:
|
| 762 |
+
budget_str = f"{budget} {currency}".strip()
|
| 763 |
+
else:
|
| 764 |
+
budget_str = "Not available"
|
| 765 |
+
|
| 766 |
+
status_str = f" ({status})" if status else ""
|
| 767 |
+
lines.append(f"- **{project}**{status_str}: {budget_str}")
|
| 768 |
+
|
| 769 |
+
return "\n".join(lines)
|
| 770 |
+
|
| 771 |
+
@staticmethod
|
| 772 |
+
def format_location(results: List[Dict]) -> str:
|
| 773 |
+
"""Format location results."""
|
| 774 |
+
if not results:
|
| 775 |
+
return "I couldn't find any location information in the provided documents."
|
| 776 |
+
|
| 777 |
+
lines = ["## Location Information\n"]
|
| 778 |
+
for r in results:
|
| 779 |
+
project = r.get('project') or 'Unknown Project'
|
| 780 |
+
loc_parts = [
|
| 781 |
+
r.get('address'),
|
| 782 |
+
r.get('city'),
|
| 783 |
+
r.get('state'),
|
| 784 |
+
r.get('country'),
|
| 785 |
+
]
|
| 786 |
+
loc = ", ".join([p for p in loc_parts if p]) or "Not available"
|
| 787 |
+
lines.append(f"- **{project}**: {loc}")
|
| 788 |
+
|
| 789 |
+
return "\n".join(lines)
|
| 790 |
+
|
| 791 |
+
@staticmethod
|
| 792 |
+
def format_budget_location(results: List[Dict]) -> str:
|
| 793 |
+
"""Format combined budget and location results."""
|
| 794 |
+
if not results:
|
| 795 |
+
return "I couldn't find any budget or location information in the provided documents."
|
| 796 |
+
|
| 797 |
+
lines = ["## Budget Allocation and Location\n"]
|
| 798 |
+
for r in results:
|
| 799 |
+
project = r.get('project') or 'Unknown Project'
|
| 800 |
+
status = r.get('status') or ''
|
| 801 |
+
|
| 802 |
+
# Format budget
|
| 803 |
+
budget = r.get('budget')
|
| 804 |
+
currency = r.get('currency') or ''
|
| 805 |
+
if budget is not None:
|
| 806 |
+
if isinstance(budget, (int, float)):
|
| 807 |
+
budget_str = f"{budget:,.0f} {currency}".strip()
|
| 808 |
+
else:
|
| 809 |
+
budget_str = f"{budget} {currency}".strip()
|
| 810 |
+
else:
|
| 811 |
+
budget_str = "Not available"
|
| 812 |
+
|
| 813 |
+
# Format location
|
| 814 |
+
loc_parts = [r.get('city'), r.get('state'), r.get('country')]
|
| 815 |
+
loc = ", ".join([p for p in loc_parts if p]) or "Not available"
|
| 816 |
+
|
| 817 |
+
status_str = f" *({status})*" if status else ""
|
| 818 |
+
lines.append(f"\n### {project}{status_str}")
|
| 819 |
+
lines.append(f"- **Budget (TIV)**: {budget_str}")
|
| 820 |
+
lines.append(f"- **Location**: {loc}")
|
| 821 |
+
|
| 822 |
+
if r.get('address'):
|
| 823 |
+
lines.append(f"- **Address**: {r['address']}")
|
| 824 |
+
if r.get('zoneCounty'):
|
| 825 |
+
lines.append(f"- **Zone/County**: {r['zoneCounty']}")
|
| 826 |
+
|
| 827 |
+
return "\n".join(lines)
|
| 828 |
+
|
| 829 |
+
@staticmethod
|
| 830 |
+
def format_timeline(results: List[Dict]) -> str:
|
| 831 |
+
"""Format timeline/milestone results."""
|
| 832 |
+
if not results:
|
| 833 |
+
return "I couldn't find any timeline information in the provided documents."
|
| 834 |
+
|
| 835 |
+
lines = ["## Project Timelines\n"]
|
| 836 |
+
for r in results:
|
| 837 |
+
project = r.get('project') or 'Unknown Project'
|
| 838 |
+
status = r.get('status') or ''
|
| 839 |
+
milestones = r.get('milestones') or []
|
| 840 |
+
|
| 841 |
+
status_str = f" *({status})*" if status else ""
|
| 842 |
+
lines.append(f"\n### {project}{status_str}")
|
| 843 |
+
|
| 844 |
+
# Filter out null milestones
|
| 845 |
+
valid_milestones = [
|
| 846 |
+
m for m in milestones
|
| 847 |
+
if m and (m.get('name') or m.get('date'))
|
| 848 |
+
]
|
| 849 |
+
|
| 850 |
+
if not valid_milestones:
|
| 851 |
+
lines.append("- No milestones recorded")
|
| 852 |
+
else:
|
| 853 |
+
for m in valid_milestones[:12]: # Limit display
|
| 854 |
+
name = m.get('name') or 'Milestone'
|
| 855 |
+
date = m.get('date') or ''
|
| 856 |
+
detail = m.get('detail') or ''
|
| 857 |
+
|
| 858 |
+
if date:
|
| 859 |
+
lines.append(f"- **{name}**: {date}")
|
| 860 |
+
elif detail:
|
| 861 |
+
lines.append(f"- **{name}**: {detail[:100]}...")
|
| 862 |
+
else:
|
| 863 |
+
lines.append(f"- {name}")
|
| 864 |
+
|
| 865 |
+
return "\n".join(lines)
|
| 866 |
+
|
| 867 |
+
@staticmethod
|
| 868 |
+
def format_challenges(results: List[Dict]) -> str:
|
| 869 |
+
"""Format challenges results."""
|
| 870 |
+
if not results:
|
| 871 |
+
return "I couldn't find any challenge or risk information in the provided documents."
|
| 872 |
+
|
| 873 |
+
lines = ["## Project Challenges and Constraints\n"]
|
| 874 |
+
for r in results:
|
| 875 |
+
project = r.get('project') or 'Unknown Project'
|
| 876 |
+
status = r.get('status') or ''
|
| 877 |
+
status_reason = r.get('statusReason') or ''
|
| 878 |
+
challenges = r.get('challenges') or []
|
| 879 |
+
|
| 880 |
+
lines.append(f"\n### {project}")
|
| 881 |
+
|
| 882 |
+
if status:
|
| 883 |
+
lines.append(f"**Status**: {status}")
|
| 884 |
+
if status_reason:
|
| 885 |
+
lines.append(f"**Status Reason**: {status_reason}")
|
| 886 |
+
|
| 887 |
+
# Filter out None/empty challenges
|
| 888 |
+
valid_challenges = [c for c in challenges if c]
|
| 889 |
+
|
| 890 |
+
if valid_challenges:
|
| 891 |
+
lines.append("\n**Identified Challenges:**")
|
| 892 |
+
for ch in valid_challenges[:10]:
|
| 893 |
+
lines.append(f"- {ch}")
|
| 894 |
+
elif status_reason:
|
| 895 |
+
lines.append("\n*Challenges inferred from status reason above.*")
|
| 896 |
+
else:
|
| 897 |
+
lines.append("- No specific challenges recorded")
|
| 898 |
+
|
| 899 |
+
return "\n".join(lines)
|
| 900 |
+
|
| 901 |
+
@staticmethod
|
| 902 |
+
def format_contacts(results: List[Dict]) -> str:
|
| 903 |
+
"""Format contact/personnel information results."""
|
| 904 |
+
if not results:
|
| 905 |
+
return "I couldn't find any contact or personnel information in the provided documents."
|
| 906 |
+
|
| 907 |
+
lines = ["## Project Contacts and Personnel\n"]
|
| 908 |
+
|
| 909 |
+
for r in results:
|
| 910 |
+
project = r.get('project') or 'Unknown Project'
|
| 911 |
+
lines.append(f"\n### {project}")
|
| 912 |
+
|
| 913 |
+
has_any_contact = False
|
| 914 |
+
|
| 915 |
+
# Project Manager
|
| 916 |
+
pm_name = r.get('projectManager')
|
| 917 |
+
if pm_name:
|
| 918 |
+
has_any_contact = True
|
| 919 |
+
pm_info = pm_name
|
| 920 |
+
if r.get('projectManagerTitle'):
|
| 921 |
+
pm_info += f", {r['projectManagerTitle']}"
|
| 922 |
+
if r.get('projectManagerCompany'):
|
| 923 |
+
pm_info += f" ({r['projectManagerCompany']})"
|
| 924 |
+
lines.append(f"- **Project Manager**: {pm_info}")
|
| 925 |
+
if r.get('projectManagerEmail'):
|
| 926 |
+
lines.append(f" - Email: {r['projectManagerEmail']}")
|
| 927 |
+
if r.get('projectManagerPhone'):
|
| 928 |
+
lines.append(f" - Phone: {r['projectManagerPhone']}")
|
| 929 |
+
|
| 930 |
+
# Owner
|
| 931 |
+
plant_owner = r.get('plantOwner')
|
| 932 |
+
if plant_owner:
|
| 933 |
+
has_any_contact = True
|
| 934 |
+
owner_info = plant_owner
|
| 935 |
+
if r.get('plantParent'):
|
| 936 |
+
owner_info += f" (Parent: {r['plantParent']})"
|
| 937 |
+
lines.append(f"- **Owner**: {owner_info}")
|
| 938 |
+
if r.get('plantName'):
|
| 939 |
+
lines.append(f" - Plant/Facility: {r['plantName']}")
|
| 940 |
+
|
| 941 |
+
# Engineer
|
| 942 |
+
if r.get('engineerCompany'):
|
| 943 |
+
has_any_contact = True
|
| 944 |
+
lines.append(f"- **Engineer**: {r['engineerCompany']}")
|
| 945 |
+
|
| 946 |
+
# E&C Firm
|
| 947 |
+
if r.get('ecFirm'):
|
| 948 |
+
has_any_contact = True
|
| 949 |
+
lines.append(f"- **E&C Firm**: {r['ecFirm']}")
|
| 950 |
+
|
| 951 |
+
# General phone
|
| 952 |
+
if r.get('phone'):
|
| 953 |
+
has_any_contact = True
|
| 954 |
+
lines.append(f"- **Phone**: {r['phone']}")
|
| 955 |
+
|
| 956 |
+
if not has_any_contact:
|
| 957 |
+
lines.append("- No contact information available")
|
| 958 |
+
|
| 959 |
+
return "\n".join(lines)
|
| 960 |
+
|
| 961 |
+
@staticmethod
|
| 962 |
+
def format_technical(results: List[Dict]) -> str:
|
| 963 |
+
"""Format technical details and specifications results."""
|
| 964 |
+
if not results:
|
| 965 |
+
return "I couldn't find any technical specifications in the provided documents."
|
| 966 |
+
|
| 967 |
+
lines = ["## Technical Details and Specifications\n"]
|
| 968 |
+
|
| 969 |
+
for r in results:
|
| 970 |
+
project = r.get('project') or 'Unknown Project'
|
| 971 |
+
lines.append(f"\n### {project}")
|
| 972 |
+
|
| 973 |
+
has_any_technical = False
|
| 974 |
+
|
| 975 |
+
# Classification
|
| 976 |
+
if r.get('industryCode') or r.get('projectType') or r.get('sector'):
|
| 977 |
+
has_any_technical = True
|
| 978 |
+
lines.append("- **Classification**:")
|
| 979 |
+
if r.get('industryCode'):
|
| 980 |
+
lines.append(f" - Industry: {r['industryCode']}")
|
| 981 |
+
if r.get('projectType'):
|
| 982 |
+
lines.append(f" - Type: {r['projectType']}")
|
| 983 |
+
if r.get('sector'):
|
| 984 |
+
lines.append(f" - Sector: {r['sector']}")
|
| 985 |
+
if r.get('sicCode'):
|
| 986 |
+
lines.append(f" - SIC Code: {r['sicCode']}")
|
| 987 |
+
if r.get('sicProduct'):
|
| 988 |
+
lines.append(f" - SIC Product: {r['sicProduct']}")
|
| 989 |
+
|
| 990 |
+
# PEC Stage
|
| 991 |
+
if r.get('pecTiming') or r.get('pecActivity'):
|
| 992 |
+
has_any_technical = True
|
| 993 |
+
pec = f"{r.get('pecTiming', '')} - {r.get('pecActivity', '')}".strip(' -')
|
| 994 |
+
if pec:
|
| 995 |
+
lines.append(f"- **PEC Stage**: {pec}")
|
| 996 |
+
|
| 997 |
+
# Capacity
|
| 998 |
+
if r.get('projectCapacity'):
|
| 999 |
+
has_any_technical = True
|
| 1000 |
+
lines.append(f"- **Project Capacity**: {r['projectCapacity']}")
|
| 1001 |
+
|
| 1002 |
+
# Scope
|
| 1003 |
+
if r.get('scopeText'):
|
| 1004 |
+
has_any_technical = True
|
| 1005 |
+
scope = r['scopeText']
|
| 1006 |
+
if len(scope) > 300:
|
| 1007 |
+
scope = scope[:300] + "..."
|
| 1008 |
+
lines.append(f"- **Scope**: {scope}")
|
| 1009 |
+
|
| 1010 |
+
# Environmental
|
| 1011 |
+
if r.get('environmental'):
|
| 1012 |
+
has_any_technical = True
|
| 1013 |
+
lines.append(f"- **Environmental**: {r['environmental']}")
|
| 1014 |
+
|
| 1015 |
+
# Labor
|
| 1016 |
+
if r.get('constructionLabor') or r.get('operationsLabor'):
|
| 1017 |
+
has_any_technical = True
|
| 1018 |
+
labor_parts = []
|
| 1019 |
+
if r.get('constructionLabor'):
|
| 1020 |
+
labor_parts.append(f"Construction: {r['constructionLabor']}")
|
| 1021 |
+
if r.get('operationsLabor'):
|
| 1022 |
+
labor_parts.append(f"Operations: {r['operationsLabor']}")
|
| 1023 |
+
lines.append(f"- **Labor**: {', '.join(labor_parts)}")
|
| 1024 |
+
|
| 1025 |
+
# Fuel type
|
| 1026 |
+
if r.get('fuelType'):
|
| 1027 |
+
has_any_technical = True
|
| 1028 |
+
lines.append(f"- **Fuel Type**: {r['fuelType']}")
|
| 1029 |
+
|
| 1030 |
+
# Unit
|
| 1031 |
+
if r.get('unitName'):
|
| 1032 |
+
has_any_technical = True
|
| 1033 |
+
lines.append(f"- **Unit**: {r['unitName']}")
|
| 1034 |
+
|
| 1035 |
+
if not has_any_technical:
|
| 1036 |
+
lines.append("- No technical specifications available")
|
| 1037 |
+
|
| 1038 |
+
return "\n".join(lines)
|
| 1039 |
+
|
| 1040 |
+
@staticmethod
|
| 1041 |
+
def format_comparison(results: List[Dict]) -> str:
|
| 1042 |
+
"""Format comparison results with comprehensive project details."""
|
| 1043 |
+
if not results:
|
| 1044 |
+
return "I couldn't find any project data for comparison in the provided documents."
|
| 1045 |
+
|
| 1046 |
+
lines = ["## Project Comparison\n"]
|
| 1047 |
+
|
| 1048 |
+
for r in results:
|
| 1049 |
+
project = r.get('project') or 'Unknown'
|
| 1050 |
+
lines.append(f"### {project}")
|
| 1051 |
+
|
| 1052 |
+
# Status section
|
| 1053 |
+
status = r.get('status')
|
| 1054 |
+
if status:
|
| 1055 |
+
lines.append(f"- **Status**: {status}")
|
| 1056 |
+
if r.get('statusReason'):
|
| 1057 |
+
lines.append(f" - Reason: {r['statusReason']}")
|
| 1058 |
+
if r.get('projectProbability'):
|
| 1059 |
+
lines.append(f" - Probability: {r['projectProbability']}")
|
| 1060 |
+
|
| 1061 |
+
# Classification
|
| 1062 |
+
if r.get('industryCode') or r.get('projectType') or r.get('sector'):
|
| 1063 |
+
lines.append("- **Classification**:")
|
| 1064 |
+
if r.get('industryCode'):
|
| 1065 |
+
lines.append(f" - Industry: {r['industryCode']}")
|
| 1066 |
+
if r.get('projectType'):
|
| 1067 |
+
lines.append(f" - Type: {r['projectType']}")
|
| 1068 |
+
if r.get('sector'):
|
| 1069 |
+
lines.append(f" - Sector: {r['sector']}")
|
| 1070 |
+
if r.get('sicCode'):
|
| 1071 |
+
lines.append(f" - SIC Code: {r['sicCode']}")
|
| 1072 |
+
|
| 1073 |
+
# Budget
|
| 1074 |
+
budget = r.get('budget')
|
| 1075 |
+
currency = r.get('currency') or ''
|
| 1076 |
+
if budget is not None and isinstance(budget, (int, float)):
|
| 1077 |
+
if budget >= 1_000_000_000:
|
| 1078 |
+
budget_str = f"{budget/1_000_000_000:.1f}B {currency}".strip()
|
| 1079 |
+
elif budget >= 1_000_000:
|
| 1080 |
+
budget_str = f"{budget/1_000_000:.0f}M {currency}".strip()
|
| 1081 |
+
else:
|
| 1082 |
+
budget_str = f"{budget:,.0f} {currency}".strip()
|
| 1083 |
+
lines.append(f"- **Budget (TIV)**: {budget_str}")
|
| 1084 |
+
|
| 1085 |
+
# Location
|
| 1086 |
+
loc_parts = [r.get('address'), r.get('city'), r.get('state'), r.get('country')]
|
| 1087 |
+
loc_parts = [p for p in loc_parts if p]
|
| 1088 |
+
if loc_parts:
|
| 1089 |
+
lines.append(f"- **Location**: {', '.join(loc_parts)}")
|
| 1090 |
+
|
| 1091 |
+
# Capacity/Technical
|
| 1092 |
+
if r.get('projectCapacity'):
|
| 1093 |
+
lines.append(f"- **Project Capacity**: {r['projectCapacity']}")
|
| 1094 |
+
if r.get('pecTiming') or r.get('pecActivity'):
|
| 1095 |
+
pec = f"{r.get('pecTiming', '')} - {r.get('pecActivity', '')}".strip(' -')
|
| 1096 |
+
if pec:
|
| 1097 |
+
lines.append(f"- **PEC Stage**: {pec}")
|
| 1098 |
+
|
| 1099 |
+
# Contacts section
|
| 1100 |
+
pm_name = r.get('projectManager')
|
| 1101 |
+
pm_company = r.get('projectManagerCompany')
|
| 1102 |
+
pm_title = r.get('projectManagerTitle')
|
| 1103 |
+
plant_owner = r.get('plantOwner')
|
| 1104 |
+
plant_parent = r.get('plantParent')
|
| 1105 |
+
engineer = r.get('engineerCompany')
|
| 1106 |
+
ec_firm = r.get('ecFirm')
|
| 1107 |
+
|
| 1108 |
+
if any([pm_name, plant_owner, engineer, ec_firm]):
|
| 1109 |
+
lines.append("- **Key Contacts**:")
|
| 1110 |
+
if pm_name:
|
| 1111 |
+
pm_info = pm_name
|
| 1112 |
+
if pm_title:
|
| 1113 |
+
pm_info += f", {pm_title}"
|
| 1114 |
+
if pm_company:
|
| 1115 |
+
pm_info += f" ({pm_company})"
|
| 1116 |
+
lines.append(f" - Project Manager: {pm_info}")
|
| 1117 |
+
if plant_owner:
|
| 1118 |
+
owner_info = plant_owner
|
| 1119 |
+
if plant_parent:
|
| 1120 |
+
owner_info += f" (Parent: {plant_parent})"
|
| 1121 |
+
lines.append(f" - Owner: {owner_info}")
|
| 1122 |
+
if engineer:
|
| 1123 |
+
lines.append(f" - Engineer: {engineer}")
|
| 1124 |
+
if ec_firm:
|
| 1125 |
+
lines.append(f" - E&C Firm: {ec_firm}")
|
| 1126 |
+
|
| 1127 |
+
# Plant info
|
| 1128 |
+
if r.get('plantName'):
|
| 1129 |
+
lines.append(f"- **Plant/Facility**: {r['plantName']}")
|
| 1130 |
+
|
| 1131 |
+
# Milestones and Challenges counts
|
| 1132 |
+
ms = r.get('milestones') or []
|
| 1133 |
+
ch = r.get('challenges') or []
|
| 1134 |
+
if isinstance(ms, list):
|
| 1135 |
+
milestone_count = len([m for m in ms if m and m.get('name')])
|
| 1136 |
+
else:
|
| 1137 |
+
milestone_count = 0
|
| 1138 |
+
if isinstance(ch, list):
|
| 1139 |
+
challenge_count = len([c for c in ch if c])
|
| 1140 |
+
else:
|
| 1141 |
+
challenge_count = 0
|
| 1142 |
+
|
| 1143 |
+
lines.append(f"- **Milestones**: {milestone_count}")
|
| 1144 |
+
lines.append(f"- **Challenges**: {challenge_count}")
|
| 1145 |
+
lines.append("")
|
| 1146 |
+
|
| 1147 |
+
return "\n".join(lines)
|
| 1148 |
+
|
| 1149 |
+
@staticmethod
|
| 1150 |
+
def format_overview(results: List[Dict]) -> str:
|
| 1151 |
+
"""Format comprehensive project overview results."""
|
| 1152 |
+
if not results:
|
| 1153 |
+
return "I couldn't find any project data in the provided documents."
|
| 1154 |
+
|
| 1155 |
+
lines = ["## Project Overview\n"]
|
| 1156 |
+
for r in results:
|
| 1157 |
+
project = r.get('project') or 'Unknown Project'
|
| 1158 |
+
lines.append(f"\n### {project}")
|
| 1159 |
+
|
| 1160 |
+
# Basic identification
|
| 1161 |
+
if r.get('projectId'):
|
| 1162 |
+
lines.append(f"- **Project ID**: {r['projectId']}")
|
| 1163 |
+
|
| 1164 |
+
# Status section
|
| 1165 |
+
if r.get('status'):
|
| 1166 |
+
lines.append(f"- **Status**: {r['status']}")
|
| 1167 |
+
if r.get('statusReason'):
|
| 1168 |
+
lines.append(f" - Reason: {r['statusReason']}")
|
| 1169 |
+
if r.get('projectProbability'):
|
| 1170 |
+
lines.append(f" - Probability: {r['projectProbability']}")
|
| 1171 |
+
|
| 1172 |
+
# Classification section
|
| 1173 |
+
has_classification = any([r.get('industryCode'), r.get('projectType'),
|
| 1174 |
+
r.get('sector'), r.get('sicCode')])
|
| 1175 |
+
if has_classification:
|
| 1176 |
+
lines.append("- **Classification**:")
|
| 1177 |
+
if r.get('industryCode'):
|
| 1178 |
+
lines.append(f" - Industry: {r['industryCode']}")
|
| 1179 |
+
if r.get('projectType'):
|
| 1180 |
+
lines.append(f" - Type: {r['projectType']}")
|
| 1181 |
+
if r.get('sector'):
|
| 1182 |
+
lines.append(f" - Sector: {r['sector']}")
|
| 1183 |
+
if r.get('sicCode'):
|
| 1184 |
+
lines.append(f" - SIC Code: {r['sicCode']}")
|
| 1185 |
+
|
| 1186 |
+
# Budget
|
| 1187 |
+
if r.get('budget') is not None:
|
| 1188 |
+
budget = r['budget']
|
| 1189 |
+
currency = r.get('currency') or ''
|
| 1190 |
+
if isinstance(budget, (int, float)):
|
| 1191 |
+
if budget >= 1_000_000_000:
|
| 1192 |
+
budget_str = f"{budget/1_000_000_000:.1f}B {currency}".strip()
|
| 1193 |
+
elif budget >= 1_000_000:
|
| 1194 |
+
budget_str = f"{budget/1_000_000:.0f}M {currency}".strip()
|
| 1195 |
+
else:
|
| 1196 |
+
budget_str = f"{budget:,.0f} {currency}".strip()
|
| 1197 |
+
else:
|
| 1198 |
+
budget_str = f"{budget} {currency}".strip()
|
| 1199 |
+
lines.append(f"- **Budget (TIV)**: {budget_str}")
|
| 1200 |
+
|
| 1201 |
+
# Location
|
| 1202 |
+
loc_parts = [r.get('address'), r.get('city'), r.get('state'), r.get('country')]
|
| 1203 |
+
loc_parts = [p for p in loc_parts if p]
|
| 1204 |
+
if loc_parts:
|
| 1205 |
+
lines.append(f"- **Location**: {', '.join(loc_parts)}")
|
| 1206 |
+
|
| 1207 |
+
# Technical details
|
| 1208 |
+
if r.get('projectCapacity'):
|
| 1209 |
+
lines.append(f"- **Project Capacity**: {r['projectCapacity']}")
|
| 1210 |
+
if r.get('pecTiming') or r.get('pecActivity'):
|
| 1211 |
+
pec = f"{r.get('pecTiming', '')} - {r.get('pecActivity', '')}".strip(' -')
|
| 1212 |
+
if pec:
|
| 1213 |
+
lines.append(f"- **PEC Stage**: {pec}")
|
| 1214 |
+
if r.get('fuelType'):
|
| 1215 |
+
lines.append(f"- **Fuel Type**: {r['fuelType']}")
|
| 1216 |
+
if r.get('unitName'):
|
| 1217 |
+
lines.append(f"- **Unit**: {r['unitName']}")
|
| 1218 |
+
|
| 1219 |
+
# Labor information
|
| 1220 |
+
if r.get('constructionLabor') or r.get('operationsLabor'):
|
| 1221 |
+
labor_info = []
|
| 1222 |
+
if r.get('constructionLabor'):
|
| 1223 |
+
labor_info.append(f"Construction: {r['constructionLabor']}")
|
| 1224 |
+
if r.get('operationsLabor'):
|
| 1225 |
+
labor_info.append(f"Operations: {r['operationsLabor']}")
|
| 1226 |
+
lines.append(f"- **Labor**: {', '.join(labor_info)}")
|
| 1227 |
+
|
| 1228 |
+
# Contacts section
|
| 1229 |
+
pm_name = r.get('projectManager')
|
| 1230 |
+
pm_company = r.get('projectManagerCompany')
|
| 1231 |
+
pm_title = r.get('projectManagerTitle')
|
| 1232 |
+
plant_owner = r.get('plantOwner')
|
| 1233 |
+
plant_parent = r.get('plantParent')
|
| 1234 |
+
plant_name = r.get('plantName')
|
| 1235 |
+
engineer = r.get('engineerCompany')
|
| 1236 |
+
ec_firm = r.get('ecFirm')
|
| 1237 |
+
|
| 1238 |
+
if any([pm_name, plant_owner, engineer, ec_firm]):
|
| 1239 |
+
lines.append("- **Key Contacts**:")
|
| 1240 |
+
if pm_name:
|
| 1241 |
+
pm_info = pm_name
|
| 1242 |
+
if pm_title:
|
| 1243 |
+
pm_info += f", {pm_title}"
|
| 1244 |
+
if pm_company:
|
| 1245 |
+
pm_info += f" ({pm_company})"
|
| 1246 |
+
lines.append(f" - Project Manager: {pm_info}")
|
| 1247 |
+
if plant_owner:
|
| 1248 |
+
owner_info = plant_owner
|
| 1249 |
+
if plant_parent:
|
| 1250 |
+
owner_info += f" (Parent: {plant_parent})"
|
| 1251 |
+
lines.append(f" - Owner: {owner_info}")
|
| 1252 |
+
if engineer:
|
| 1253 |
+
lines.append(f" - Engineer: {engineer}")
|
| 1254 |
+
if ec_firm:
|
| 1255 |
+
lines.append(f" - E&C Firm: {ec_firm}")
|
| 1256 |
+
|
| 1257 |
+
# Plant/Facility info
|
| 1258 |
+
if plant_name:
|
| 1259 |
+
lines.append(f"- **Plant/Facility**: {plant_name}")
|
| 1260 |
+
|
| 1261 |
+
# Report dates
|
| 1262 |
+
if r.get('lastUpdate') or r.get('initialRelease'):
|
| 1263 |
+
lines.append("- **Report Info**:")
|
| 1264 |
+
if r.get('lastUpdate'):
|
| 1265 |
+
lines.append(f" - Last Updated: {r['lastUpdate']}")
|
| 1266 |
+
if r.get('initialRelease'):
|
| 1267 |
+
lines.append(f" - Initial Release: {r['initialRelease']}")
|
| 1268 |
+
|
| 1269 |
+
return "\n".join(lines)
|
| 1270 |
+
|
| 1271 |
+
@staticmethod
|
| 1272 |
+
def format_status(results: List[Dict]) -> str:
|
| 1273 |
+
"""Format status results."""
|
| 1274 |
+
if not results:
|
| 1275 |
+
return "I couldn't find any project status information in the provided documents."
|
| 1276 |
+
|
| 1277 |
+
lines = ["## Project Status\n"]
|
| 1278 |
+
for r in results:
|
| 1279 |
+
project = r.get('project') or 'Unknown Project'
|
| 1280 |
+
status = r.get('status') or 'Unknown'
|
| 1281 |
+
reason = r.get('statusReason') or ''
|
| 1282 |
+
last_update = r.get('lastUpdate') or ''
|
| 1283 |
+
|
| 1284 |
+
lines.append(f"\n### {project}")
|
| 1285 |
+
lines.append(f"- **Status**: {status}")
|
| 1286 |
+
if reason:
|
| 1287 |
+
lines.append(f"- **Reason**: {reason}")
|
| 1288 |
+
if last_update:
|
| 1289 |
+
lines.append(f"- **Last Updated**: {last_update}")
|
| 1290 |
+
|
| 1291 |
+
return "\n".join(lines)
|
| 1292 |
+
|
| 1293 |
+
@classmethod
|
| 1294 |
+
def format(cls, results: List[Dict], intent: QueryIntent) -> str:
|
| 1295 |
+
"""Format results based on intent.
|
| 1296 |
+
|
| 1297 |
+
Args:
|
| 1298 |
+
results: Query results
|
| 1299 |
+
intent: Detected intent
|
| 1300 |
+
|
| 1301 |
+
Returns:
|
| 1302 |
+
Formatted markdown string
|
| 1303 |
+
"""
|
| 1304 |
+
formatters = {
|
| 1305 |
+
QueryIntent.BUDGET: cls.format_budget,
|
| 1306 |
+
QueryIntent.LOCATION: cls.format_location,
|
| 1307 |
+
QueryIntent.BUDGET_LOCATION: cls.format_budget_location,
|
| 1308 |
+
QueryIntent.TIMELINE: cls.format_timeline,
|
| 1309 |
+
QueryIntent.TIMELINE_LOCATION: cls.format_timeline, # Use timeline formatter
|
| 1310 |
+
QueryIntent.TIMELINE_BUDGET: cls.format_timeline, # Use timeline formatter
|
| 1311 |
+
QueryIntent.CHALLENGES: cls.format_challenges,
|
| 1312 |
+
QueryIntent.CONTACTS: cls.format_contacts,
|
| 1313 |
+
QueryIntent.TECHNICAL: cls.format_technical,
|
| 1314 |
+
QueryIntent.COMPARISON: cls.format_comparison,
|
| 1315 |
+
QueryIntent.PROJECT_OVERVIEW: cls.format_overview,
|
| 1316 |
+
QueryIntent.PROJECT_STATUS: cls.format_status,
|
| 1317 |
+
}
|
| 1318 |
+
|
| 1319 |
+
formatter = formatters.get(intent)
|
| 1320 |
+
if formatter:
|
| 1321 |
+
return formatter(results)
|
| 1322 |
+
|
| 1323 |
+
# Generic fallback
|
| 1324 |
+
if not results:
|
| 1325 |
+
return "I couldn't find this information in the provided documents."
|
| 1326 |
+
|
| 1327 |
+
lines = ["## Query Results\n"]
|
| 1328 |
+
for r in results:
|
| 1329 |
+
items = [f"**{k}**: {v}" for k, v in r.items() if v is not None]
|
| 1330 |
+
lines.append("- " + " | ".join(items))
|
| 1331 |
+
|
| 1332 |
+
return "\n".join(lines)
|
src/services/neo4j_service.py
ADDED
|
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Neo4j database access layer.
|
| 3 |
+
|
| 4 |
+
Provides centralized Neo4j connectivity and data management
|
| 5 |
+
with Aura/hosted instance best practices.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Any, Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
from neo4j import GraphDatabase, Driver
|
| 13 |
+
from neo4j.exceptions import ServiceUnavailable, AuthError
|
| 14 |
+
|
| 15 |
+
# LangChain Neo4j integration
|
| 16 |
+
try:
|
| 17 |
+
from langchain_community.graphs import Neo4jGraph
|
| 18 |
+
except ImportError:
|
| 19 |
+
from langchain.graphs import Neo4jGraph
|
| 20 |
+
|
| 21 |
+
from src.config import get_logger, log_step
|
| 22 |
+
from src.models.project import ProjectRecord, GeoComponents, Milestone
|
| 23 |
+
from src.parsers.project_parser import ProjectReportParser
|
| 24 |
+
|
| 25 |
+
# Module logger
|
| 26 |
+
logger = get_logger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class Neo4jConnectionError(Exception):
    """Raised when a Neo4j connection cannot be established.

    Wraps driver-level failures (unreachable service, bad credentials,
    or any other connection-time error) behind a single application
    exception type.
    """
    pass
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class Neo4jService:
    """Neo4j access layer with Aura/hosted best practices.

    This class centralizes:
    - Driver construction and connectivity validation
    - LangChain Neo4jGraph wrapper configuration
    - Constraints, structured writes, and database cleanup

    Attributes:
        uri: Neo4j connection URI.
        user: Database username.
        password: Database password.
        database: Database name.
        driver: Low-level Neo4j driver.
        graph: LangChain Neo4jGraph wrapper.

    Raises:
        Neo4jConnectionError: If connection fails.

    Example:
        >>> service = Neo4jService(
        ...     uri="neo4j+s://xxx.databases.neo4j.io",
        ...     user="neo4j",
        ...     password="password"
        ... )
        >>> service.ensure_constraints()
        >>> service.close()
    """

    # Constraint definitions for the structured layer.
    # Uniqueness constraints also act as lookup indexes in Neo4j.
    CONSTRAINTS = [
        "CREATE CONSTRAINT project_id IF NOT EXISTS FOR (p:Project) REQUIRE p.projectId IS UNIQUE",
        "CREATE CONSTRAINT project_name IF NOT EXISTS FOR (p:Project) REQUIRE p.name IS UNIQUE",
        "CREATE CONSTRAINT budget_key IF NOT EXISTS FOR (b:Budget) REQUIRE b.key IS UNIQUE",
        "CREATE CONSTRAINT location_key IF NOT EXISTS FOR (l:Location) REQUIRE l.key IS UNIQUE",
        "CREATE CONSTRAINT milestone_key IF NOT EXISTS FOR (m:Milestone) REQUIRE m.key IS UNIQUE",
        "CREATE CONSTRAINT report_key IF NOT EXISTS FOR (r:Report) REQUIRE r.key IS UNIQUE",
    ]

    # Performance indexes for faster queries on frequently filtered properties.
    INDEXES = [
        "CREATE INDEX project_name_idx IF NOT EXISTS FOR (p:Project) ON (p.name)",
        "CREATE INDEX project_source_idx IF NOT EXISTS FOR (p:Project) ON (p.source)",
        "CREATE INDEX chunk_source_idx IF NOT EXISTS FOR (c:Chunk) ON (c.source)",
        "CREATE INDEX milestone_date_idx IF NOT EXISTS FOR (m:Milestone) ON (m.dateText)",
        "CREATE INDEX location_city_idx IF NOT EXISTS FOR (l:Location) ON (l.city)",
        "CREATE INDEX location_country_idx IF NOT EXISTS FOR (l:Location) ON (l.country)",
        "CREATE INDEX challenge_source_idx IF NOT EXISTS FOR (c:Challenge) ON (c.source)",
    ]

    # Full-text index for semantic search within the graph.
    FULLTEXT_INDEX = """
    CREATE FULLTEXT INDEX entity_fulltext IF NOT EXISTS
    FOR (n:Project|Organization|Location|Milestone|Challenge)
    ON EACH [n.name, n.text, n.description]
    """

    # Cypher template with APOC support.
    # Uses CALL subqueries to handle empty lists properly.
    # NOTE(review): neither CYPHER_UPSERT_WITH_APOC nor CYPHER_UPSERT_NO_APOC is
    # referenced by the methods in this class (upsert_structured_project builds
    # its own queries) — presumably kept for external callers; confirm before removing.
    CYPHER_UPSERT_WITH_APOC = """
    MERGE (p:Project {projectId: $project_id})
      ON CREATE SET p.name = $project_name
      ON MATCH SET p.name = coalesce(p.name, $project_name)
    SET p.source = $source,
        p.status = $status,
        p.statusReason = $status_reason,
        p.lastUpdate = $last_update,
        p.initialRelease = $initial_release

    WITH p
    MERGE (b:Budget {key: $bud_key})
    SET b.amount = $tiv_amount,
        b.currency = $tiv_currency,
        b.kind = 'TIV',
        b.source = $source
    MERGE (p)-[:HAS_BUDGET]->(b)

    WITH p
    MERGE (l:Location {key: $loc_key})
    SET l.address = $address,
        l.city = $city,
        l.state = $state,
        l.postal = $postal,
        l.country = $country,
        l.zoneCounty = $zone_county,
        l.source = $source
    MERGE (p)-[:LOCATED_IN]->(l)

    WITH p
    MERGE (r:Report {key: $rep_key})
    SET r.source = $source,
        r.lastUpdate = $last_update,
        r.initialRelease = $initial_release
    MERGE (p)-[:HAS_REPORT]->(r)

    WITH p
    CALL {
        WITH p
        UNWIND CASE WHEN size($challenges) > 0 THEN $challenges ELSE [null] END AS ch
        WITH p, ch WHERE ch IS NOT NULL
        MERGE (c:Challenge {key: p.projectId + '::ch::' + toString(apoc.util.md5(ch))})
        SET c.text = ch, c.source = $source
        MERGE (p)-[:HAS_CHALLENGE]->(c)
        RETURN count(*) AS chCount
    }

    WITH p
    CALL {
        WITH p
        UNWIND CASE WHEN size($milestones) > 0 THEN $milestones ELSE [null] END AS ms
        WITH p, ms WHERE ms IS NOT NULL
        MERGE (m:Milestone {key: p.projectId + '::ms::' + toString(apoc.util.md5(ms.sentence))})
        SET m.name = ms.name, m.dateText = ms.dateText, m.sentence = ms.sentence, m.source = $source
        MERGE (p)-[:HAS_MILESTONE]->(m)
        RETURN count(*) AS msCount
    }

    RETURN p.projectId AS projectId, p.name AS name
    """

    # Cypher template without APOC (fallback).
    # Uses CALL subqueries to handle empty lists properly; challenge/milestone
    # keys fall back to positional indexes instead of content hashes.
    CYPHER_UPSERT_NO_APOC = """
    MERGE (p:Project {projectId: $project_id})
      ON CREATE SET p.name = $project_name
      ON MATCH SET p.name = coalesce(p.name, $project_name)
    SET p.source = $source,
        p.status = $status,
        p.statusReason = $status_reason,
        p.lastUpdate = $last_update,
        p.initialRelease = $initial_release

    WITH p
    MERGE (b:Budget {key: $bud_key})
    SET b.amount = $tiv_amount,
        b.currency = $tiv_currency,
        b.kind = 'TIV',
        b.source = $source
    MERGE (p)-[:HAS_BUDGET]->(b)

    WITH p
    MERGE (l:Location {key: $loc_key})
    SET l.address = $address,
        l.city = $city,
        l.state = $state,
        l.postal = $postal,
        l.country = $country,
        l.zoneCounty = $zone_county,
        l.source = $source
    MERGE (p)-[:LOCATED_IN]->(l)

    WITH p
    MERGE (r:Report {key: $rep_key})
    SET r.source = $source,
        r.lastUpdate = $last_update,
        r.initialRelease = $initial_release
    MERGE (p)-[:HAS_REPORT]->(r)

    WITH p
    CALL {
        WITH p
        UNWIND CASE WHEN size($challenges) > 0 THEN range(0, size($challenges)-1) ELSE [null] END AS i
        WITH p, i WHERE i IS NOT NULL
        MERGE (c:Challenge {key: p.projectId + '::ch::' + toString(i)})
        SET c.text = $challenges[i], c.source = $source
        MERGE (p)-[:HAS_CHALLENGE]->(c)
        RETURN count(*) AS chCount
    }

    WITH p
    CALL {
        WITH p
        UNWIND CASE WHEN size($milestones) > 0 THEN range(0, size($milestones)-1) ELSE [null] END AS j
        WITH p, j WHERE j IS NOT NULL
        MERGE (m:Milestone {key: p.projectId + '::ms::' + toString(j)})
        SET m.name = $milestones[j].name, m.dateText = $milestones[j].dateText,
            m.sentence = $milestones[j].sentence, m.source = $source
        MERGE (p)-[:HAS_MILESTONE]->(m)
        RETURN count(*) AS msCount
    }

    RETURN p.projectId AS projectId, p.name AS name
    """

    def __init__(
        self,
        uri: str,
        user: str,
        password: str,
        database: str = "neo4j"
    ) -> None:
        """Initialize Neo4j service.

        Args:
            uri: Neo4j URI (typically neo4j+s://... for Aura).
            user: Neo4j username.
            password: Neo4j password.
            database: Neo4j database name (Aura commonly uses "neo4j").

        Raises:
            Neo4jConnectionError: If connection or authentication fails.
        """
        self.uri = uri
        self.user = user
        self.password = password
        # Guard against an empty/None database name.
        self.database = database or "neo4j"

        logger.info(f"Connecting to Neo4j: {uri}")
        try:
            # Low-level driver for constraint management and transactional writes.
            # NOTE(review): logger.substep appears to be a custom method supplied
            # by src.config.get_logger — confirm it exists on that logger type.
            logger.substep("Creating driver")
            self.driver: Driver = GraphDatabase.driver(uri, auth=(user, password))
            # Fail fast on unreachable/misconfigured instances.
            self.driver.verify_connectivity()
            logger.substep("Driver connectivity verified")

            # LangChain wrapper for GraphCypherQAChain and graph operations.
            logger.substep("Initializing Neo4jGraph wrapper")
            self.graph: Neo4jGraph = Neo4jGraph(
                url=uri,
                username=user,
                password=password,
                database=self.database
            )
            logger.info(f"Connected to Neo4j database: {self.database}")
        except ServiceUnavailable as e:
            logger.error(f"Service unavailable: {e}")
            raise Neo4jConnectionError(
                f"Could not connect to Neo4j at {uri}. "
                f"Ensure the URI is correct and the database is running. "
                f"Error: {e}"
            ) from e
        except AuthError as e:
            logger.error(f"Authentication failed: {e}")
            raise Neo4jConnectionError(
                f"Authentication failed for Neo4j. "
                f"Check username and password. Error: {e}"
            ) from e
        except Exception as e:
            # Catch-all so every connection failure surfaces as Neo4jConnectionError.
            logger.error(f"Connection failed: {e}")
            raise Neo4jConnectionError(
                f"Failed to connect to Neo4j: {e}"
            ) from e

        # Parser used to derive geo components, challenges, and milestones
        # during structured upserts.
        self._parser = ProjectReportParser()

    def close(self) -> None:
        """Close the underlying Neo4j driver.

        Errors during close are logged and suppressed so shutdown never fails.
        """
        logger.debug("Closing Neo4j driver")
        try:
            self.driver.close()
            logger.debug("Neo4j driver closed")
        except Exception as e:
            logger.warning(f"Error closing driver: {e}")

    def ensure_constraints(self) -> None:
        """Create constraints for the structured layer.

        Notes:
            Some Aura tiers or policies may restrict certain DDL operations.
            Failures are logged but swallowed to keep ingestion operational.
        """
        with log_step(logger, "Create database constraints"):
            success_count = 0
            with self.driver.session(database=self.database) as session:
                for stmt in self.CONSTRAINTS:
                    try:
                        session.run(stmt)
                        success_count += 1
                    except Exception as e:
                        # Deliberate best-effort: a restricted tier must not stop ingestion.
                        logger.debug(f"Constraint skipped: {e}")
            logger.info(f"Constraints created: {success_count}/{len(self.CONSTRAINTS)}")

        # Also create performance indexes
        self.ensure_indexes()

    def ensure_indexes(self) -> None:
        """Create performance indexes for faster queries.

        Creates indexes on frequently queried properties and
        optionally a full-text index for semantic search.
        Failures are logged at debug level and swallowed (best-effort).
        """
        with log_step(logger, "Create performance indexes"):
            success_count = 0
            with self.driver.session(database=self.database) as session:
                for stmt in self.INDEXES:
                    try:
                        session.run(stmt)
                        success_count += 1
                    except Exception as e:
                        logger.debug(f"Index skipped: {e}")

                # Try to create full-text index (may not be available on all tiers)
                try:
                    session.run(self.FULLTEXT_INDEX)
                    logger.substep("Full-text index created")
                except Exception as e:
                    logger.debug(f"Full-text index skipped: {e}")

            logger.info(f"Indexes created: {success_count}/{len(self.INDEXES)}")

    def get_statistics(self) -> Dict[str, Any]:
        """Get database statistics for monitoring.

        Returns:
            Dictionary with node/relationship counts and other stats.
            A value of -1 means the corresponding count query failed.
        """
        stats: Dict[str, Any] = {}

        queries = {
            "node_count": "MATCH (n) RETURN count(n) AS count",
            "relationship_count": "MATCH ()-[r]->() RETURN count(r) AS count",
            "project_count": "MATCH (p:Project) RETURN count(p) AS count",
            "chunk_count": "MATCH (c:Chunk) RETURN count(c) AS count",
            "entity_count": "MATCH (e) WHERE NOT e:Chunk AND NOT e:Project RETURN count(e) AS count",
        }

        for name, query in queries.items():
            try:
                result = self.graph.query(query)
                stats[name] = result[0]["count"] if result else 0
            except Exception:
                # -1 signals "count unavailable" rather than zero.
                stats[name] = -1

        return stats

    def clear(self) -> None:
        """Delete all nodes and relationships from the database.

        Warning: irreversible — wipes the entire target database.
        """
        logger.info("Clearing all nodes and relationships from database")
        self.graph.query("MATCH (n) DETACH DELETE n")
        logger.info("Database cleared")

    def upsert_structured_project(
        self,
        record: ProjectRecord
    ) -> Dict[str, Any]:
        """Upsert structured nodes/relationships for a single project record.

        This function is the reliability backbone for:
        - Budget allocation & location questions
        - Timeline comparison questions
        - Challenges questions (derived from reason/details/schedule heuristics)

        Runs three phases in one session: a base upsert of the Project plus
        its Budget/Location/Report satellites, then per-item writes for
        challenges and milestones keyed by positional index.

        Args:
            record: Parsed ProjectRecord.

        Returns:
            Dictionary with {"projectId": ..., "name": ...}.
        """
        project_name = record.project_name or record.source
        logger.debug(f"Upserting project: {project_name}")

        # Deterministic keys so re-ingesting the same report is idempotent.
        project_key = record.get_unique_key()
        loc_key = f"{project_key}::loc"
        bud_key = f"{project_key}::tiv"
        rep_key = f"{project_key}::report::{record.last_update or ''}"

        # Parse geographic components
        geo = self._parser.parse_city_state_country(record.city_state_line)

        # Derive challenges and milestones
        challenges = self._parser.derive_challenges(record)
        milestones = self._parser.extract_milestones(record.schedule_text)
        milestone_dicts = [m.to_dict() for m in milestones]

        logger.substep(f"Extracted {len(challenges)} challenges, {len(milestones)} milestones")
        if milestones:
            for ms in milestones:
                logger.substep(f"  Milestone: {ms.name} -> {ms.date_text}")
        else:
            logger.warning(f"No milestones extracted from schedule_text: {record.schedule_text[:100] if record.schedule_text else 'None'}...")

        params = {
            # Identification
            "source": record.source,
            "project_id": record.project_id or record.project_name or record.source,
            "project_name": record.project_name or record.source,
            # Classification
            "industry_code": record.industry_code,
            "project_type": record.project_type,
            "sector": record.sector,
            "sic_code": record.sic_code,
            # Financial
            "bud_key": bud_key,
            "tiv_amount": record.tiv_amount,
            "tiv_currency": record.tiv_currency,
            # Status
            "status": record.status,
            "status_reason": record.status_reason,
            "project_probability": record.project_probability,
            # Timeline
            "last_update": record.last_update,
            "initial_release": record.initial_release,
            "pec_timing": record.pec_timing,
            "pec_activity": record.pec_activity,
            # Location
            "loc_key": loc_key,
            "address": record.address,
            "city": geo.city,
            "state": geo.state,
            "postal": geo.postal,
            "country": geo.country,
            "zone_county": record.zone_county,
            "phone": record.phone,
            # Plant Info
            "plant_owner": record.plant_owner,
            "plant_parent": record.plant_parent,
            "plant_name": record.plant_name,
            "plant_id": record.plant_id,
            "unit_name": record.unit_name,
            # Contacts
            "project_manager": record.project_manager,
            "project_manager_company": record.project_manager_company,
            "project_manager_email": record.project_manager_email,
            "engineer_company": record.engineer_company,
            "ec_firm": record.ec_firm,
            # Technical
            "scope_text": record.scope_text,
            "project_capacity": record.project_capacity,
            "environmental": record.environmental,
            "construction_labor": record.construction_labor,
            "fuel_type": record.fuel_type,
            # Report
            "rep_key": rep_key,
            # Derived
            "challenges": challenges,
            "milestones": milestone_dicts,
        }

        with self.driver.session(database=self.database) as session:
            # Step 1: Upsert base project with all fields
            base_query = """
            MERGE (p:Project {projectId: $project_id})
              ON CREATE SET p.name = $project_name
              ON MATCH SET p.name = coalesce(p.name, $project_name)
            SET p.source = $source,
                // Classification
                p.industryCode = $industry_code,
                p.projectType = $project_type,
                p.sector = $sector,
                p.sicCode = $sic_code,
                // Status
                p.status = $status,
                p.statusReason = $status_reason,
                p.projectProbability = $project_probability,
                // Timeline
                p.lastUpdate = $last_update,
                p.initialRelease = $initial_release,
                p.pecTiming = $pec_timing,
                p.pecActivity = $pec_activity,
                // Plant Info
                p.plantOwner = $plant_owner,
                p.plantParent = $plant_parent,
                p.plantName = $plant_name,
                p.plantId = $plant_id,
                p.unitName = $unit_name,
                p.phone = $phone,
                // Contacts
                p.projectManager = $project_manager,
                p.projectManagerCompany = $project_manager_company,
                p.projectManagerEmail = $project_manager_email,
                p.engineerCompany = $engineer_company,
                p.ecFirm = $ec_firm,
                // Technical
                p.scopeText = $scope_text,
                p.projectCapacity = $project_capacity,
                p.environmental = $environmental,
                p.constructionLabor = $construction_labor,
                p.fuelType = $fuel_type

            WITH p
            MERGE (b:Budget {key: $bud_key})
            SET b.amount = $tiv_amount, b.currency = $tiv_currency, b.kind = 'TIV', b.source = $source
            MERGE (p)-[:HAS_BUDGET]->(b)

            WITH p
            MERGE (l:Location {key: $loc_key})
            SET l.address = $address, l.city = $city, l.state = $state,
                l.postal = $postal, l.country = $country, l.zoneCounty = $zone_county, l.source = $source
            MERGE (p)-[:LOCATED_IN]->(l)

            WITH p
            MERGE (r:Report {key: $rep_key})
            SET r.source = $source, r.lastUpdate = $last_update, r.initialRelease = $initial_release
            MERGE (p)-[:HAS_REPORT]->(r)

            RETURN p.projectId AS projectId, p.name AS name
            """
            logger.substep("Executing base project upsert")
            row = session.run(base_query, params).single()

            if row is None:
                # Should not happen for a MERGE, but degrade gracefully.
                logger.warning("Base project upsert returned no result")
                return {"projectId": params["project_id"], "name": params["project_name"]}

            project_id = row["projectId"]
            project_name = row["name"]
            logger.substep(f"Project created: {project_name}")

            # Step 2: Add challenges (separate query per item; keys are positional,
            # so stale items from a previous, longer ingest are not removed).
            if challenges:
                for i, ch in enumerate(challenges):
                    ch_query = """
                    MATCH (p:Project {projectId: $project_id})
                    MERGE (c:Challenge {key: $ch_key})
                    SET c.text = $ch_text, c.source = $source
                    MERGE (p)-[:HAS_CHALLENGE]->(c)
                    """
                    session.run(ch_query, {
                        "project_id": project_id,
                        "ch_key": f"{project_id}::ch::{i}",
                        "ch_text": ch,
                        "source": record.source
                    })
                logger.substep(f"Added {len(challenges)} challenges")

            # Step 3: Add milestones (separate query per item, same keying scheme)
            if milestone_dicts:
                for i, ms in enumerate(milestone_dicts):
                    ms_query = """
                    MATCH (p:Project {projectId: $project_id})
                    MERGE (m:Milestone {key: $ms_key})
                    SET m.name = $ms_name, m.dateText = $ms_date, m.sentence = $ms_sentence, m.source = $source
                    MERGE (p)-[:HAS_MILESTONE]->(m)
                    """
                    session.run(ms_query, {
                        "project_id": project_id,
                        "ms_key": f"{project_id}::ms::{i}",
                        "ms_name": ms.get("name", ""),
                        "ms_date": ms.get("dateText", ""),
                        "ms_sentence": ms.get("sentence", ""),
                        "source": record.source
                    })
                logger.substep(f"Added {len(milestone_dicts)} milestones")

        return {"projectId": project_id, "name": project_name}

    def query(self, cypher: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Execute a Cypher query and return results.

        Args:
            cypher: Cypher query string.
            params: Optional query parameters.

        Returns:
            List of result dictionaries.
        """
        return self.graph.query(cypher, params or {})

    def __enter__(self) -> "Neo4jService":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit with driver cleanup."""
        self.close()
|
src/services/reranker.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Cross-encoder reranker for document retrieval."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
from langchain.schema import Document
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
# Lazy import to avoid loading model at import time
|
| 13 |
+
_cross_encoder = None
|
| 14 |
+
_cross_encoder_model_name = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _get_cross_encoder(model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
|
| 18 |
+
"""Lazy load the cross-encoder model.
|
| 19 |
+
|
| 20 |
+
Args:
|
| 21 |
+
model_name: HuggingFace model identifier
|
| 22 |
+
|
| 23 |
+
Returns:
|
| 24 |
+
CrossEncoder instance
|
| 25 |
+
"""
|
| 26 |
+
global _cross_encoder, _cross_encoder_model_name
|
| 27 |
+
|
| 28 |
+
if _cross_encoder is None or _cross_encoder_model_name != model_name:
|
| 29 |
+
try:
|
| 30 |
+
from sentence_transformers import CrossEncoder
|
| 31 |
+
logger.info(f"Loading cross-encoder model: {model_name}")
|
| 32 |
+
_cross_encoder = CrossEncoder(model_name, max_length=512)
|
| 33 |
+
_cross_encoder_model_name = model_name
|
| 34 |
+
except ImportError:
|
| 35 |
+
logger.warning(
|
| 36 |
+
"sentence-transformers not installed. "
|
| 37 |
+
"Run: pip install sentence-transformers"
|
| 38 |
+
)
|
| 39 |
+
return None
|
| 40 |
+
except Exception as e:
|
| 41 |
+
logger.warning(f"Failed to load cross-encoder: {e}")
|
| 42 |
+
return None
|
| 43 |
+
|
| 44 |
+
return _cross_encoder
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class FastCrossEncoderReranker:
    """Cross-encoder reranker using sentence-transformers.

    Runs locally and is faster than LLM-based reranking.
    """

    # Friendly aliases mapped to ms-marco cross-encoder checkpoints.
    MODEL_OPTIONS = {
        "fast": "cross-encoder/ms-marco-MiniLM-L-6-v2",
        "balanced": "cross-encoder/ms-marco-MiniLM-L-12-v2",
        "tiny": "cross-encoder/ms-marco-TinyBERT-L-2-v2",
    }

    def __init__(
        self,
        model_name: str = "fast",
        max_length: int = 512,
        batch_size: int = 16,
    ) -> None:
        """Initialize cross-encoder reranker.

        Args:
            model_name: One of "fast", "balanced", "tiny", or a HuggingFace model ID
            max_length: Maximum sequence length for encoding
            batch_size: Batch size for scoring (higher = faster but more memory)
        """
        # Aliases resolve through MODEL_OPTIONS; unknown names pass through
        # unchanged so arbitrary HuggingFace IDs still work.
        self.model_name = self.MODEL_OPTIONS.get(model_name, model_name)
        self.max_length = max_length
        self.batch_size = batch_size
        self._model = None

    def _ensure_model(self) -> bool:
        """Lazily load the cross-encoder model.

        Returns:
            True if model is available, False otherwise
        """
        if self._model is None:
            self._model = _get_cross_encoder(self.model_name)
        return self._model is not None

    def _score_pairs(self, query: str, documents: List[Document]):
        """Score (query, document-text) pairs with the loaded model.

        Document text is truncated to ``max_length`` characters before
        encoding; scoring is batched for efficiency.
        """
        pairs = [
            (query, self._get_text(candidate)[:self.max_length])
            for candidate in documents
        ]
        return self._model.predict(
            pairs,
            batch_size=self.batch_size,
            show_progress_bar=False,
        )

    def rerank(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Document]:
        """Rerank documents by relevance to query.

        Args:
            query: User query
            documents: Documents to rerank
            top_k: Number of top documents to return

        Returns:
            Reranked documents (most relevant first)
        """
        if not documents:
            return []

        if len(documents) <= 1:
            return documents

        # Fall back to the incoming order when the model cannot be loaded.
        if not self._ensure_model():
            logger.warning("Cross-encoder not available, returning original order")
            return documents[:top_k]

        try:
            relevance = self._score_pairs(query, documents)
            ranked = sorted(
                zip(documents, relevance),
                key=lambda pair: pair[1],
                reverse=True,
            )
            return [candidate for candidate, _ in ranked[:top_k]]
        except Exception as e:
            logger.warning(f"Reranking failed: {e}, returning original order")
            return documents[:top_k]

    def rerank_with_scores(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Tuple[Document, float]]:
        """Rerank documents and return with scores.

        Args:
            query: User query
            documents: Documents to rerank
            top_k: Number of top documents to return

        Returns:
            List of (document, score) tuples, sorted by score descending
        """
        if not documents:
            return []

        if len(documents) <= 1:
            return [(doc, 1.0) for doc in documents]

        # Without a model, fabricate gently decreasing scores from rank order.
        if not self._ensure_model():
            return [(doc, 1.0 - i * 0.1) for i, doc in enumerate(documents[:top_k])]

        try:
            relevance = self._score_pairs(query, documents)
            ranked = sorted(
                zip(documents, relevance),
                key=lambda pair: pair[1],
                reverse=True,
            )
            return ranked[:top_k]
        except Exception as e:
            logger.warning(f"Reranking failed: {e}")
            return [(doc, 1.0 - i * 0.1) for i, doc in enumerate(documents[:top_k])]

    def _get_text(self, doc: Document) -> str:
        """Extract text content from document.

        Args:
            doc: LangChain Document

        Returns:
            Text content (``page_content`` when present, else ``str(doc)``)
        """
        return doc.page_content if hasattr(doc, 'page_content') else str(doc)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
class NoOpReranker:
    """No-op reranker that returns documents in original order.

    Use this as a fallback when cross-encoder is not available.
    """

    def rerank(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Document]:
        """Return the first ``top_k`` documents without reranking."""
        return documents[:top_k]

    def rerank_with_scores(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Tuple[Document, float]]:
        """Pair documents with monotonically decreasing dummy scores."""
        scored = []
        for position, doc in enumerate(documents[:top_k]):
            scored.append((doc, 1.0 - position * 0.05))
        return scored
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def get_reranker(
    model_name: str = "fast",
    fallback_to_noop: bool = True,
) -> "FastCrossEncoderReranker | NoOpReranker":
    """Factory function to get a reranker instance.

    Builds a cross-encoder reranker and eagerly loads its model so that
    failures surface here rather than during the first query.

    Args:
        model_name: Model name or alias ("fast", "balanced", "tiny", or a
            HuggingFace model ID)
        fallback_to_noop: If True, return NoOpReranker when cross-encoder fails

    Returns:
        FastCrossEncoderReranker when the model loads successfully, otherwise
        NoOpReranker (when ``fallback_to_noop`` is True).
        NOTE: the original annotation claimed only FastCrossEncoderReranker;
        it is widened here to match the actual fallback behavior.

    Raises:
        RuntimeError: If the cross-encoder is unavailable and fallback is
            disabled.
    """
    try:
        reranker = FastCrossEncoderReranker(model_name)
        # Test model loading up front; False means sentence-transformers is
        # missing or the checkpoint failed to load.
        if reranker._ensure_model():
            return reranker
    except Exception as e:
        logger.warning(f"Failed to create cross-encoder reranker: {e}")

    if fallback_to_noop:
        logger.info("Using no-op reranker as fallback")
        return NoOpReranker()

    raise RuntimeError("Cross-encoder reranker not available")
|
src/services/retriever.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Optimized retriever with pattern-based expansion and cross-encoder reranking."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import hashlib
|
| 6 |
+
import re
|
| 7 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from langchain.schema import Document
|
| 10 |
+
|
| 11 |
+
from src.config import get_logger, log_step
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class OptimizedRetriever:
    """Fast retriever without LLM calls for expansion/reranking.

    Uses pattern-based query expansion and cross-encoder reranking
    instead of LLM calls for faster retrieval.
    """

    # Trigger keyword -> synonym list for cheap, LLM-free query expansion.
    # Only the first two synonyms per keyword are ever used (see
    # _expand_query_fast).
    EXPANSION_PATTERNS = {
        "budget": ["cost", "investment", "TIV", "capex", "funding", "allocation", "financial"],
        "location": ["site", "address", "city", "country", "region", "plant", "facility"],
        "timeline": ["schedule", "milestone", "deadline", "completion", "duration", "phase"],
        "challenge": ["risk", "issue", "constraint", "problem", "delay", "obstacle", "barrier"],
        "project": ["plant", "facility", "refinery", "station", "development"],
        "status": ["progress", "state", "condition", "update"],
    }

    def __init__(
        self,
        vector_store: Any,
        reranker: Optional[Any] = None,
        k_initial: int = 12,
        k_final: int = 6,
        use_expansion: bool = True,
        use_reranking: bool = True,
        use_cache: bool = True,
    ) -> None:
        """Initialize the retriever.

        Args:
            vector_store: Store exposing ``similarity_search_with_score`` or
                ``similarity_search`` (LangChain-style interface).
            reranker: Optional pre-built reranker; lazily loaded when None.
            k_initial: Candidates fetched per query variant (and fusion cap).
            k_final: Number of documents returned to the caller.
            use_expansion: Enable pattern-based query expansion.
            use_reranking: Enable cross-encoder reranking of fused results.
            use_cache: Cache final results keyed by the normalized query.
        """
        self.vector_store = vector_store
        self.k_initial = k_initial
        self.k_final = k_final
        self.use_expansion = use_expansion
        self.use_reranking = use_reranking
        self.use_cache = use_cache
        # In-memory, unbounded cache: normalized-query hash -> final docs.
        self._cache: Dict[str, List[Document]] = {}
        self._reranker = reranker
        # True once a load attempt was made (success OR failure), so we never
        # retry loading on every call.
        self._reranker_loaded = reranker is not None

    def _get_reranker(self) -> Optional[Any]:
        """Lazily load the cross-encoder reranker; return None if unavailable."""
        if self._reranker_loaded:
            return self._reranker

        try:
            from src.services.reranker import get_reranker
            self._reranker = get_reranker("fast")
            self._reranker_loaded = True
            logger.info("Loaded cross-encoder reranker")
        except Exception as e:
            logger.warning(f"Could not load reranker: {e}")
            self._reranker = None
            self._reranker_loaded = True

        return self._reranker

    def _cache_key(self, query: str) -> str:
        """Build a cache key from the lowercased, stripped query text."""
        return hashlib.md5(query.lower().strip().encode()).hexdigest()

    def _expand_query_fast(self, query: str) -> List[str]:
        """Generate up to two keyword-substituted variants of ``query``.

        Returns the original query first, followed by variants where a
        matched trigger keyword is replaced by a synonym; at most 3 queries
        total.
        """
        queries = [query]
        query_lower = query.lower()

        for keyword, expansions in self.EXPANSION_PATTERNS.items():
            if keyword in query_lower:
                for exp in expansions[:2]:
                    if exp.lower() not in query_lower:
                        variation = re.sub(
                            rf'\b{keyword}\b',
                            exp,
                            query,
                            flags=re.IGNORECASE
                        )
                        if variation != query and variation not in queries:
                            queries.append(variation)
                # NOTE(review): exits after the first matching keyword; the
                # original indentation was ambiguous in the source dump —
                # confirm the intended break placement.
                break

        return queries[:3]

    def _reciprocal_rank_fusion(
        self,
        result_lists: List[List[Tuple[Document, float]]],
        k: int = 60,
    ) -> List[Document]:
        """Merge ranked result lists with reciprocal rank fusion (RRF).

        Each document contributes ``1 / (k + rank + 1)`` per list it appears
        in. Documents are deduplicated by an md5 of their first 200 content
        characters; retrieval scores from the lists are ignored.
        """
        doc_scores: Dict[str, Dict[str, Any]] = {}

        for results in result_lists:
            for rank, (doc, _) in enumerate(results):
                doc_id = hashlib.md5(doc.page_content[:200].encode()).hexdigest()

                if doc_id not in doc_scores:
                    doc_scores[doc_id] = {"doc": doc, "score": 0}

                doc_scores[doc_id]["score"] += 1.0 / (k + rank + 1)

        sorted_items = sorted(
            doc_scores.values(),
            key=lambda x: x["score"],
            reverse=True,
        )

        return [item["doc"] for item in sorted_items]

    def retrieve(self, question: str) -> List[Document]:
        """Run cache -> expansion -> search -> fusion -> rerank.

        Returns at most ``k_final`` documents; an empty list when every
        per-query search fails.
        """
        with log_step(logger, "Optimized retrieval"):
            # Serve from cache when this normalized query was seen before.
            if self.use_cache:
                cache_key = self._cache_key(question)
                if cache_key in self._cache:
                    logger.info("Cache hit - returning cached results")
                    return self._cache[cache_key]

            if self.use_expansion:
                queries = self._expand_query_fast(question)
                logger.substep(f"Expanded to {len(queries)} queries")
            else:
                queries = [question]

            all_results: List[List[Tuple[Document, float]]] = []

            for i, query in enumerate(queries):
                try:
                    if hasattr(self.vector_store, 'similarity_search_with_score'):
                        results = self.vector_store.similarity_search_with_score(
                            query, k=self.k_initial
                        )
                    else:
                        # Store lacks scored search: fabricate descending
                        # pseudo-scores from rank order so fusion still works.
                        docs = self.vector_store.similarity_search(
                            query, k=self.k_initial
                        )
                        results = [(doc, 1.0 - j * 0.01) for j, doc in enumerate(docs)]

                    all_results.append(results)
                except Exception as e:
                    # Best-effort: one failed variant does not abort retrieval.
                    logger.warning(f"Query {i+1} failed: {e}")

            if not all_results:
                logger.warning("No results from any query")
                return []

            # Fusion only pays off with multiple ranked lists.
            if len(all_results) > 1:
                fused_docs = self._reciprocal_rank_fusion(all_results)
            else:
                fused_docs = [doc for doc, _ in all_results[0]]

            fused_docs = fused_docs[:self.k_initial]
            logger.substep(f"Fused to {len(fused_docs)} documents")

            # Rerank only when there is something to cut down.
            if self.use_reranking and len(fused_docs) > self.k_final:
                reranker = self._get_reranker()
                if reranker:
                    with log_step(logger, "Cross-encoder reranking"):
                        fused_docs = reranker.rerank(question, fused_docs, self.k_final)

            final_docs = fused_docs[:self.k_final]

            if self.use_cache:
                self._cache[cache_key] = final_docs

            logger.info(f"Returning {len(final_docs)} documents")
            return final_docs

    def clear_cache(self) -> None:
        """Drop all cached retrieval results."""
        self._cache.clear()

    def get_cache_stats(self) -> Dict[str, int]:
        """Return the number of distinct cached queries."""
        return {"cached_queries": len(self._cache)}
|
src/ui/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""User interface components."""
|
| 2 |
+
|
| 3 |
+
from src.ui.gradio_app import GradioApp
|
| 4 |
+
|
| 5 |
+
__all__ = ["GradioApp"]
|
src/ui/gradio_app.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio web interface for Project Intelligence Hub."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, List
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
|
| 9 |
+
from src.config.settings import Settings, Neo4jConfig, TogetherAIConfig
|
| 10 |
+
from src.models.state import AppState
|
| 11 |
+
from src.services.builder import GraphRAGBuilder
|
| 12 |
+
from src.services.answerer import QueryAnswerer
|
| 13 |
+
from src.services.neo4j_service import Neo4jService, Neo4jConnectionError
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GradioApp:
    """Gradio controller for ingestion and query-time interactions."""

    TITLE = "Project Intelligence Hub"
    DESCRIPTION = """
# Project Intelligence Hub

Transform unstructured PDF reports into a queryable knowledge graph.

1. **Ingest** — Upload documents to extract entities and relationships
2. **Index** — Build vector embeddings and graph structure
3. **Query** — Retrieve answers via hybrid graph + semantic search
"""

    # Read-only Cypher used by the Graph Explorer accordion.
    GRAPH_EXPLORER_QUERIES = {
        "node_labels": """
            CALL db.labels() YIELD label
            CALL { WITH label MATCH (n) WHERE label IN labels(n) RETURN count(n) AS cnt }
            RETURN label, cnt ORDER BY cnt DESC
        """,
        "relationship_types": """
            CALL db.relationshipTypes() YIELD relationshipType
            CALL { WITH relationshipType MATCH ()-[r]->() WHERE type(r) = relationshipType RETURN count(r) AS cnt }
            RETURN relationshipType, cnt ORDER BY cnt DESC
        """,
        "sample_projects": """
            MATCH (p:Project)
            OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
            OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
            RETURN p.name AS project, b.amount AS budget, b.currency AS currency,
                   l.city AS city, l.country AS country
            LIMIT 10
        """,
    }

    def __init__(self, settings: Settings | None = None) -> None:
        """Load settings (from env when not supplied) and warn on missing config."""
        self.settings = settings or Settings.from_env()
        self.answerer = QueryAnswerer()
        self._validate_settings()

    def _validate_settings(self) -> None:
        """Print warnings (does not raise) for missing credentials."""
        issues = []
        if not self.settings.together_ai.api_key:
            issues.append("TOGETHER_API_KEY not set in .env")
        if not self.settings.neo4j.uri:
            issues.append("NEO4J_URI not set in .env")
        if not self.settings.neo4j.password:
            issues.append("NEO4J_PASSWORD not set in .env")

        if issues:
            print("Configuration warnings:")
            for issue in issues:
                print(f" - {issue}")

    def _ingest_action(self, pdf_files: List[Any], clear_db: str):
        """Run the ingestion pipeline as a generator.

        Yields (status_markdown, progress_update, app_state) tuples so Gradio
        can stream progress; app_state is None until a run completes.
        """
        clear_db_bool = clear_db == "Yes"

        if not pdf_files:
            yield "No documents provided. Upload at least one PDF.", gr.update(value=0, visible=True), None
            return

        if not self.settings.together_ai.api_key:
            yield "Missing API credentials: TOGETHER_API_KEY", gr.update(value=0, visible=True), None
            return

        if not self.settings.neo4j.uri or not self.settings.neo4j.password:
            yield "Missing database credentials: NEO4J_URI or NEO4J_PASSWORD", gr.update(value=0, visible=True), None
            return

        together_config = TogetherAIConfig(
            api_key=self.settings.together_ai.api_key,
            chat_model=self.settings.together_ai.chat_model,
            embedding_model=self.settings.together_ai.embedding_model,
        )

        neo4j_config = Neo4jConfig(
            uri=self.settings.neo4j.uri,
            username=self.settings.neo4j.username,
            password=self.settings.neo4j.password,
            database=self.settings.neo4j.database,
        )

        try:
            builder = GraphRAGBuilder(together_config=together_config)

            # Track the last non-None state so the final yield carries it.
            final_state = None
            for status, progress, state in builder.ingest_with_progress(
                pdf_files=pdf_files,
                neo4j_config=neo4j_config,
                clear_db=clear_db_bool,
                skip_llm_extraction=True,
            ):
                yield status, gr.update(value=progress, visible=True), state
                if state is not None:
                    final_state = state

            if final_state:
                yield "Pipeline complete. Ready for queries.", gr.update(value=1.0, visible=False), final_state

        except ValueError as e:
            yield f"Configuration error: {e}", gr.update(value=0, visible=True), None
        except Exception as e:
            import traceback
            traceback.print_exc()
            yield f"Pipeline failed: {e}", gr.update(value=0, visible=True), None

    def _clear_action(self) -> str:
        """Delete all graph data; return a status message for the UI."""
        if not self.settings.neo4j.uri or not self.settings.neo4j.password:
            return "Database credentials not configured."

        try:
            with Neo4jService(
                uri=self.settings.neo4j.uri,
                user=self.settings.neo4j.username,
                password=self.settings.neo4j.password,
                database=self.settings.neo4j.database,
            ) as neo4j:
                neo4j.clear()
                return "Graph database cleared. All nodes and relationships removed."
        except Neo4jConnectionError as e:
            return f"Connection error: {e}"
        except Exception as e:
            return f"Operation failed: {e}"

    def _ask_action(self, question: str, state: AppState) -> str:
        """Delegate a user question to the QueryAnswerer."""
        return self.answerer.answer(question, state)

    def _explore_graph_action(self) -> str:
        """Build a markdown summary of node/relationship counts and sample projects.

        Each sub-query is wrapped in its own try/except so partial results
        still render when one query fails.
        """
        if not self.settings.neo4j.uri or not self.settings.neo4j.password:
            return "Database credentials not configured."

        try:
            with Neo4jService(
                uri=self.settings.neo4j.uri,
                user=self.settings.neo4j.username,
                password=self.settings.neo4j.password,
                database=self.settings.neo4j.database,
            ) as neo4j:
                output = []

                # Node counts by label
                output.append("### Node Distribution\n")
                output.append("| Label | Count |")
                output.append("|-------|-------|")
                try:
                    results = neo4j.query(self.GRAPH_EXPLORER_QUERIES["node_labels"])
                    for row in results:
                        output.append(f"| {row['label']} | {row['cnt']:,} |")
                except Exception:
                    output.append("| (unable to fetch) | - |")

                # Relationship counts
                output.append("\n### Relationship Distribution\n")
                output.append("| Type | Count |")
                output.append("|------|-------|")
                try:
                    results = neo4j.query(self.GRAPH_EXPLORER_QUERIES["relationship_types"])
                    for row in results:
                        output.append(f"| {row['relationshipType']} | {row['cnt']:,} |")
                except Exception:
                    output.append("| (unable to fetch) | - |")

                # Sample projects
                output.append("\n### Sample Projects\n")
                output.append("| Project | Budget | Location |")
                output.append("|---------|--------|----------|")
                try:
                    results = neo4j.query(self.GRAPH_EXPLORER_QUERIES["sample_projects"])
                    if not results:
                        output.append("| (no projects found) | - | - |")
                    for row in results:
                        name = row.get('project') or '-'
                        budget = f"{row.get('budget') or '-'} {row.get('currency') or ''}".strip()
                        location = f"{row.get('city') or ''}, {row.get('country') or ''}".strip(", ")
                        output.append(f"| {name} | {budget} | {location or '-'} |")
                except Exception:
                    output.append("| (unable to fetch) | - | - |")

                return "\n".join(output)

        except Neo4jConnectionError as e:
            return f"Connection error: {e}"
        except Exception as e:
            return f"Failed to fetch graph data: {e}"

    def build(self) -> gr.Blocks:
        """Assemble the Blocks layout and wire event handlers; return the demo."""
        with gr.Blocks(title=self.TITLE) as demo:
            gr.Markdown(self.DESCRIPTION)

            # Holds the AppState produced by ingestion; None until a run completes.
            state = gr.State(value=None)

            with gr.Group():
                pdfs = gr.File(
                    label="Document Source",
                    file_types=[".pdf"],
                    file_count="multiple",
                )

                with gr.Row():
                    clear_toggle = gr.Radio(
                        label="Reset graph before ingestion",
                        choices=["Yes", "No"],
                        value="Yes",
                        scale=1,
                    )

                with gr.Row():
                    ingest_btn = gr.Button("Run Ingestion Pipeline", variant="primary", scale=2)
                    clear_btn = gr.Button("Reset Graph", variant="secondary", scale=1)

                # A non-interactive slider doubles as a progress bar; hidden
                # until ingestion starts streaming updates.
                progress_bar = gr.Slider(
                    label="Progress",
                    minimum=0,
                    maximum=1,
                    value=0,
                    interactive=False,
                    visible=False,
                )

                ingest_status = gr.Markdown()

            gr.Markdown("---")

            with gr.Group():
                gr.Markdown("### Query Interface")
                question = gr.Textbox(
                    label="Natural Language Query",
                    placeholder="e.g., Compare budget allocations and milestone timelines across projects",
                    lines=2,
                )
                ask_btn = gr.Button("Execute Query", variant="primary")
                answer = gr.Markdown(label="Response")

            with gr.Accordion("Graph Explorer", open=False):
                gr.Markdown("View database contents without direct access to credentials.")
                explore_btn = gr.Button("Load Graph Statistics", variant="secondary")
                graph_stats = gr.Markdown()

            with gr.Accordion("System Configuration", open=False):
                gr.Markdown(self._get_config_status())

            ingest_btn.click(
                fn=self._ingest_action,
                inputs=[pdfs, clear_toggle],
                outputs=[ingest_status, progress_bar, state],
            )

            clear_btn.click(
                fn=self._clear_action,
                inputs=[],
                outputs=[ingest_status],
            )

            ask_btn.click(
                fn=self._ask_action,
                inputs=[question, state],
                outputs=[answer],
            )

            explore_btn.click(
                fn=self._explore_graph_action,
                inputs=[],
                outputs=[graph_stats],
            )

        return demo

    def _get_config_status(self) -> str:
        """Render a markdown table of credential availability (no secrets shown)."""
        def status(value: str) -> str:
            return "Connected" if value else "Not configured"

        return f"""
| Component | Status |
|-----------|--------|
| LLM Provider (Together AI) | {status(self.settings.together_ai.api_key)} |
| Graph Database (Neo4j) | {status(self.settings.neo4j.uri)} |
"""

    def launch(self, **kwargs) -> None:
        """Build the UI and start the Gradio server.

        ``server_name``/``server_port`` kwargs override the configured
        host/port; remaining kwargs are forwarded to ``demo.launch``.
        """
        demo = self.build()
        demo.launch(
            server_name=kwargs.get("server_name", self.settings.app.host),
            server_port=kwargs.get("server_port", self.settings.app.port),
            # NOTE(review): 'theme' is normally a gr.Blocks(...) argument, not
            # launch() — confirm the installed gradio version accepts it here.
            theme=gr.themes.Soft(),
            **{k: v for k, v in kwargs.items() if k not in ("server_name", "server_port")},
        )
|