adi-123 committed on
Commit
8c35759
·
verified ·
1 Parent(s): d955ccf

Upload 21 files

Browse files
src/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GraphRAG Solution Package.
3
+
4
+ A modular GraphRAG (Graph Retrieval-Augmented Generation) application
5
+ for analyzing industrial project-report PDFs using Neo4j and Together AI.
6
+ """
7
+
8
+ __version__ = "1.0.0"
9
+ __author__ = "GraphRAG Team"
src/config/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration module for GraphRAG application."""
2
+
3
+ from src.config.schema import SchemaPolicy
4
+ from src.config.settings import Settings
5
+ from src.config.logging_config import (
6
+ configure_logging,
7
+ get_logger,
8
+ get_flow_logger,
9
+ trace_step,
10
+ trace_flow,
11
+ trace_context,
12
+ log_step,
13
+ TraceContext,
14
+ )
15
+
16
+ __all__ = [
17
+ "SchemaPolicy",
18
+ "Settings",
19
+ # Logging
20
+ "configure_logging",
21
+ "get_logger",
22
+ "get_flow_logger",
23
+ "trace_step",
24
+ "trace_flow",
25
+ "trace_context",
26
+ "log_step",
27
+ "TraceContext",
28
+ ]
src/config/logging_config.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Centralized logging configuration with flow tracing support."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import functools
6
+ import logging
7
+ import sys
8
+ import threading
9
+ import time
10
+ import uuid
11
+ from contextlib import contextmanager
12
+ from dataclasses import dataclass, field
13
+ from datetime import datetime
14
+ from typing import Any, Callable, Dict, List, Optional, TypeVar
15
+
16
+ F = TypeVar('F', bound=Callable[..., Any])
17
+
18
+
19
class GraphRAGFormatter(logging.Formatter):
    """Formatter that decorates records with trace id, step icon and duration.

    Three optional record attributes (normally supplied through ``extra``)
    are honoured: ``trace_id`` (first 8 characters shown in brackets),
    ``step_type`` (mapped to an icon via ``STEP_ICONS``) and ``duration``
    (seconds, rendered with millisecond precision after the message).

    When colors are enabled the formatter emits its own fixed
    ``time | level | logger | message`` layout with ANSI escapes; otherwise it
    delegates to ``logging.Formatter.format`` and only adds prefix/suffix.
    """

    COLORS = {
        'DEBUG': '\033[36m',
        'INFO': '\033[32m',
        'WARNING': '\033[33m',
        'ERROR': '\033[31m',
        'CRITICAL': '\033[35m',
        'RESET': '\033[0m',
        'DIM': '\033[2m',
    }

    STEP_ICONS = {
        'start': '▶',
        'end': '✓',
        'error': '✗',
        'info': '•',
        'substep': ' ↳',
    }

    def __init__(self, fmt: Optional[str] = None, datefmt: Optional[str] = None, use_colors: bool = True):
        """Create the formatter.

        Args:
            fmt: Format string forwarded to ``logging.Formatter``.
            datefmt: Date format forwarded to ``logging.Formatter``.
            use_colors: Request ANSI colors; only honoured when ``sys.stdout``
                is a TTY at construction time.
        """
        super().__init__(fmt, datefmt)
        self.use_colors = use_colors and sys.stdout.isatty()

    def format(self, record: logging.LogRecord) -> str:
        """Render ``record`` as a single log entry.

        Args:
            record: The log record to format.

        Returns:
            The formatted text, including any exception traceback or stack
            information attached to the record.
        """
        trace_id = getattr(record, 'trace_id', None)
        step_type = getattr(record, 'step_type', None)
        duration = getattr(record, 'duration', None)

        prefix_parts = []
        if trace_id:
            prefix_parts.append(f"[{trace_id[:8]}]")
        if step_type and step_type in self.STEP_ICONS:
            prefix_parts.append(self.STEP_ICONS[step_type])
        prefix = " ".join(prefix_parts) + " " if prefix_parts else ""

        suffix = f" ({duration:.3f}s)" if duration is not None else ""

        if not self.use_colors:
            return f"{prefix}{super().format(record)}{suffix}"

        level_color = self.COLORS.get(record.levelname, '')
        reset = self.COLORS['RESET']
        dim = self.COLORS['DIM']
        timestamp = datetime.fromtimestamp(record.created).strftime('%H:%M:%S.%f')[:-3]
        line = (
            f"{dim}{timestamp}{reset} | "
            f"{level_color}{record.levelname:8}{reset} | "
            f"{dim}{record.name:30}{reset} | "
            f"{prefix}{record.getMessage()}{suffix}"
        )

        # The colored layout bypasses logging.Formatter.format, so the
        # traceback / stack text has to be appended here or it is lost.
        if record.exc_info and not record.exc_text:
            record.exc_text = self.formatException(record.exc_info)
        if record.exc_text:
            line = f"{line}\n{record.exc_text}"
        if record.stack_info:
            line = f"{line}\n{self.formatStack(record.stack_info)}"
        return line
70
+
71
+
72
@dataclass
class TraceContext:
    """Accumulates the steps recorded while one flow executes."""

    # Random UUID4 string unless the caller supplies an id.
    trace_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # One dict per recorded step, in the order they were added.
    steps: List[Dict[str, Any]] = field(default_factory=list)
    # Wall-clock time (time.time()) at which the context was created.
    start_time: float = field(default_factory=time.time)
    # Number of steps recorded so far; also the index of the latest step.
    current_step: int = 0

    def add_step(self, name: str, status: str = "completed", duration: Optional[float] = None,
                 details: Optional[Dict[str, Any]] = None) -> None:
        """Append a step record and advance the step counter.

        Args:
            name: Step name.
            status: Outcome label stored with the step.
            duration: Elapsed seconds, if known.
            details: Extra data for the step; stored as ``{}`` when falsy.
        """
        self.current_step += 1
        entry: Dict[str, Any] = {
            "step": self.current_step,
            "name": name,
            "status": status,
            "duration": duration,
            "details": details if details else {},
            "timestamp": time.time(),
        }
        self.steps.append(entry)

    def get_summary(self) -> Dict[str, Any]:
        """Return the trace id, elapsed seconds, step count and the step list."""
        elapsed = time.time() - self.start_time
        summary: Dict[str, Any] = {"trace_id": self.trace_id}
        summary["total_duration"] = elapsed
        summary["step_count"] = len(self.steps)
        summary["steps"] = self.steps
        return summary
100
+
101
+
102
# Thread-local holder; the active trace lives on its ``current`` attribute.
_trace_context = threading.local()


def get_current_trace() -> Optional[TraceContext]:
    """Return the trace stored for the calling thread, or None if none is set."""
    try:
        return _trace_context.current
    except AttributeError:
        # Nothing has been stored on this thread yet.
        return None


def set_current_trace(trace: Optional[TraceContext]) -> None:
    """Store ``trace`` (or None to clear it) as the calling thread's trace."""
    setattr(_trace_context, "current", trace)
111
+
112
+
113
class FlowLogger:
    """Wrapper around a standard logger that tags records with flow metadata.

    Every record is emitted with ``step_type``, ``duration`` and ``trace_id``
    in its ``extra`` mapping; ``trace_id`` comes from ``get_current_trace()``.
    Completed and failed steps are also appended to the current trace, if any.
    """

    def __init__(self, name: str):
        """Bind to the ``logging`` logger called ``name``."""
        self.logger = logging.getLogger(name)
        self.name = name

    def _log_with_context(self, level: int, msg: str, step_type: Optional[str] = None,
                          duration: Optional[float] = None, **kwargs) -> None:
        """Log ``msg`` at ``level`` with the flow attributes attached.

        Args:
            level: Numeric logging level.
            msg: Message text.
            step_type: Step marker stored on the record (may be None).
            duration: Elapsed seconds stored on the record (may be None).
            **kwargs: Forwarded to ``Logger.log``; an ``extra`` mapping, if
                given, is merged with the flow attributes.
        """
        trace = get_current_trace()
        # Copy so the caller's dict is never mutated, and tolerate extra=None.
        extra = dict(kwargs.pop('extra', None) or {})
        extra['step_type'] = step_type
        extra['duration'] = duration
        extra['trace_id'] = trace.trace_id if trace else None
        self.logger.log(level, msg, extra=extra, **kwargs)

    def step_start(self, step_name: str, details: str = "") -> float:
        """Log the start of a step and return the start timestamp (time.time())."""
        msg = f"Starting: {step_name}" + (f" - {details}" if details else "")
        self._log_with_context(logging.INFO, msg, step_type='start')
        return time.time()

    def step_end(self, step_name: str, start_time: float, details: str = "") -> None:
        """Log the completion of a step and record it on the current trace.

        Args:
            step_name: Step name.
            start_time: Value previously returned by ``step_start``.
            details: Optional text appended to the log message.
        """
        duration = time.time() - start_time
        msg = f"Completed: {step_name}" + (f" - {details}" if details else "")
        self._log_with_context(logging.INFO, msg, step_type='end', duration=duration)
        trace = get_current_trace()
        if trace:
            trace.add_step(step_name, "completed", duration)

    def step_error(self, step_name: str, error: Exception, start_time: Optional[float] = None) -> None:
        """Log a failed step at ERROR level and record it on the current trace.

        Args:
            step_name: Step name.
            error: The exception that caused the failure.
            start_time: Value returned by ``step_start``; when None, no
                duration is reported.
        """
        # Compare against None explicitly: 0.0 is a legitimate timestamp.
        duration = time.time() - start_time if start_time is not None else None
        msg = f"Failed: {step_name} - {type(error).__name__}: {error}"
        self._log_with_context(logging.ERROR, msg, step_type='error', duration=duration)
        trace = get_current_trace()
        if trace:
            trace.add_step(step_name, "failed", duration, {"error": str(error)})

    def substep(self, msg: str) -> None:
        """Log a DEBUG message marked as a sub-step."""
        self._log_with_context(logging.DEBUG, msg, step_type='substep')

    def info(self, msg: str, **kwargs) -> None:
        """Log an INFO message marked with the 'info' step type."""
        self._log_with_context(logging.INFO, msg, step_type='info', **kwargs)

    def debug(self, msg: str, **kwargs) -> None:
        """Log a DEBUG message with no step type."""
        self._log_with_context(logging.DEBUG, msg, **kwargs)

    def warning(self, msg: str, **kwargs) -> None:
        """Log a WARNING message with no step type."""
        self._log_with_context(logging.WARNING, msg, **kwargs)

    def error(self, msg: str, **kwargs) -> None:
        """Log an ERROR message with no step type."""
        self._log_with_context(logging.ERROR, msg, **kwargs)
164
+
165
+
166
def get_flow_logger(name: str) -> FlowLogger:
    """Build a new FlowLogger bound to the logger called ``name``."""
    flow_logger = FlowLogger(name)
    return flow_logger
168
+
169
+
170
def get_logger(name: str) -> FlowLogger:
    """Build a new FlowLogger for ``name`` (same result as ``get_flow_logger``)."""
    flow_logger = FlowLogger(name)
    return flow_logger
172
+
173
+
174
def trace_step(step_name: Optional[str] = None):
    """Decorator factory that logs each call of a function as a flow step.

    Args:
        step_name: Label to log; when falsy the function's ``__name__`` is used.

    Returns:
        A decorator. The wrapped function logs a start entry, then either a
        completion entry or, for any ``Exception``, an error entry before
        re-raising.
    """
    def decorator(func: F) -> F:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            label = step_name if step_name else func.__name__
            flow_logger = get_flow_logger(func.__module__)
            started_at = flow_logger.step_start(label)
            try:
                outcome = func(*args, **kwargs)
                flow_logger.step_end(label, started_at)
            except Exception as exc:
                flow_logger.step_error(label, exc, started_at)
                raise
            return outcome
        return wrapper  # type: ignore
    return decorator
191
+
192
+
193
def trace_flow(flow_name: str):
    """Decorator factory that runs a function inside a fresh trace context.

    Each call creates a new ``TraceContext``, makes it the current trace for
    the duration of the call, and logs start / complete / failed banners.
    When the call finishes, the trace that was current beforehand is restored,
    so a flow invoked from inside another flow no longer wipes the outer trace.

    Args:
        flow_name: Human-readable flow name used in the log banners.

    Returns:
        A decorator; the wrapped function returns whatever the original
        returns and re-raises any exception after logging it.
    """
    def decorator(func: F) -> F:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            logger = get_flow_logger(func.__module__)
            # Remember the enclosing trace (None at top level) for restoration.
            previous = get_current_trace()
            trace = TraceContext()
            set_current_trace(trace)
            logger.info(f"{'='*60}")
            logger.info(f"FLOW START: {flow_name} [Trace: {trace.trace_id[:8]}]")
            logger.info(f"{'='*60}")
            start = time.time()
            try:
                result = func(*args, **kwargs)
                duration = time.time() - start
                logger.info(f"{'='*60}")
                logger.info(f"FLOW COMPLETE: {flow_name} ({duration:.3f}s)")
                logger.info(f"Steps completed: {len(trace.steps)}")
                logger.info(f"{'='*60}")
                return result
            except Exception as e:
                duration = time.time() - start
                logger.error(f"{'='*60}")
                logger.error(f"FLOW FAILED: {flow_name} ({duration:.3f}s)")
                logger.error(f"Error: {type(e).__name__}: {e}")
                logger.error(f"{'='*60}")
                raise
            finally:
                set_current_trace(previous)
        return wrapper  # type: ignore
    return decorator
224
+
225
+
226
@contextmanager
def trace_context(flow_name: str):
    """Context manager that runs its body inside a fresh trace context.

    A new ``TraceContext`` becomes the current trace while the ``with`` body
    runs and is yielded to the caller. On exit, the trace that was current
    beforehand is restored, so nesting inside another flow keeps the outer
    trace intact.

    Args:
        flow_name: Human-readable flow name used in the log banners.

    Yields:
        The newly created ``TraceContext``.

    Raises:
        Any exception raised by the body, after it has been logged.
    """
    logger = get_flow_logger(__name__)
    # Remember the enclosing trace (None at top level) for restoration.
    previous = get_current_trace()
    trace = TraceContext()
    set_current_trace(trace)
    logger.info(f"{'='*60}")
    logger.info(f"FLOW START: {flow_name} [Trace: {trace.trace_id[:8]}]")
    logger.info(f"{'='*60}")
    start = time.time()
    try:
        yield trace
        duration = time.time() - start
        logger.info(f"{'='*60}")
        logger.info(f"FLOW COMPLETE: {flow_name} ({duration:.3f}s)")
        logger.info(f"{'='*60}")
    except Exception as e:
        duration = time.time() - start
        logger.error(f"{'='*60}")
        logger.error(f"FLOW FAILED: {flow_name} ({duration:.3f}s)")
        logger.error(f"Error: {type(e).__name__}: {e}")
        logger.error(f"{'='*60}")
        raise
    finally:
        set_current_trace(previous)
251
+
252
+
253
@contextmanager
def log_step(logger: FlowLogger, step_name: str, details: str = ""):
    """Context manager that logs the ``with`` body as a single step.

    Args:
        logger: Flow logger used for the start / end / error entries.
        step_name: Step name.
        details: Optional text passed to the start entry.

    Raises:
        Any ``Exception`` from the body, after it has been logged as a failure.
    """
    began = logger.step_start(step_name, details)
    try:
        yield
        # Kept inside the try so a failure while logging completion is
        # reported through step_error, as before.
        logger.step_end(step_name, began)
    except Exception as failure:
        logger.step_error(step_name, failure, began)
        raise
263
+
264
+
265
def configure_logging(level: int = logging.INFO, use_colors: bool = True,
                      log_file: Optional[str] = None, detailed: bool = False) -> None:
    """Configure the root logger for the application.

    Replaces every handler on the root logger with a stdout handler using
    ``GraphRAGFormatter`` and, optionally, a plain-text file handler. Some
    third-party loggers are raised to WARNING to cut noise.

    Args:
        level: Level applied to the root logger and to each handler.
        use_colors: Passed to ``GraphRAGFormatter``.
        log_file: Path of a log file to write to as well; skipped when falsy.
        detailed: When False, the ``langchain`` loggers are also limited to
            WARNING.
    """
    root = logging.getLogger()
    # Close the handlers being replaced so repeated configuration does not
    # leak open log files or streams.
    for stale in list(root.handlers):
        root.removeHandler(stale)
        stale.close()
    root.setLevel(level)

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(GraphRAGFormatter(use_colors=use_colors))
    root.addHandler(console_handler)

    if log_file:
        # Explicit UTF-8 so non-ASCII message text cannot fail on platforms
        # whose default encoding is narrower.
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        file_handler.setLevel(level)
        file_handler.setFormatter(logging.Formatter(
            "%(asctime)s | %(levelname)-8s | %(name)-30s | %(message)s"
        ))
        root.addHandler(file_handler)

    for logger_name in ["httpx", "httpcore", "neo4j", "urllib3"]:
        logging.getLogger(logger_name).setLevel(logging.WARNING)

    if not detailed:
        for logger_name in ["langchain", "langchain_community"]:
            logging.getLogger(logger_name).setLevel(logging.WARNING)
src/config/schema.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Schema policy for LLM-driven graph extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List
6
+
7
+
8
class SchemaPolicy:
    """Allow-lists of node labels and relationship types for graph extraction.

    The lists are deliberately broad so that extraction can cover many kinds
    of project-report questions (stakeholders, contracts, permitting,
    schedule, finance, risks, and so on).
    """

    ALLOWED_NODES: List[str] = [
        # Document structure
        "Project", "Report", "Document", "Section", "Chunk", "Source", "Evidence",
        # Organizations
        "Organization", "Company", "Owner", "ParentCompany", "Client", "Customer",
        "Partner", "JV", "Consortium", "Contractor", "Subcontractor", "Vendor", "Supplier",
        "Consultant", "EngineeringFirm", "EPC", "EPCM", "Operator",
        "GovernmentAgency", "Regulator", "Stakeholder",
        # People
        "Person", "Role", "Team", "Department",
        # Geography
        "Location", "Address", "City", "State", "Province", "Region", "Country", "County",
        "Zone", "Port", "Site", "Plant",
        # Finance
        "Budget", "Cost", "Capex", "Opex", "Estimate", "Investment", "Funding",
        "Currency", "TIV", "Revenue", "Tariff", "Price",
        # Timeline
        "Timeline", "Schedule", "Milestone", "Phase", "Stage", "Date", "Quarter", "Year",
        "Duration", "StartDate", "EndDate",
        # Technical
        "Industry", "Sector", "Market", "Demand", "Product", "Output", "Capacity",
        "Feedstock", "Fuel", "Technology", "Process", "Equipment", "Unit", "System", "Utility",
        "Specification", "Standard",
        # Contracts
        "Contract", "Agreement", "Tender", "Bid", "RFQ", "Procurement", "Permit",
        "WorkPackage", "Deliverable", "Requirement", "KPI", "Metric",
        # Status
        "Status", "StatusReason", "Decision", "Change", "Assumption", "Dependency",
        "Risk", "Issue", "Challenge", "Constraint", "Delay", "Cancellation",
        # ESG
        "EnvironmentalAspect", "Emissions", "Wastewater", "Water", "Waste", "Safety",
        "Regulation", "Compliance",
    ]

    ALLOWED_RELATIONSHIPS: List[str] = [
        # Document structure
        "HAS_REPORT", "HAS_DOCUMENT", "HAS_SECTION", "HAS_CHUNK", "HAS_EVIDENCE",
        "EVIDENCED_BY", "SUPPORTED_BY", "MENTIONS", "ABOUT",
        # Lifecycle
        "HAS_STATUS", "HAS_STATUS_REASON", "HAS_PHASE", "HAS_STAGE",
        "HAS_TIMELINE", "HAS_SCHEDULE", "HAS_MILESTONE",
        "STARTS_AT", "ENDS_AT", "UPDATED_ON", "RELEASED_ON", "COMPLETES_AT",
        # Organizations
        "OWNED_BY", "PARENT_OF", "HAS_PARENT", "MANAGED_BY", "OPERATED_BY",
        "LED_BY", "RESPONSIBLE_FOR", "WORKS_FOR", "HAS_ROLE",
        "PARTNERED_WITH", "CONTRACTED_BY", "DESIGNED_BY", "ENGINEERED_BY",
        "CONSTRUCTED_BY", "PROCURED_BY", "SUPPLIED_BY", "REGULATED_BY",
        # Geography
        "LOCATED_IN", "HAS_ADDRESS", "IN_CITY", "IN_STATE", "IN_COUNTRY", "IN_REGION", "IN_ZONE",
        # Finance
        "HAS_BUDGET", "HAS_COST", "HAS_CAPEX", "HAS_OPEX", "HAS_TIV", "IN_CURRENCY",
        "FUNDED_BY", "ALLOCATED_TO",
        # Technical
        "IN_INDUSTRY", "IN_SECTOR", "IN_MARKET",
        "PRODUCES", "USES_FEEDSTOCK", "USES_FUEL", "USES_TECHNOLOGY", "USES_PROCESS",
        "REQUIRES_EQUIPMENT", "HAS_UNIT", "HAS_SYSTEM", "HAS_UTILITY", "HAS_CAPACITY",
        "MEETS_STANDARD",
        # Governance
        "REQUIRES_PERMIT", "HAS_REQUIREMENT", "HAS_DELIVERABLE",
        "HAS_ENVIRONMENTAL_ASPECT", "HAS_SAFETY_REQUIREMENT",
        # Risks
        "HAS_RISK", "HAS_ISSUE", "HAS_CHALLENGE", "HAS_CONSTRAINT",
        "CAUSED_BY", "RESULTED_IN", "AFFECTED_BY", "DELAYED_BY", "CANCELLED_DUE_TO",
    ]

    @classmethod
    def get_allowed_nodes(cls) -> List[str]:
        """Return a new list holding the allowed node labels."""
        # A fresh list keeps callers from mutating the class-level constant.
        return list(cls.ALLOWED_NODES)

    @classmethod
    def get_allowed_relationships(cls) -> List[str]:
        """Return a new list holding the allowed relationship types."""
        return list(cls.ALLOWED_RELATIONSHIPS)
src/config/settings.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Application settings and configuration management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ from typing import Optional
8
+
9
+ from dotenv import load_dotenv
10
+
11
+
12
@dataclass
class Neo4jConfig:
    """Connection parameters for a Neo4j database."""

    # Connection URI; empty means "not configured".
    uri: str = ""
    username: str = "neo4j"
    password: str = ""
    database: str = "neo4j"

    def is_valid(self) -> bool:
        """Return True when URI, username and password are all non-empty."""
        required = (self.uri, self.username, self.password)
        return all(required)
23
+
24
+
25
@dataclass
class TogetherAIConfig:
    """Credentials and model names for the Together AI API."""

    # API key; empty means "not configured".
    api_key: str = ""
    chat_model: str = "meta-llama/meta-llama-3.1-8b-instruct-turbo"
    embedding_model: str = "BAAI/bge-base-en-v1.5"

    def is_valid(self) -> bool:
        """Return True when an API key has been provided."""
        if self.api_key:
            return True
        return False
35
+
36
+
37
@dataclass
class AppConfig:
    """Network settings for serving the application."""

    # TCP port the application listens on.
    port: int = 7860
    # Bind address; "0.0.0.0" accepts connections on every interface.
    host: str = "0.0.0.0"
43
+
44
+
45
@dataclass
class Settings:
    """Aggregate of all configuration sections used by the application."""

    neo4j: Neo4jConfig = field(default_factory=Neo4jConfig)
    together_ai: TogetherAIConfig = field(default_factory=TogetherAIConfig)
    app: AppConfig = field(default_factory=AppConfig)

    @classmethod
    def from_env(cls, dotenv_path: Optional[str] = None) -> "Settings":
        """Build settings from environment variables.

        ``load_dotenv`` is called first so values from a dotenv file are
        visible through ``os.getenv``.

        Args:
            dotenv_path: Optional path passed to ``load_dotenv``.

        Returns:
            A populated ``Settings`` instance.

        Raises:
            ValueError: If ``PORT`` is set to a non-integer value.
        """
        load_dotenv(dotenv_path)

        neo4j = Neo4jConfig(
            uri=os.getenv("NEO4J_URI", ""),
            username=os.getenv("NEO4J_USERNAME", "neo4j"),
            password=os.getenv("NEO4J_PASSWORD", ""),
            database=os.getenv("NEO4J_DATABASE", "neo4j"),
        )

        together_ai = TogetherAIConfig(
            api_key=os.getenv("TOGETHER_API_KEY", ""),
            chat_model=os.getenv("TOGETHER_CHAT_MODEL", "meta-llama/meta-llama-3.1-8b-instruct-turbo"),
            embedding_model=os.getenv("TOGETHER_EMBED_MODEL", "BAAI/bge-base-en-v1.5"),
        )

        app = AppConfig(
            port=cls._port_from_env(),
            host=os.getenv("HOST", "0.0.0.0"),
        )

        return cls(neo4j=neo4j, together_ai=together_ai, app=app)

    @staticmethod
    def _port_from_env(default: int = 7860) -> int:
        """Read ``PORT`` from the environment, using ``default`` when unset or blank."""
        raw = os.getenv("PORT", "").strip()
        if not raw:
            # An exported-but-empty PORT should not crash startup.
            return default
        try:
            return int(raw)
        except ValueError as err:
            raise ValueError(f"PORT must be an integer, got {raw!r}") from err

    def apply_to_env(self) -> None:
        """Export the Together AI API key to ``os.environ`` when one is set."""
        if self.together_ai.api_key:
            os.environ["TOGETHER_API_KEY"] = self.together_ai.api_key
src/models/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Data models for GraphRAG application."""
2
+
3
+ from src.models.project import ProjectRecord
4
+ from src.models.state import AppState
5
+
6
+ __all__ = ["ProjectRecord", "AppState"]
src/models/project.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Project record data model for structured extraction from PDF reports."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, Optional
7
+
8
+
9
@dataclass
class ProjectRecord:
    """Canonical structured fields parsed from a single PDF project report.

    Only ``source`` is required; every other field defaults to None and is
    left out of ``to_dict()`` while it stays None.
    """

    # Identification
    source: str
    project_id: Optional[str] = None
    project_name: Optional[str] = None

    # Classification
    industry_code: Optional[str] = None
    project_type: Optional[str] = None
    sector: Optional[str] = None
    sic_code: Optional[str] = None
    sic_product: Optional[str] = None

    # Financial
    tiv_amount: Optional[float] = None
    tiv_currency: Optional[str] = None

    # Status
    status: Optional[str] = None
    status_reason: Optional[str] = None
    project_probability: Optional[str] = None

    # Timeline
    last_update: Optional[str] = None
    initial_release: Optional[str] = None
    pec_timing: Optional[str] = None
    pec_activity: Optional[str] = None

    # Location
    address: Optional[str] = None
    city_state_line: Optional[str] = None
    zone_county: Optional[str] = None

    # Plant Info
    plant_owner: Optional[str] = None
    plant_parent: Optional[str] = None
    plant_name: Optional[str] = None
    plant_id: Optional[str] = None
    unit_name: Optional[str] = None

    # Contacts
    project_manager: Optional[str] = None
    project_manager_company: Optional[str] = None
    project_manager_title: Optional[str] = None
    project_manager_email: Optional[str] = None
    project_manager_phone: Optional[str] = None
    engineer_company: Optional[str] = None
    ec_firm: Optional[str] = None
    phone: Optional[str] = None

    # Technical
    scope_text: Optional[str] = None
    project_capacity: Optional[str] = None
    environmental: Optional[str] = None
    construction_labor: Optional[str] = None
    operations_labor: Optional[str] = None
    fuel_type: Optional[str] = None

    # Derived text sections
    schedule_text: Optional[str] = None
    details_text: Optional[str] = None

    @property
    def owner_company(self) -> Optional[str]:
        """Alias for plant_owner (backward compatibility)."""
        return self.plant_owner

    def get_unique_key(self) -> str:
        """Return the first truthy value of project_id, project_name, source."""
        return self.project_id or self.project_name or self.source

    def has_budget_info(self) -> bool:
        """Return True when both the TIV amount and its currency are set."""
        return self.tiv_amount is not None and self.tiv_currency is not None

    def has_location_info(self) -> bool:
        """Return True when any of the location fields is non-empty."""
        return any([self.address, self.city_state_line, self.zone_county])

    def has_timeline_info(self) -> bool:
        """Return True when schedule text is present and non-empty."""
        return bool(self.schedule_text)

    def to_dict(self) -> Dict[str, Any]:
        """Convert record to dictionary with non-None fields only.

        Keys follow field declaration order. The mapping is derived from the
        dataclass field list, so a newly added field is included without
        further edits here.
        """
        from dataclasses import fields

        return {
            spec.name: value
            for spec in fields(self)
            if (value := getattr(self, spec.name)) is not None
        }
138
+
139
+
140
@dataclass
class Milestone:
    """A single milestone taken from a report's schedule text."""

    name: str
    # Date as written in the text (not parsed into a date object here).
    date_text: str = ""
    # Sentence associated with the milestone.
    sentence: str = ""
    source: str = ""

    def to_dict(self) -> Dict[str, str]:
        """Return name, date text and sentence; ``source`` is not included."""
        payload: Dict[str, str] = {}
        payload["name"] = self.name
        payload["dateText"] = self.date_text
        payload["sentence"] = self.sentence
        return payload
151
+
152
+
153
@dataclass
class GeoComponents:
    """City, state, postal code and country split out of a city/state line."""

    city: Optional[str] = None
    state: Optional[str] = None
    postal: Optional[str] = None
    country: Optional[str] = None

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Return all four components, keeping None for the missing ones."""
        keys = ("city", "state", "postal", "country")
        return {key: getattr(self, key) for key in keys}
src/models/state.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Application state container for runtime handles."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import TYPE_CHECKING, Any, Optional
7
+
8
+ if TYPE_CHECKING:
9
+ from src.services.neo4j_service import Neo4jService
10
+
11
+
12
@dataclass
class AppState:
    """Runtime handles required for query-time execution after ingestion."""

    # Neo4j service wrapper; the code below expects ``.graph`` and ``.close()``.
    neo4j: Optional[Any] = None
    vector: Optional[Any] = None
    qa_chain: Optional[Any] = None
    llm: Optional[Any] = None

    def is_ready(self) -> bool:
        """Return True when all four handles are set (truthy)."""
        return all([self.neo4j, self.vector, self.qa_chain, self.llm])

    def get_graph(self) -> Optional[Any]:
        """Return the Neo4j handle's ``graph`` attribute, or None without a handle."""
        return self.neo4j.graph if self.neo4j else None

    def close(self) -> None:
        """Close the Neo4j handle on a best-effort basis.

        A failure while closing is logged as a warning rather than raised, so
        shutdown can continue, but it is no longer discarded silently.
        """
        import logging

        if not self.neo4j:
            return
        try:
            self.neo4j.close()
        except Exception as exc:
            logging.getLogger(__name__).warning(
                "Failed to close Neo4j service cleanly: %s: %s", type(exc).__name__, exc
            )
src/parsers/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Parsing utilities for document extraction."""
2
+
3
+ from src.parsers.project_parser import ProjectReportParser
4
+ from src.parsers.smart_chunker import SemanticChunker, get_chunker
5
+
6
+ __all__ = ["ProjectReportParser", "SemanticChunker", "get_chunker"]
src/parsers/project_parser.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Project report parser for semi-structured PDF documents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ from src.models.project import GeoComponents, Milestone, ProjectRecord
9
+ from src.config import get_logger
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
+ class ProjectReportParser:
15
+ """Comprehensive parser for semi-structured project report PDFs."""
16
+
17
+ # Identification patterns
18
+ PATTERN_PROJECT_ID = r"Project ID:\s*([0-9]+)"
19
+ PATTERN_PROJECT_NAME = r"Project Name\s+(.+?)\s+PEC Activity Diagram"
20
+
21
+ # Classification patterns
22
+ PATTERN_INDUSTRY_CODE = r"Industry Code\s+([0-9]+\s+[A-Za-z\s&\(\)]+?)(?:\s+Project Type)"
23
+ PATTERN_PROJECT_TYPE = r"Project Type\s+([A-Za-z]+)"
24
+ PATTERN_SECTOR = r"Sector\s+([A-Za-z\s]+?)(?:\s+SIC Product|\s+Status)"
25
+ PATTERN_SIC_CODE = r"SIC Code\s+([0-9]+\s+[A-Za-z\s&,\[\]]+?)(?:\s+Sector)"
26
+ PATTERN_SIC_PRODUCT = r"SIC Product\s+([0-9\*]+\s+[A-Za-z\s,\(\)\-]+?)(?:\s+Status)"
27
+
28
+ # Financial patterns
29
+ PATTERN_TIV_USD = r"TIV \(USD\)\s*([0-9,]+)"
30
+ PATTERN_TIV_CNY = r"TIV \(CNY\)\s*([0-9,]+)"
31
+
32
+ # Status patterns
33
+ PATTERN_STATUS = r"Status\s+([A-Za-z]+)\s+Last Update"
34
+ PATTERN_STATUS_REASON = r"Status Reason\s+(.+?)\s+Environmental"
35
+ PATTERN_PROJECT_PROBABILITY = r"Project Probability\s+([A-Za-z]+\s*\([0-9\-]+%\))"
36
+
37
+ # Timeline patterns
38
+ PATTERN_LAST_UPDATE = r"Last Update\s+([0-9]{2}-[A-Za-z]{3}-[0-9]{4})"
39
+ PATTERN_INITIAL_RELEASE = r"Initial Release\s+([0-9]{2}-[A-Za-z]{3}-[0-9]{4})"
40
+ PATTERN_PEC_TIMING = r"PEC.\s*Timing\s+([A-Z][0-9])"
41
+ PATTERN_PEC_ACTIVITY = r"PEC.\s*Activity\s+([A-Za-z\s\-]+?)(?:\s+Project Probability)"
42
+
43
+ # Location patterns
44
+ PATTERN_LOCATION = r"Location\s+(.+?)\s+Phone"
45
+ PATTERN_CITY_STATE = r"City/State\s+(.+?)\s+Zone/County"
46
+ PATTERN_ZONE_COUNTY = r"Zone/County\s+(.+?)\s+Project Responsibility"
47
+ PATTERN_PHONE = r"Phone\s+(\+?[0-9\s\-]+)"
48
+
49
+ # Plant info patterns
50
+ PATTERN_PLANT_OWNER = r"Plant Owner\s+([A-Za-z\s&,\.]+?)(?:\s+Plant Parent)"
51
+ PATTERN_PLANT_PARENT = r"Plant Parent\s+([A-Za-z\s&,\.]+?)(?:\s+Plant Name|\s+Unit Name)"
52
+ PATTERN_PLANT_NAME = r"Plant Name\s+([A-Za-z\s&,\.]+?)(?:\s+Unit Name|\s+Plant ID)"
53
+ PATTERN_PLANT_ID = r"Plant ID\s+([0-9]+)"
54
+ PATTERN_UNIT_NAME = r"Unit Name\s+([A-Za-z0-9\s&]+?)(?:\s+Plant ID|\s+Location)"
55
+
56
+ # Contact patterns
57
+ PATTERN_PROJECT_MANAGER = r"Project Manager\s+([A-Za-z\s&,\.]+?)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:\d|No\.|[A-Z][a-z]+\s+(?:Road|Street|Drive|Ave|Suite|Manager))"
58
+ PATTERN_ENGINEER = r"Eng\s+([A-Za-z\s&,\.]+?)\s+(?:[A-Z][a-z]+\s+[A-Z][a-z]+|[0-9])"
59
+ PATTERN_EC_FIRM = r"E&C\s+([A-Za-z\s&,\.]+?)\s+(?:[A-Z][a-z]+\s+[A-Z][a-z]+|[0-9])"
60
+ PATTERN_EMAIL = r"\[E-Mail\]\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
61
+
62
+ # Technical patterns
63
+ PATTERN_SCOPE = r"Scope\s+(.+?)\s+Schedule\s+"
64
+ PATTERN_PROJECT_CAPACITY = r"Project Capacity\s+(?:Planned\s+)?([0-9,]+\s*(?:MW|BBL|Megawatts)[^\n]*)"
65
+ PATTERN_ENVIRONMENTAL = r"Environmental\s+(Air\s*\([A-Z]\)[^C]*?)(?:\s+Construction Labor)"
66
+ PATTERN_CONSTRUCTION_LABOR = r"Construction Labor Preference\s+([A-Za-z\-]+)"
67
+ PATTERN_OPERATIONS_LABOR = r"Operations Labor Preference\s+([A-Za-z\-]+)"
68
+ PATTERN_FUEL_TYPE = r"Project Fuel Type\s+([A-Za-z]+)"
69
+
70
+ # Schedule/details patterns
71
+ PATTERN_SCHEDULE = r"Schedule\s+(.+?)\bDetails\b"
72
+ PATTERN_SCHEDULE_FALLBACK = r"Schedule\s+(.+?)\s+Engineering\s+(?:Civil|Contracting|Electrical)"
73
+ PATTERN_DETAILS = r"Details\s+(.+?)\s+Engineering\s+(?:Civil|Contracting)"
74
+
75
+ # Milestone pattern
76
+ PATTERN_MILESTONE = (
77
+ r"(?P<name>[A-Za-z0-9\-\s&/]+?)\s+"
78
+ r"(?P<date>(?:[1-4]Q\d{2,4}|\d{4}|[A-Za-z]{3}-\d{4})(?:\s*\([^\)]*\))?)"
79
+ )
80
+
81
+ CHALLENGE_KEYWORDS = r"funding|partners|agreement|RFQ|bid|cancelled|delay|escalat"
82
+ PATTERN_GEO = r"^(?P<city>[^,]+),\s*(?P<state>[^\d]+?)\s+(?P<postal>\d+)\s+(?P<country>.+)$"
83
+
84
+ def __init__(self) -> None:
85
+ self._compiled_patterns: Dict[str, re.Pattern] = {}
86
+
87
+ def _get_pattern(self, pattern: str, flags: int = 0) -> re.Pattern:
88
+ key = f"{pattern}:{flags}"
89
+ if key not in self._compiled_patterns:
90
+ self._compiled_patterns[key] = re.compile(pattern, flags)
91
+ return self._compiled_patterns[key]
92
+
93
+ def _find_match(self, text: str, pattern: str, flags: int = 0) -> Optional[str]:
94
+ compiled = self._get_pattern(pattern, flags)
95
+ match = compiled.search(text)
96
+ return match.group(1).strip() if match else None
97
+
98
+ def _find_all_matches(self, text: str, pattern: str, flags: int = 0) -> List[str]:
99
+ compiled = self._get_pattern(pattern, flags)
100
+ return [m.group(1).strip() for m in compiled.finditer(text)]
101
+
102
+ @staticmethod
103
+ def _money_to_float(value: str) -> Optional[float]:
104
+ try:
105
+ return float(value.replace(",", ""))
106
+ except (ValueError, AttributeError):
107
+ return None
108
+
109
+ def _extract_project_manager(self, text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
110
+ """Extract project manager name, company, and email."""
111
+ pm_pattern = self._get_pattern(self.PATTERN_PROJECT_MANAGER, re.IGNORECASE)
112
+ pm_match = pm_pattern.search(text)
113
+
114
+ name, company, email = None, None, None
115
+
116
+ if pm_match:
117
+ company = pm_match.group(1).strip()
118
+ name = pm_match.group(2).strip()
119
+ pm_section = text[pm_match.start():pm_match.start() + 500]
120
+ email_match = re.search(self.PATTERN_EMAIL, pm_section)
121
+ if email_match:
122
+ email = email_match.group(1)
123
+ logger.info(f"Found Project Manager: {name} ({company})")
124
+
125
+ return name, company, email
126
+
127
def parse(self, text: str, source_name: str) -> ProjectRecord:
    """Parse a report into a ProjectRecord with comprehensive field extraction.

    All whitespace runs in ``text`` are collapsed to single spaces before
    matching, so every pattern operates on one long line.

    Args:
        text: Raw report text.
        source_name: Identifier stored on the record and used in log output.

    Returns:
        A ProjectRecord; any field whose pattern did not match is None.
    """
    normalized = re.sub(r"\s+", " ", text)
    icase = re.IGNORECASE
    icase_dotall = re.IGNORECASE | re.DOTALL

    def find(pattern: str, flags: int = 0) -> Optional[str]:
        return self._find_match(normalized, pattern, flags)

    # Identification
    project_id = find(self.PATTERN_PROJECT_ID)
    project_name = find(self.PATTERN_PROJECT_NAME, icase)

    # Classification
    industry_code = find(self.PATTERN_INDUSTRY_CODE, icase)
    project_type = find(self.PATTERN_PROJECT_TYPE, icase)
    sector = find(self.PATTERN_SECTOR, icase)
    sic_code = find(self.PATTERN_SIC_CODE, icase)
    sic_product = find(self.PATTERN_SIC_PRODUCT, icase)

    # Financial: a USD figure takes precedence over a CNY figure.
    tiv_usd = find(self.PATTERN_TIV_USD)
    tiv_cny = find(self.PATTERN_TIV_CNY)
    tiv_amount: Optional[float] = None
    tiv_currency: Optional[str] = None
    if tiv_usd:
        tiv_amount = self._money_to_float(tiv_usd)
        tiv_currency = "USD"
    elif tiv_cny:
        tiv_amount = self._money_to_float(tiv_cny)
        tiv_currency = "CNY"

    # Status
    status = find(self.PATTERN_STATUS, icase)
    status_reason = find(self.PATTERN_STATUS_REASON, icase)
    project_probability = find(self.PATTERN_PROJECT_PROBABILITY, icase)

    # Timeline
    last_update = find(self.PATTERN_LAST_UPDATE)
    initial_release = find(self.PATTERN_INITIAL_RELEASE)
    pec_timing = find(self.PATTERN_PEC_TIMING, icase)
    pec_activity = find(self.PATTERN_PEC_ACTIVITY, icase)

    # Location
    address = find(self.PATTERN_LOCATION, icase)
    city_state_line = find(self.PATTERN_CITY_STATE, icase)
    zone_county = find(self.PATTERN_ZONE_COUNTY, icase)
    phone = find(self.PATTERN_PHONE)

    # Plant info
    plant_owner = find(self.PATTERN_PLANT_OWNER, icase)
    plant_parent = find(self.PATTERN_PLANT_PARENT, icase)
    plant_name = find(self.PATTERN_PLANT_NAME, icase)
    plant_id = find(self.PATTERN_PLANT_ID)
    unit_name = find(self.PATTERN_UNIT_NAME, icase)

    # Contacts
    project_manager, project_manager_company, project_manager_email = (
        self._extract_project_manager(normalized)
    )
    engineer_company = find(self.PATTERN_ENGINEER, icase)
    ec_firm = find(self.PATTERN_EC_FIRM, icase)

    # Technical
    scope_text = find(self.PATTERN_SCOPE, icase_dotall)
    project_capacity = find(self.PATTERN_PROJECT_CAPACITY, icase)
    environmental = find(self.PATTERN_ENVIRONMENTAL, icase)
    construction_labor = find(self.PATTERN_CONSTRUCTION_LABOR, icase)
    operations_labor = find(self.PATTERN_OPERATIONS_LABOR, icase)
    fuel_type = find(self.PATTERN_FUEL_TYPE, icase)

    # Schedule/details: try the primary schedule pattern, then the fallback.
    schedule_text = find(self.PATTERN_SCHEDULE, icase_dotall)
    if not schedule_text:
        schedule_text = find(self.PATTERN_SCHEDULE_FALLBACK, icase_dotall)
    details_text = find(self.PATTERN_DETAILS, icase_dotall)

    # The denominator is derived from the list so the log line cannot drift
    # out of sync when key fields are added or removed.
    key_fields = [
        project_id, project_name, industry_code, project_type, sector,
        tiv_amount, status, plant_owner, project_manager, scope_text,
        schedule_text, pec_timing, pec_activity,
    ]
    extracted_count = sum(1 for value in key_fields if value is not None)
    logger.info(f"Extracted {extracted_count}/{len(key_fields)} key fields from {source_name}")

    return ProjectRecord(
        source=source_name,
        project_id=project_id,
        project_name=project_name,
        industry_code=industry_code,
        project_type=project_type,
        sector=sector,
        sic_code=sic_code,
        sic_product=sic_product,
        tiv_amount=tiv_amount,
        tiv_currency=tiv_currency,
        status=status,
        status_reason=status_reason,
        project_probability=project_probability,
        last_update=last_update,
        initial_release=initial_release,
        pec_timing=pec_timing,
        pec_activity=pec_activity,
        address=address,
        city_state_line=city_state_line,
        zone_county=zone_county,
        plant_owner=plant_owner,
        plant_parent=plant_parent,
        plant_name=plant_name,
        plant_id=plant_id,
        unit_name=unit_name,
        project_manager=project_manager,
        project_manager_company=project_manager_company,
        project_manager_email=project_manager_email,
        engineer_company=engineer_company,
        ec_firm=ec_firm,
        phone=phone,
        scope_text=scope_text,
        project_capacity=project_capacity,
        environmental=environmental,
        construction_labor=construction_labor,
        operations_labor=operations_labor,
        fuel_type=fuel_type,
        schedule_text=schedule_text,
        details_text=details_text,
    )
245
+
246
def extract_milestones(self, schedule_text: Optional[str]) -> List[Milestone]:
    """Pull milestone-like "name date" pairs out of schedule text.

    Each match of ``PATTERN_MILESTONE`` becomes a Milestone unless its name
    is shorter than three characters or is a bare filler word. When nothing
    qualifies but the text is non-blank, a single catch-all "Schedule"
    milestone carrying the first 200 characters is returned instead.
    """
    if not schedule_text:
        return []

    filler_words = ("the", "and", "for", "with")
    found: List[Milestone] = []

    for hit in self._get_pattern(self.PATTERN_MILESTONE).finditer(schedule_text):
        label = hit.group("name").strip()
        when = hit.group("date").strip()
        if len(label) < 3 or label.lower() in filler_words:
            continue
        # Keep a little surrounding text as context for the milestone.
        context_start = max(0, hit.start()-50)
        context_end = hit.end()+20
        found.append(Milestone(
            name=label,
            date_text=when,
            sentence=schedule_text[context_start:context_end].strip(),
        ))

    if found:
        return found

    whole = schedule_text.strip()
    if whole:
        found.append(Milestone(name="Schedule", date_text="", sentence=whole[:200]))
    return found
268
+
269
def derive_challenges(self, record: ProjectRecord) -> List[str]:
    """Collect candidate challenge/constraint statements from a record.

    Sources, in order: the status reason, the details text, a fixed note
    when the schedule text mentions any ``CHALLENGE_KEYWORDS`` term, and a
    fixed note when the status is "cancelled" (case-insensitive). Entries
    are stripped, blanks dropped, and duplicates removed keeping first
    occurrence order.
    """
    raw: List[str] = []

    reason = record.status_reason
    if reason:
        raw.append(f"Status reason: {reason}")

    details = record.details_text
    if details:
        raw.append(details)

    schedule = record.schedule_text
    if schedule and re.search(self.CHALLENGE_KEYWORDS, schedule, re.IGNORECASE):
        raw.append("Dependencies / commercial gating mentioned in schedule (funding, partners, RFQs/bids).")

    status = record.status
    if status and status.lower() == "cancelled":
        raw.append("Project status is Cancelled.")

    # dict.fromkeys keeps first-seen order while removing duplicates.
    stripped = (entry.strip() for entry in raw)
    return list(dict.fromkeys(entry for entry in stripped if entry))
290
+
291
def parse_city_state_country(self, city_state_line: Optional[str]) -> GeoComponents:
    """Split a "City, State Postal Country" line into GeoComponents.

    An empty or missing line yields a default GeoComponents. A line that
    does not fit ``PATTERN_GEO`` is returned whole as the ``city``.
    """
    if not city_state_line:
        return GeoComponents()

    trimmed = city_state_line.strip()
    hit = self._get_pattern(self.PATTERN_GEO).match(trimmed)
    if hit is None:
        return GeoComponents(city=trimmed)

    # All four named groups are mandatory in PATTERN_GEO, so each is a str.
    parts = {
        field: hit.group(field).strip()
        for field in ("city", "state", "postal", "country")
    }
    return GeoComponents(**parts)
309
+
310
+
311
# Module-wide parser instance, created on the first get_parser() call.
_default_parser: Optional[ProjectReportParser] = None


def get_parser() -> ProjectReportParser:
    """Return the shared ProjectReportParser, building it on first use."""
    global _default_parser
    if _default_parser is not None:
        return _default_parser
    _default_parser = ProjectReportParser()
    return _default_parser
src/parsers/smart_chunker.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Smart chunking for semi-structured project reports."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from langchain.schema import Document
9
+
10
+
11
class SemanticChunker:
    """Section-aware chunking that respects document structure.

    Text is first split at lines that begin with a known section keyword,
    then each section is packed into sentence-aligned chunks. Chunks carry
    a ``[header]`` prefix plus ``source``, ``section``, ``chunk_size`` and
    ``density`` metadata.
    """

    SECTION_PATTERNS = [
        r"^(?:Project\s+)?(?:ID|Name|Summary|Overview)",
        r"^(?:Budget|TIV|Investment|Cost)",
        r"^(?:Schedule|Timeline|Milestones?)",
        r"^(?:Location|Site|Address)",
        r"^(?:Status|Progress|Update)",
        r"^(?:Details?|Description|Scope)",
        r"^(?:Challenge|Risk|Issue|Constraint)",
        r"^(?:Engineering|Construction|Procurement)",
        r"^(?:Environmental|Regulatory|Permit)",
    ]

    DENSE_INDICATORS = [
        r'\$[\d,]+',
        r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
        r'\b[A-Z]{2,}\b',
        r'\d+\s*(?:MW|GW|tons?|MT|units?|km|miles?)',
    ]

    # Sentence boundary: whitespace that follows ., ! or ?
    _SENTENCE_SPLIT = r'(?<=[.!?])\s+'

    def __init__(
        self,
        max_chunk_size: int = 1200,
        min_chunk_size: int = 200,
        overlap_sentences: int = 2,
    ) -> None:
        """Configure chunk sizing.

        Args:
            max_chunk_size: Upper bound (characters) targeted for a chunk.
            min_chunk_size: Trailing chunks shorter than this are merged
                into the preceding chunk rather than emitted on their own.
            overlap_sentences: Sentences repeated at the start of the next
                chunk; 0 disables overlap.
        """
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.overlap_sentences = overlap_sentences
        self._section_pattern = re.compile(
            "|".join(f"({p})" for p in self.SECTION_PATTERNS),
            re.IGNORECASE | re.MULTILINE
        )

    def _detect_sections(self, text: str) -> List[Dict[str, Any]]:
        """Identify section boundaries in document.

        Returns one dict per section with ``header``, ``start``, ``end``
        and stripped ``content``. Non-blank text ahead of the first header
        is kept as a leading "Document" section so it is not lost; text
        with no headers at all becomes a single "Document" section.
        """
        sections: List[Dict[str, Any]] = []
        matches = list(self._section_pattern.finditer(text))

        if matches:
            first_start = matches[0].start()
            preamble = text[:first_start].strip()
            if preamble:
                sections.append({
                    "header": "Document",
                    "start": 0,
                    "end": first_start,
                    "content": preamble,
                })

        for i, match in enumerate(matches):
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            sections.append({
                "header": match.group().strip(),
                "start": start,
                "end": end,
                "content": text[start:end].strip()
            })

        if not sections:
            sections.append({
                "header": "Document",
                "start": 0,
                "end": len(text),
                "content": text.strip()
            })

        return sections

    def _calculate_density(self, text: str) -> float:
        """Calculate information density of text (matches per 100 chars)."""
        total_matches = sum(len(re.findall(p, text)) for p in self.DENSE_INDICATORS)
        return (total_matches / max(len(text), 1)) * 100

    def _optimal_chunk_size(self, text: str) -> int:
        """Pick a chunk size from content density, capped at ``max_chunk_size``.

        Denser text gets smaller chunks (600 / 900 / 1200 characters); the
        result never exceeds the limit this instance was configured with.
        """
        density = self._calculate_density(text)
        if density > 5:
            tier = 600
        elif density > 2:
            tier = 900
        else:
            tier = 1200
        return min(tier, self.max_chunk_size)

    def _make_chunk(self, header: str, chunk_text: str, source: str) -> Document:
        """Build one chunk Document with the standard prefix and metadata."""
        return Document(
            page_content=f"[{header}] {chunk_text}",
            metadata={
                "source": source,
                "section": header,
                "chunk_size": len(chunk_text),
                "density": self._calculate_density(chunk_text),
            }
        )

    def _split_section(
        self,
        section: Dict[str, Any],
        source: str,
        chunk_size: Optional[int] = None
    ) -> List[Document]:
        """Split a section into appropriately sized chunks.

        A section that fits within the chunk size is returned as a single
        chunk. Otherwise sentences are accumulated until adding the next
        one would exceed the limit; the chunk is emitted and the last
        ``overlap_sentences`` sentences are carried into the next chunk.
        """
        content = section["content"]
        header = section["header"]
        effective_chunk_size = chunk_size or self.max_chunk_size

        if len(content) <= effective_chunk_size:
            return [self._make_chunk(header, content, source)]

        sentences = re.split(self._SENTENCE_SPLIT, content)
        chunk_texts: List[str] = []
        current: List[str] = []
        current_length = 0
        # Number of leading sentences in `current` that were already part
        # of the previously emitted chunk (the overlap).
        carried = 0

        for sentence in sentences:
            sentence_len = len(sentence)

            if current and current_length + sentence_len > effective_chunk_size:
                chunk_texts.append(" ".join(current))
                # Guard the slice: current[-0:] would keep the whole chunk.
                if self.overlap_sentences > 0:
                    current = current[-self.overlap_sentences:]
                else:
                    current = []
                carried = len(current)
                current_length = sum(len(s) for s in current)

            current.append(sentence)
            current_length += sentence_len

        if current:
            tail_text = " ".join(current)
            if len(tail_text) >= self.min_chunk_size or not chunk_texts:
                chunk_texts.append(tail_text)
            else:
                # Too short to stand alone: append only the sentences not
                # already present in the previous chunk, so nothing is lost.
                fresh = current[carried:]
                if fresh:
                    chunk_texts[-1] = " ".join([chunk_texts[-1], *fresh])

        return [self._make_chunk(header, piece, source) for piece in chunk_texts]

    def chunk_document(self, text: str, source: str, adaptive: bool = True) -> List[Document]:
        """Chunk document respecting section boundaries.

        With ``adaptive`` set, each section's chunk size is chosen from its
        density; otherwise ``max_chunk_size`` is used throughout.
        """
        sections = self._detect_sections(text)
        all_chunks: List[Document] = []

        for section in sections:
            chunk_size = self._optimal_chunk_size(section["content"]) if adaptive else self.max_chunk_size
            chunks = self._split_section(section, source, chunk_size)
            all_chunks.extend(chunks)

        return all_chunks

    def chunk_pages(self, pages: List[Document], adaptive: bool = True) -> List[Document]:
        """Chunk a list of page Documents.

        Pages are concatenated (separated by blank lines), chunked as one
        document, and each chunk is tagged with the 1-based ``page`` on
        which its opening text is found. Chunks whose opening text cannot
        be located keep no ``page`` entry.
        """
        if not pages:
            return []

        source = pages[0].metadata.get("source", "document.pdf")
        page_boundaries: List[int] = []
        pieces: List[str] = []
        offset = 0

        for page in pages:
            page_boundaries.append(offset)
            piece = page.page_content + "\n\n"
            pieces.append(piece)
            offset += len(piece)
        full_text = "".join(pieces)

        chunks = self.chunk_document(full_text, source, adaptive)

        # Chunks come out in document order, so searching from the previous
        # chunk's start avoids matching an earlier duplicate of the text.
        cursor = 0
        for chunk in chunks:
            body = chunk.page_content
            tag = f"[{chunk.metadata.get('section', '')}] "
            if body.startswith(tag):
                body = body[len(tag):]

            # Probe with the first sentence only: sentences are verbatim
            # slices of the source, whereas the joins between them are not.
            probe = re.split(self._SENTENCE_SPLIT, body, maxsplit=1)[0][:50]
            if not probe:
                continue

            chunk_start = full_text.find(probe, cursor)
            if chunk_start < 0:
                chunk_start = full_text.find(probe)
            if chunk_start < 0:
                continue
            cursor = max(cursor, chunk_start)

            # Page number = count of page start offsets at or before the hit.
            chunk.metadata["page"] = sum(
                1 for boundary in page_boundaries if boundary <= chunk_start
            )

        return chunks
187
+
188
+
189
# Module-wide chunker instance, created on the first get_chunker() call.
_default_chunker: Optional[SemanticChunker] = None


def get_chunker() -> SemanticChunker:
    """Return the shared SemanticChunker, building it on first use."""
    global _default_chunker
    if _default_chunker is not None:
        return _default_chunker
    _default_chunker = SemanticChunker()
    return _default_chunker
src/services/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core services for GraphRAG application."""
2
+
3
+ from src.services.neo4j_service import Neo4jService, Neo4jConnectionError
4
+ from src.services.builder import GraphRAGBuilder
5
+ from src.services.answerer import QueryAnswerer
6
+ from src.services.retriever import OptimizedRetriever
7
+ from src.services.cache import QueryCache, AnswerCache, get_query_cache, get_answer_cache
8
+
9
+ __all__ = [
10
+ "Neo4jService",
11
+ "Neo4jConnectionError",
12
+ "GraphRAGBuilder",
13
+ "QueryAnswerer",
14
+ "OptimizedRetriever",
15
+ "QueryCache",
16
+ "AnswerCache",
17
+ "get_query_cache",
18
+ "get_answer_cache",
19
+ ]
src/services/answerer.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Query answering service with hybrid strategy."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional, Set, Tuple
6
+
7
+ from langchain.schema import Document
8
+
9
+ from src.config import get_logger, trace_flow, log_step
10
+ from src.models.state import AppState
11
+ from src.services.retriever import OptimizedRetriever
12
+ from src.services.cache import AnswerCache, get_answer_cache
13
+ from src.services.cypher_templates import (
14
+ CypherTemplateRouter,
15
+ TemplateResultFormatter,
16
+ QueryIntent,
17
+ )
18
+
19
+ # Module logger
20
+ logger = get_logger(__name__)
21
+
22
+
23
+ class QueryAnswerer:
24
+ """Answers user questions using an optimized hybrid strategy.
25
+
26
+ Strategy:
27
+ 1) Template-first routing: Pattern matching classifies intent and
28
+ executes pre-validated Cypher templates for most queries.
29
+ This is deterministic, fast, and reliable.
30
+
31
+ 2) For general queries: GraphRAG with optimized retrieval:
32
+ - Pattern-based query expansion (no LLM)
33
+ - Cross-encoder reranking (faster than LLM)
34
+ - Single LLM call for synthesis only
35
+ """
36
+
37
+ # Default retrieval settings
38
+ DEFAULT_K = 6
39
+
40
+ # Optimized synthesis prompt (simpler, more focused)
41
+ SYNTHESIS_PROMPT = """You are an expert analyst for industrial project reports.
42
+
43
+ ## Question
44
+ {question}
45
+
46
+ ## Retrieved Document Excerpts
47
+ {context}
48
+
49
+ ## Graph Database Context
50
+ {graph_context}
51
+
52
+ ## Instructions
53
+ 1. Answer directly and concisely based on the evidence
54
+ 2. If information is incomplete, acknowledge what's missing
55
+ 3. For comparison questions, structure answer by project
56
+ 4. Use citations like [1], [2] to reference sources
57
+ 5. For challenges/risks, consider: cancellation reasons, delays, funding issues, permitting
58
+
59
+ Answer:""".strip()
60
+
61
def __init__(
    self,
    k: int = DEFAULT_K,
    use_optimized_retrieval: bool = True,
    use_caching: bool = True,
    cache_ttl: float = 3600,
    use_reranking: bool = True,
) -> None:
    """Set up the answerer's retrieval, routing and caching options.

    Args:
        k: How many chunks the final retrieval step should return.
        use_optimized_retrieval: When True, ``_graphrag_answer`` goes
            through the OptimizedRetriever; when False it calls the vector
            store's plain similarity search.
        use_caching: When True, answers are stored in and served from the
            shared answer cache.
        cache_ttl: Time-to-live handed to the answer cache as its default
            (presumably seconds; confirm against AnswerCache).
        use_reranking: Passed to the OptimizedRetriever as its reranking
            switch.
    """
    # Plain configuration flags.
    self.k = k
    self.use_optimized_retrieval = use_optimized_retrieval
    self.use_caching = use_caching
    self.use_reranking = use_reranking

    # The retriever is created later, on first use, by _get_retriever.
    self._retriever: Optional[OptimizedRetriever] = None
    self._cache: Optional[AnswerCache] = None

    # Router that classifies intent and runs the Cypher templates.
    self._template_router = CypherTemplateRouter()

    if not use_caching:
        return
    self._cache = get_answer_cache(default_ttl=cache_ttl)
91
+
92
+ def _format_citations(self, docs: List[Document]) -> str:
93
+ """Format unique citations from retrieved chunk documents.
94
+
95
+ Args:
96
+ docs: List of retrieved documents.
97
+
98
+ Returns:
99
+ Formatted citation string.
100
+ """
101
+ seen: Set[Tuple[str, Optional[int]]] = set()
102
+ lines: List[str] = []
103
+
104
+ for doc in docs:
105
+ src = doc.metadata.get("source", "")
106
+ page = doc.metadata.get("page", None)
107
+ key = (src, page)
108
+
109
+ if key in seen:
110
+ continue
111
+ seen.add(key)
112
+
113
+ if page is not None:
114
+ lines.append(f"- {src} p.{page}")
115
+ else:
116
+ lines.append(f"- {src}")
117
+
118
+ return "\n".join(lines)
119
+
120
+ def _format_budget_value(
121
+ self,
122
+ budget: Optional[Any],
123
+ currency: Optional[str]
124
+ ) -> str:
125
+ """Format budget value for display.
126
+
127
+ Args:
128
+ budget: Budget amount (may be None or numeric).
129
+ currency: Currency code.
130
+
131
+ Returns:
132
+ Formatted budget string.
133
+ """
134
+ if isinstance(budget, (int, float)) and currency:
135
+ return f"{budget:,.0f} {currency}"
136
+ elif budget:
137
+ return str(budget)
138
+ return "—"
139
+
140
+ def _format_location(self, row: Dict[str, Any]) -> str:
141
+ """Format location components into a string.
142
+
143
+ Args:
144
+ row: Query result row with location fields.
145
+
146
+ Returns:
147
+ Formatted location string.
148
+ """
149
+ loc_parts = [
150
+ x for x in [
151
+ row.get("address"),
152
+ row.get("city"),
153
+ row.get("state"),
154
+ row.get("postal"),
155
+ row.get("country"),
156
+ ] if x
157
+ ]
158
+ return ", ".join(loc_parts) if loc_parts else "—"
159
+
160
+ def _budget_location(self, graph: Any) -> str:
161
+ """Deterministic answer for budget allocation and location.
162
+
163
+ Args:
164
+ graph: Neo4jGraph instance.
165
+
166
+ Returns:
167
+ Formatted budget and location answer.
168
+ """
169
+ rows = graph.query(self.CYPHER_BUDGET_LOCATION)
170
+
171
+ if not rows:
172
+ return "No structured budget/location data found in the graph yet."
173
+
174
+ out = ["**Budget allocation (TIV) and location**"]
175
+ for row in rows:
176
+ budget_str = self._format_budget_value(
177
+ row.get("budget"),
178
+ row.get("currency"),
179
+ )
180
+ loc = self._format_location(row)
181
+ out.append(f"- **{row.get('project')}**: {budget_str}; {loc}")
182
+
183
+ return "\n".join(out)
184
+
185
def _timelines(self, graph: Any) -> str:
    """Deterministic timeline comparison using extracted milestones.

    Runs the ``CYPHER_TIMELINES`` query and lists up to 14 milestones per
    project. That constant is not defined on this class in the visible
    source, so it is looked up defensively: when it is missing the method
    logs a warning and returns the "no data" message instead of raising
    AttributeError.

    Args:
        graph: Neo4jGraph instance.

    Returns:
        Formatted timeline answer.
    """
    empty_message = "No structured timeline data found in the graph yet."

    query = getattr(self, "CYPHER_TIMELINES", None)
    if not query:
        logger.warning("CYPHER_TIMELINES is not defined; skipping timeline query")
        return empty_message

    rows = graph.query(query)
    logger.info(f"Timeline query returned {len(rows) if rows else 0} rows")

    if not rows:
        return empty_message

    out = ["**Timelines (milestones extracted from Schedule)**"]
    for row in rows:
        project_name = row.get('project') or 'Unknown Project'
        out.append(f"\n### {project_name}")
        milestones = row.get("milestones") or []
        logger.info(f"Project '{project_name}': {len(milestones)} milestones raw")

        # Filter out null milestones (from OPTIONAL MATCH returning nulls)
        valid_milestones = [m for m in milestones if m and m.get("name")]
        logger.info(f"Project '{project_name}': {len(valid_milestones)} valid milestones")

        if not valid_milestones:
            out.append("- No milestones extracted")
            continue

        for m in valid_milestones[:14]:  # Limit display
            dt = (m.get("dateText") or "").strip()
            nm = (m.get("name") or "Milestone").strip()
            if dt:
                out.append(f"- {nm}: {dt}")
            else:
                # No date text: fall back to the start of the source sentence.
                sent = m.get('sentence') or ''
                out.append(f"- {nm}: {sent[:100]}")

    result = "\n".join(out)
    logger.info(f"Timeline result: {len(result)} chars")
    return result
226
+
227
+ def _challenges(self, graph: Any) -> str:
228
+ """Deterministic challenges listing from structured Challenge nodes.
229
+
230
+ Args:
231
+ graph: Neo4jGraph instance.
232
+
233
+ Returns:
234
+ Formatted challenges answer.
235
+ """
236
+ rows = graph.query(self.CYPHER_CHALLENGES)
237
+
238
+ if not rows:
239
+ return "No structured challenges found yet."
240
+
241
+ out = [
242
+ "**Potential challenges / constraints "
243
+ "(from Status reason + Details + schedule heuristics)**"
244
+ ]
245
+ for row in rows:
246
+ out.append(f"\n### {row['project']}")
247
+ challenges = [x for x in (row.get("challenges") or []) if x]
248
+
249
+ if not challenges:
250
+ out.append("- —")
251
+ else:
252
+ for ch in challenges[:14]: # Limit display
253
+ out.append(f"- {ch}")
254
+
255
+ return "\n".join(out)
256
+
257
+ def _get_graph_context(self, question: str, graph: Any) -> str:
258
+ """Get relevant graph context without LLM Cypher generation.
259
+
260
+ Uses simple pattern matching to find related entities.
261
+
262
+ Args:
263
+ question: User question
264
+ graph: Neo4j graph instance
265
+
266
+ Returns:
267
+ Formatted graph context string
268
+ """
269
+ import re
270
+
271
+ # Extract potential project names from question
272
+ potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)
273
+
274
+ if not potential_names:
275
+ return ""
276
+
277
+ context_parts = []
278
+
279
+ for name in potential_names[:2]:
280
+ try:
281
+ results = graph.query("""
282
+ MATCH (p:Project)
283
+ WHERE toLower(p.name) CONTAINS toLower($name)
284
+ OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
285
+ OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
286
+ RETURN p.name AS project,
287
+ p.status AS status,
288
+ b.amount AS budget,
289
+ b.currency AS currency,
290
+ l.city AS city,
291
+ l.country AS country
292
+ LIMIT 3
293
+ """, {"name": name.lower()})
294
+
295
+ for r in results:
296
+ parts = [f"**{r['project']}**"]
297
+ if r.get('status'):
298
+ parts.append(f"Status: {r['status']}")
299
+ if r.get('budget'):
300
+ parts.append(f"Budget: {r['budget']:,.0f} {r.get('currency', '')}")
301
+ if r.get('city'):
302
+ parts.append(f"Location: {r['city']}, {r.get('country', '')}")
303
+ context_parts.append(" | ".join(parts))
304
+
305
+ except Exception:
306
+ pass
307
+
308
+ return "\n".join(context_parts) if context_parts else ""
309
+
310
def _get_retriever(self, state: AppState) -> OptimizedRetriever:
    """Get or create the optimized retriever.

    The retriever is cached, but it is rebuilt whenever ``state.vector``
    is a different object from the one the cached retriever was created
    with, so a later state (e.g. after re-ingestion) is not answered from
    the earlier vector store.

    Args:
        state: Application state with vector store.

    Returns:
        OptimizedRetriever instance bound to ``state.vector``.
    """
    # getattr keeps this safe on instances created before the tracking
    # attribute existed (it is not set in __init__).
    bound_store = getattr(self, "_retriever_vector_store", None)

    if self._retriever is None or bound_store is not state.vector:
        self._retriever = OptimizedRetriever(
            vector_store=state.vector,
            k_initial=self.k * 2,  # Retrieve more initially for reranking
            k_final=self.k,
            use_expansion=True,
            use_reranking=self.use_reranking,
            use_cache=True,
        )
        self._retriever_vector_store = state.vector

    return self._retriever
329
+
330
+ def _format_context(self, docs: List[Document]) -> str:
331
+ """Format retrieved documents into context string.
332
+
333
+ Args:
334
+ docs: List of retrieved documents.
335
+
336
+ Returns:
337
+ Formatted context string with source attribution.
338
+ """
339
+ context_parts = []
340
+ for i, doc in enumerate(docs, 1):
341
+ source = doc.metadata.get('source', 'Unknown')
342
+ page = doc.metadata.get('page', '?')
343
+ section = doc.metadata.get('section', '')
344
+
345
+ header = f"[{i}] Source: {source}, Page {page}"
346
+ if section:
347
+ header += f", Section: {section}"
348
+
349
+ context_parts.append(f"{header}\n{doc.page_content}")
350
+
351
+ return "\n\n---\n\n".join(context_parts)
352
+
353
def _graphrag_answer(
    self,
    question: str,
    state: AppState,
) -> str:
    """Answer a general question from retrieved chunks plus graph context.

    Steps:
        1. Retrieve chunks (optimized retriever, or plain similarity search
           when ``use_optimized_retrieval`` is off).
        2. Look up graph context for the question.
        3. Fill ``SYNTHESIS_PROMPT`` and make one LLM call.
        4. Store the answer in the cache when caching is enabled.

    Args:
        question: User question.
        state: Application state providing ``vector``, ``llm`` and
            ``get_graph()``.

    Returns:
        The LLM response's ``content`` attribute, or ``str(response)`` when
        it has none.
    """
    with log_step(logger, "GraphRAG answer generation"):
        # Step 1: fetch the supporting chunks.
        with log_step(logger, "Retrieve relevant chunks"):
            if not self.use_optimized_retrieval:
                logger.substep("Using simple similarity search")
                docs = state.vector.similarity_search(question, k=self.k)
            else:
                logger.substep("Using optimized retrieval (pattern expansion + cross-encoder)")
                docs = self._get_retriever(state).retrieve(question)
            logger.info(f"Retrieved {len(docs)} chunks")

        # Step 2: structured context from the graph (no LLM involved).
        with log_step(logger, "Get graph context"):
            graph_context = self._get_graph_context(question, state.get_graph())
            logger.substep(
                "Found graph context" if graph_context else "No direct graph context found"
            )

        context = self._format_context(docs)

        # Step 3: the single LLM call.
        with log_step(logger, "Synthesize answer"):
            logger.substep("Invoking LLM for synthesis")
            prompt_text = self.SYNTHESIS_PROMPT.format(
                question=question,
                context=context,
                graph_context=graph_context or "(No structured data found)",
            )
            response = state.llm.invoke(prompt_text)
            answer = getattr(response, "content", str(response))

        # Step 4: remember the answer for repeated questions.
        caching_active = self._cache and self.use_caching
        if caching_active:
            logger.substep("Caching answer")
            self._cache.set_answer(
                query=question,
                answer=answer,
                documents=docs,
                cypher_result=graph_context,
            )

        return answer
419
+
420
def clear_cache(self) -> int:
    """Drop every cached answer.

    Returns:
        Whatever the cache's ``invalidate_all()`` reports, or 0 when no
        cache is attached.
    """
    cache = self._cache
    return cache.invalidate_all() if cache else 0
429
+
430
def get_cache_stats(self) -> Dict[str, Any]:
    """Report cache metrics.

    Returns:
        The cache's ``get_stats()`` result, or
        ``{"caching_enabled": False}`` when no cache is attached.
    """
    cache = self._cache
    if not cache:
        return {"caching_enabled": False}
    return cache.get_stats()
439
+
440
@trace_flow("Query Processing")
def answer(self, question: str, state: AppState) -> str:
    """Answer a question, preferring cached and template results over RAG.

    Order of attempts:
        1. Return the cached answer when one exists.
        2. Route the question through the Cypher template router; for any
           non-general intent with a result, format it without an LLM.
        3. Otherwise fall back to ``_graphrag_answer``.

    Args:
        question: Natural language user query.
        state: AppState initialized after successful ingestion.

    Returns:
        Markdown response suitable for display, or a prompt to ingest PDFs
        when ``state`` is missing or not ready.
    """
    logger.info(f"Processing question: {question[:80]}...")

    if not state or not state.is_ready():
        logger.warning("State not ready - PDFs not ingested")
        return "Please ingest PDFs first."

    caching_active = self._cache and self.use_caching

    # 1) Cached answer, if any.
    if caching_active:
        with log_step(logger, "Check cache"):
            hit = self._cache.get_answer(question)
            if hit:
                logger.info("Cache hit")
                return hit.answer

    graph = state.get_graph()

    # 2) Template routing first (handles 70-80% of queries).
    with log_step(logger, "Template routing"):
        results, intent = self._template_router.route_query(question, graph)

    needs_rag = intent == QueryIntent.GENERAL or results is None
    if needs_rag:
        # 3) GraphRAG fallback for general queries.
        logger.info(f"Intent: {intent.value} - using RAG fallback")
        rag_answer = self._graphrag_answer(question, state)
        logger.info("RAG answer generated")
        return rag_answer

    # Template results are formatted directly (no LLM needed).
    template_answer = TemplateResultFormatter.format(results, intent)

    if caching_active:
        self._cache.set_answer(
            query=question,
            answer=template_answer,
            documents=[],
            cypher_result=str(results[:3]) if results else "",
        )

    logger.info(f"Template answer (intent: {intent.value})")
    return template_answer
src/services/builder.py ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GraphRAG builder for PDF ingestion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import time
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ from typing import Any, Dict, Generator, List, Optional, Tuple
9
+
10
+ from langchain.schema import Document
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.prompts import PromptTemplate
13
+
14
+ from src.config import get_logger, trace_flow, log_step
15
+
16
+ # LangChain imports with compatibility handling
17
+ try:
18
+ from langchain_community.document_loaders import PyPDFLoader
19
+ from langchain_community.vectorstores import Neo4jVector
20
+ except ImportError:
21
+ from langchain.document_loaders import PyPDFLoader
22
+ from langchain.vectorstores import Neo4jVector
23
+
24
+ from langchain_experimental.graph_transformers import LLMGraphTransformer
25
+ from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain
26
+ from langchain_together import ChatTogether, TogetherEmbeddings
27
+
28
+ from src.config.schema import SchemaPolicy
29
+ from src.config.settings import Neo4jConfig, TogetherAIConfig
30
+ from src.models.state import AppState
31
+ from src.parsers.project_parser import ProjectReportParser
32
+ from src.parsers.smart_chunker import SemanticChunker
33
+ from src.services.neo4j_service import Neo4jService, Neo4jConnectionError
34
+
35
# Module-level logger obtained from the project's get_logger helper, keyed by
# this module's dotted name. Code below also calls logger.substep(), so the
# returned object is expected to provide that method in addition to the
# standard logging calls.
logger = get_logger(__name__)
37
+
38
+
39
class GraphRAGBuilder:
    """Build and populate Neo4j-backed GraphRAG resources from uploaded PDFs.

    Responsibilities:
        - Configure Together AI chat + embeddings models.
        - Parse PDFs into pages and chunks with provenance metadata.
        - Upsert deterministic structured graph nodes for stable Q/A.
        - Run LLMGraphTransformer for broader entity/relationship extraction.
        - Create/refresh Neo4jVector hybrid indexes.
        - Create GraphCypherQAChain for graph-native Q/A.

    The builder holds only its models, parser and chunker; every ingestion
    call returns a fresh AppState for query-time usage.

    Attributes:
        llm: Chat model instance.
        embeddings: Embeddings model instance.

    Example:
        >>> builder = GraphRAGBuilder(
        ...     together_config=TogetherAIConfig(api_key="key")
        ... )
        >>> message, state = builder.ingest(pdf_files, neo4j_config)
    """

    # Chunk configuration
    DEFAULT_CHUNK_SIZE = 900
    DEFAULT_CHUNK_OVERLAP = 150

    # Parallel extraction configuration (optimized for speed)
    EXTRACTION_BATCH_SIZE = 8  # Increased from 5
    MAX_EXTRACTION_WORKERS = 5  # Increased from 3

    # Vector index configuration
    INDEX_NAME = "project_chunks_vector"
    KEYWORD_INDEX_NAME = "project_chunks_keyword"
    NODE_LABEL = "Chunk"

    # Enhanced Cypher QA prompt with examples. {schema} and {question} are the
    # only template variables; see _create_qa_chain.
    CYPHER_PROMPT_TEMPLATE = """You are a Neo4j Cypher expert. Generate a Cypher query to answer the question.

## Schema
{schema}

## Key Patterns

1. **Project with Budget and Location:**
```cypher
MATCH (p:Project)
OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
RETURN p.name, b.amount, b.currency, l.city, l.country
```

2. **Project Milestones/Timeline:**
```cypher
MATCH (p:Project)-[:HAS_MILESTONE]->(m:Milestone)
RETURN p.name, m.name AS milestone, m.dateText
ORDER BY p.name, m.dateText
```

3. **Challenges and Risks:**
```cypher
MATCH (p:Project)-[:HAS_CHALLENGE]->(c:Challenge)
RETURN p.name, collect(c.text) AS challenges
```

4. **Cross-Project Comparison:**
```cypher
MATCH (p:Project)
OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
WITH p, b, collect(m) AS milestones
RETURN p.name, b.amount, size(milestones) AS milestone_count
ORDER BY b.amount DESC
```

5. **Entity Relationships:**
```cypher
MATCH (p:Project)-[r]->(related)
WHERE NOT related:Chunk
RETURN p.name, type(r) AS relationship, labels(related)[0] AS entity_type,
       coalesce(related.name, related.text, related.amount) AS value
LIMIT 50
```

## Rules
- Use OPTIONAL MATCH when relationships may not exist
- Always include ORDER BY for consistent results
- Use collect() to aggregate multiple related nodes
- Limit results if the query could return many rows
- Return human-readable names, not IDs
- For comparisons across projects, ensure all projects are included

## Question
{question}

Return ONLY the Cypher query, no explanation.""".strip()

    def __init__(
        self,
        together_config: Optional[TogetherAIConfig] = None,
        together_api_key: Optional[str] = None,
        chat_model: str = "deepseek-ai/DeepSeek-V3",
        embedding_model: str = "togethercomputer/m2-bert-80M-8k-retrieval",
    ) -> None:
        """Initialize GraphRAG builder.

        Args:
            together_config: Together AI configuration object. When given, its
                api_key is used and its model names override the defaults
                whenever they are non-empty.
            together_api_key: API key (alternative to config object).
            chat_model: Chat model identifier.
            embedding_model: Embedding model identifier.

        Raises:
            ValueError: If no API key is provided.
        """
        if together_config:
            api_key = together_config.api_key
            chat_model = together_config.chat_model or chat_model
            embedding_model = together_config.embedding_model or embedding_model
        else:
            api_key = together_api_key

        if not api_key:
            raise ValueError("Together API key is required.")

        # The models below are constructed without an explicit key, so the key
        # is exported through the process environment first.
        os.environ["TOGETHER_API_KEY"] = api_key

        self.llm = ChatTogether(model=chat_model, temperature=0)
        self.embeddings = TogetherEmbeddings(model=embedding_model)

        self._parser = ProjectReportParser()
        self._chunker = SemanticChunker(
            max_chunk_size=self.DEFAULT_CHUNK_SIZE + 300,  # Slightly larger for semantic chunks
            min_chunk_size=200,
            overlap_sentences=2,
        )

    @staticmethod
    def _resolve_upload(upload: Any) -> Tuple[str, str]:
        """Return (filesystem path, display name) for one uploaded file.

        Accepts either a plain path (str / os.PathLike) or a file-handle-like
        object exposing ``name`` (and optionally ``orig_name``).
        """
        if isinstance(upload, (str, os.PathLike)):
            path = os.fspath(upload)
            return path, os.path.basename(path) or "uploaded.pdf"

        display = (
            getattr(upload, "name", None)
            or getattr(upload, "orig_name", None)
            or "uploaded.pdf"
        )
        # As before, the handle's ``name`` attribute is the path given to the loader.
        return upload.name, os.path.basename(display)

    @staticmethod
    def _resolve_neo4j_config(
        neo4j_config: Optional[Neo4jConfig],
        neo4j_uri: Optional[str],
        neo4j_user: Optional[str],
        neo4j_password: Optional[str],
        neo4j_database: str,
    ) -> Neo4jConfig:
        """Return the given config, or build one from the loose parameters."""
        if neo4j_config is not None:
            return neo4j_config
        return Neo4jConfig(
            uri=neo4j_uri or "",
            username=neo4j_user or "neo4j",
            password=neo4j_password or "",
            database=neo4j_database,
        )

    def _load_pdf_pages(
        self,
        pdf_files: List[Any]
    ) -> Tuple[List[Document], List[Tuple[str, str]]]:
        """Load PDF files and extract pages with metadata.

        Args:
            pdf_files: Uploaded files, either file handles exposing ``name``
                or plain filesystem paths.

        Returns:
            Tuple of (all pages as Documents, list of (source_name, full_text)).
        """
        all_pages: List[Document] = []
        raw_texts: List[Tuple[str, str]] = []

        with log_step(logger, "Load PDF files", f"{len(pdf_files)} file(s)"):
            for upload in pdf_files:
                path, display_name = self._resolve_upload(upload)
                logger.substep(f"Loading: {display_name}")
                pages = PyPDFLoader(path).load()
                all_pages.extend(pages)
                logger.substep(f"Extracted {len(pages)} pages")

                # Full document text, one page per line group, for the
                # deterministic parser.
                joined = "\n".join(p.page_content for p in pages)
                raw_texts.append((display_name, joined))

        logger.info(f"Total pages loaded: {len(all_pages)}")
        return all_pages, raw_texts

    def _create_chunks(
        self,
        pages: List[Document],
        use_semantic_chunking: bool = True,
    ) -> List[Document]:
        """Split pages into chunks with normalized metadata.

        Args:
            pages: List of page Documents.
            use_semantic_chunking: If True, uses section-aware chunking;
                otherwise falls back to RecursiveCharacterTextSplitter.

        Returns:
            List of chunk Documents with metadata; newlines in the chunk text
            are replaced by spaces and ``source`` is reduced to a base name.
        """
        chunking_type = "semantic" if use_semantic_chunking else "character-based"
        with log_step(logger, "Create document chunks", chunking_type):
            if use_semantic_chunking:
                logger.substep("Using section-aware semantic chunking")
                chunks = self._chunker.chunk_pages(pages, adaptive=True)
            else:
                logger.substep("Using RecursiveCharacterTextSplitter")
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=self.DEFAULT_CHUNK_SIZE,
                    chunk_overlap=self.DEFAULT_CHUNK_OVERLAP,
                )
                chunks = splitter.split_documents(pages)

            logger.substep(f"Raw chunks created: {len(chunks)}")

            processed_chunks: List[Document] = []
            for chunk in chunks:
                meta = dict(chunk.metadata or {})
                meta["source"] = os.path.basename(meta.get("source", "")) or "uploaded.pdf"

                # Normalize page numbers (PyPDFLoader uses 0-index).
                # NOTE(review): in semantic mode only page 0 is shifted, which
                # presumably relies on SemanticChunker already emitting
                # 1-based pages for the rest - confirm against the chunker.
                if "page" in meta and isinstance(meta["page"], int):
                    if meta["page"] == 0 or (not use_semantic_chunking):
                        meta["page"] = int(meta["page"]) + 1

                processed_chunks.append(Document(
                    page_content=chunk.page_content.replace("\n", " "),
                    metadata=meta,
                ))

        logger.info(f"Final chunks: {len(processed_chunks)}")
        return processed_chunks

    def _extract_structured_data(
        self,
        neo4j: Neo4jService,
        raw_texts: List[Tuple[str, str]],
    ) -> List[Dict[str, Any]]:
        """Extract and upsert structured project data.

        Args:
            neo4j: Neo4j service instance.
            raw_texts: List of (source_name, full_text) tuples.

        Returns:
            One dictionary per document: the upsert result, or a stub with
            ``projectId``, ``name`` and ``warning`` when the upsert failed.
        """
        projects_created: List[Dict[str, Any]] = []

        with log_step(logger, "Extract structured data", f"{len(raw_texts)} document(s)"):
            for source, full_text in raw_texts:
                logger.substep(f"Parsing: {source}")
                record = self._parser.parse(full_text, source)
                try:
                    proj = neo4j.upsert_structured_project(record)
                    projects_created.append(proj)
                    logger.substep(f"Created project: {proj.get('name', source)}")
                except Exception as e:
                    # Best effort: a failed upsert is reported as a warning so
                    # the remaining documents are still processed.
                    logger.warning(f"Failed to create project {source}: {e}")
                    projects_created.append({
                        "projectId": record.project_id or source,
                        "name": record.project_name or source,
                        "warning": str(e),
                    })

        logger.info(f"Structured extraction complete: {len(projects_created)} project(s)")
        return projects_created

    def _iter_llm_extraction(
        self,
        neo4j: Neo4jService,
        chunks: List[Document],
        parallel: bool = True,
    ) -> Generator[Tuple[int, int], None, None]:
        """Run LLM graph extraction, reporting real progress.

        Yields:
            ``(finished_batches, total_batches)`` each time a batch finishes
            (successfully or not). The sequential path yields ``(1, 1)`` once.
            Nothing is yielded when ``chunks`` is empty.
        """
        if not chunks:
            logger.info("No chunks available; skipping LLM graph extraction")
            return

        logger.substep("Initializing LLMGraphTransformer")
        transformer = LLMGraphTransformer(
            llm=self.llm,
            allowed_nodes=SchemaPolicy.ALLOWED_NODES,
            allowed_relationships=SchemaPolicy.ALLOWED_RELATIONSHIPS,
            node_properties=True,  # Enable property extraction for richer graph
        )

        if not parallel or len(chunks) <= self.EXTRACTION_BATCH_SIZE:
            logger.substep("Using sequential extraction (small chunk set)")
            graph_documents = transformer.convert_to_graph_documents(chunks)
            neo4j.graph.add_graph_documents(graph_documents, include_source=True)
            logger.info(f"Added {len(graph_documents)} graph documents")
            yield 1, 1
            return

        batch_size = self.EXTRACTION_BATCH_SIZE
        batches = [chunks[i:i + batch_size] for i in range(0, len(chunks), batch_size)]
        total = len(batches)
        logger.substep(f"Split into {total} batches ({batch_size} chunks each)")

        all_graph_docs: List[Any] = []
        failed_batches = 0

        # Thread pool: the work is dominated by waiting on LLM calls.
        logger.substep(f"Starting parallel extraction with {self.MAX_EXTRACTION_WORKERS} workers")
        with ThreadPoolExecutor(max_workers=self.MAX_EXTRACTION_WORKERS) as executor:
            # Submit the transformer call directly so that a failing batch
            # raises from future.result() and is counted and logged below,
            # instead of being silently turned into an empty result.
            futures = {
                executor.submit(transformer.convert_to_graph_documents, batch): idx
                for idx, batch in enumerate(batches)
            }

            for finished, future in enumerate(as_completed(futures), start=1):
                batch_idx = futures[future]
                try:
                    # The future is already done here, so no timeout is needed.
                    all_graph_docs.extend(future.result())
                    logger.substep(f"Batch {batch_idx + 1}/{total} complete")
                except Exception as e:
                    failed_batches += 1
                    logger.warning(f"Batch {batch_idx + 1} failed: {e}")
                yield finished, total

        # Single bulk write once every batch has been collected.
        if all_graph_docs:
            logger.substep(f"Adding {len(all_graph_docs)} graph documents to Neo4j")
            neo4j.graph.add_graph_documents(all_graph_docs, include_source=True)

        if failed_batches > 0:
            logger.warning(f"{failed_batches} batch(es) failed during extraction")

        logger.info(f"LLM extraction complete: {len(all_graph_docs)} graph documents")

    def _extract_llm_graph(
        self,
        neo4j: Neo4jService,
        chunks: List[Document],
        parallel: bool = True,
    ) -> None:
        """Extract entities/relationships using LLM and add to graph.

        Args:
            neo4j: Neo4j service instance.
            chunks: Document chunks for extraction.
            parallel: If True, uses parallel batch processing when there are
                more chunks than EXTRACTION_BATCH_SIZE.
        """
        mode = "parallel" if parallel else "sequential"
        with log_step(logger, "LLM graph extraction", f"{len(chunks)} chunks, {mode}"):
            # Drain the progress generator; callers of this method only need
            # the side effects.
            for _ in self._iter_llm_extraction(neo4j, chunks, parallel=parallel):
                pass

    def _create_vector_index(
        self,
        chunks: List[Document],
        neo4j_config: Neo4jConfig,
    ) -> Neo4jVector:
        """Create or refresh vector index for chunks.

        Args:
            chunks: Document chunks to index.
            neo4j_config: Neo4j connection configuration.

        Returns:
            Neo4jVector index instance configured for hybrid search.
        """
        with log_step(logger, "Create vector index", f"{len(chunks)} chunks"):
            logger.substep(f"Index name: {self.INDEX_NAME}")
            logger.substep(f"Keyword index: {self.KEYWORD_INDEX_NAME}")
            logger.substep("Creating hybrid search index (dense + BM25)")

            vector = Neo4jVector.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                url=neo4j_config.uri,
                username=neo4j_config.username,
                password=neo4j_config.password,
                database=neo4j_config.database or "neo4j",
                index_name=self.INDEX_NAME,
                keyword_index_name=self.KEYWORD_INDEX_NAME,
                node_label=self.NODE_LABEL,
                embedding_node_property="embedding",
                search_type="hybrid",
            )

        logger.info("Vector index created successfully")
        return vector

    def _create_qa_chain(self, neo4j: Neo4jService) -> GraphCypherQAChain:
        """Create Cypher QA chain for graph querying.

        Args:
            neo4j: Neo4j service instance.

        Returns:
            GraphCypherQAChain instance using CYPHER_PROMPT_TEMPLATE.
        """
        with log_step(logger, "Create Cypher QA chain"):
            logger.substep("Configuring enhanced Cypher prompt template")
            cypher_prompt = PromptTemplate(
                template=self.CYPHER_PROMPT_TEMPLATE,
                input_variables=["schema", "question"],
            )

            logger.substep("Initializing GraphCypherQAChain")
            chain = GraphCypherQAChain.from_llm(
                llm=self.llm,
                graph=neo4j.graph,
                cypher_prompt=cypher_prompt,
                verbose=False,
                # The chain executes LLM-generated Cypher against the database.
                allow_dangerous_requests=True,
            )

        logger.info("Cypher QA chain ready")
        return chain

    @trace_flow("PDF Ingestion Pipeline")
    def ingest(
        self,
        pdf_files: List[Any],
        neo4j_config: Optional[Neo4jConfig] = None,
        neo4j_uri: Optional[str] = None,
        neo4j_user: Optional[str] = None,
        neo4j_password: Optional[str] = None,
        neo4j_database: str = "neo4j",
        clear_db: bool = True,
    ) -> Tuple[str, AppState]:
        """Ingest one or more PDF reports into Neo4j and build GraphRAG indices.

        Args:
            pdf_files: List of uploaded files (handles or paths).
            neo4j_config: Neo4j configuration object (preferred).
            neo4j_uri: Neo4j connection URI (alternative).
            neo4j_user: Username (alternative).
            neo4j_password: Password (alternative).
            neo4j_database: Database name.
            clear_db: If True, clears existing data prior to ingestion.

        Returns:
            Tuple of (human-readable status message, AppState). On invalid
            input or connection failure the AppState is empty.

        Notes:
            - The ingestion process can be compute-heavy due to LLM graph extraction.
            - Even if the deterministic parser yields partial results, chunk retrieval
              still works.
        """
        if not pdf_files:
            logger.warning("No PDF files provided")
            return "Please upload at least one PDF.", AppState()

        logger.info(f"Starting ingestion of {len(pdf_files)} PDF file(s)")

        neo4j_config = self._resolve_neo4j_config(
            neo4j_config, neo4j_uri, neo4j_user, neo4j_password, neo4j_database
        )

        if not neo4j_config.is_valid():
            logger.error("Invalid Neo4j configuration")
            return "Please provide Neo4j connection details.", AppState()

        with log_step(logger, "Connect to Neo4j"):
            try:
                neo4j = Neo4jService(
                    uri=neo4j_config.uri,
                    user=neo4j_config.username,
                    password=neo4j_config.password,
                    database=neo4j_config.database,
                )
                logger.substep(f"Connected to {neo4j_config.uri}")
            except Neo4jConnectionError as e:
                logger.error(f"Neo4j connection failed: {e}")
                return (
                    f"Neo4j connection failed. For Aura, use the exact URI shown in the "
                    f"console (typically starts with neo4j+s://...). Error: {e}",
                    AppState(),
                )

        with log_step(logger, "Ensure database constraints"):
            neo4j.ensure_constraints()

        if clear_db:
            with log_step(logger, "Clear existing data"):
                neo4j.clear()

        # 1) Load PDF pages
        all_pages, raw_texts = self._load_pdf_pages(pdf_files)

        # 2) Structured extraction (high precision)
        projects_created = self._extract_structured_data(neo4j, raw_texts)

        # 3) Create chunks
        chunks = self._create_chunks(all_pages)

        # 4) LLM-based KG extraction (high recall)
        self._extract_llm_graph(neo4j, chunks)

        # 5) Vector index
        vector = self._create_vector_index(chunks, neo4j_config)

        # 6) Cypher QA chain
        qa_chain = self._create_qa_chain(neo4j)

        proj_lines = []
        for p in projects_created:
            warn = f" (warning: {p.get('warning')})" if "warning" in p else ""
            proj_lines.append(f"- {p.get('name')} [{p.get('projectId')}]{warn}")

        msg = (
            "Ingestion complete.\n\n"
            f"Neo4j database: `{neo4j_config.database}`\n\n"
            "Projects found:\n" + "\n".join(proj_lines)
        )

        logger.info(f"Ingestion complete: {len(projects_created)} project(s), {len(chunks)} chunks")

        return msg, AppState(
            neo4j=neo4j,
            vector=vector,
            qa_chain=qa_chain,
            llm=self.llm,
        )

    def ingest_with_progress(
        self,
        pdf_files: List[Any],
        neo4j_config: Optional[Neo4jConfig] = None,
        neo4j_uri: Optional[str] = None,
        neo4j_user: Optional[str] = None,
        neo4j_password: Optional[str] = None,
        neo4j_database: str = "neo4j",
        clear_db: bool = True,
        skip_llm_extraction: bool = True,  # Skip LLM extraction for faster ingestion
    ) -> Generator[Tuple[str, float, Optional[AppState]], None, None]:
        """Ingest PDFs with progress updates for UI.

        This generator yields progress updates during ingestion, allowing
        the UI to display a progress bar with status messages. Batch progress
        during LLM extraction is reported as batches actually finish.

        Args:
            pdf_files: List of uploaded files (handles or paths).
            neo4j_config: Neo4j configuration object (preferred).
            neo4j_uri: Neo4j connection URI (alternative).
            neo4j_user: Username (alternative).
            neo4j_password: Password (alternative).
            neo4j_database: Database name.
            clear_db: If True, clears existing data prior to ingestion.
            skip_llm_extraction: If True, skips LLM graph extraction for faster ingestion.

        Yields:
            Tuple of (status_message, progress_fraction, optional_state)
            - progress_fraction is 0.0 to 1.0
            - optional_state is None until final yield, then contains AppState

        Example:
            >>> for status, progress, state in builder.ingest_with_progress(files, config):
            ...     print(f"{progress*100:.0f}%: {status}")
            ...     if state:
            ...         print("Done!")
        """
        start_time = time.time()

        if not pdf_files:
            yield "❌ Please upload at least one PDF file.", 0.0, None
            return

        neo4j_config = self._resolve_neo4j_config(
            neo4j_config, neo4j_uri, neo4j_user, neo4j_password, neo4j_database
        )

        if not neo4j_config.is_valid():
            yield "❌ Please provide Neo4j connection details.", 0.0, None
            return

        # Step 1: Connect to Neo4j (5%)
        yield "🔌 Connecting to Neo4j...", 0.05, None
        try:
            neo4j = Neo4jService(
                uri=neo4j_config.uri,
                user=neo4j_config.username,
                password=neo4j_config.password,
                database=neo4j_config.database,
            )
        except Neo4jConnectionError as e:
            yield f"❌ Neo4j connection failed: {e}", 0.05, None
            return

        # Step 2: Ensure constraints (10%)
        yield "📋 Setting up database constraints...", 0.10, None
        neo4j.ensure_constraints()

        # Step 3: Clear database if requested (15%)
        if clear_db:
            yield "🗑️ Clearing existing data...", 0.15, None
            neo4j.clear()

        # Step 4: Load PDF pages (20% -> 25%)
        yield f"📄 Loading {len(pdf_files)} PDF file(s)...", 0.20, None
        all_pages, raw_texts = self._load_pdf_pages(pdf_files)
        yield f"📄 Loaded {len(all_pages)} pages from PDFs", 0.25, None

        # Step 5: Structured extraction (30% -> 35%)
        yield "🔍 Extracting structured project data...", 0.30, None
        projects_created = self._extract_structured_data(neo4j, raw_texts)
        project_names = [p.get('name', 'Unknown') for p in projects_created]
        yield f"✅ Found {len(projects_created)} project(s): {', '.join(project_names)}", 0.35, None

        # Step 6: Create chunks (40% -> 45%)
        yield "✂️ Creating document chunks...", 0.40, None
        chunks = self._create_chunks(all_pages)
        yield f"✅ Created {len(chunks)} chunks", 0.45, None

        # Step 7: LLM graph extraction, optional (50% -> 70%)
        if skip_llm_extraction:
            yield "⏩ Skipping LLM extraction (using fast mode)", 0.70, None
        else:
            yield f"🧠 Extracting entities with LLM ({len(chunks)} chunks)...", 0.50, None
            # Slowest step: progress advances only when a batch has finished.
            for done, total in self._iter_llm_extraction(neo4j, chunks):
                progress = 0.50 + (0.20 * done / total)
                yield f"🧠 LLM extraction: batch {done}/{total}...", progress, None
            yield "✅ LLM graph extraction complete", 0.70, None

        # Step 8: Create vector index (75% -> 90%)
        yield f"📊 Creating vector index ({len(chunks)} chunks)...", 0.75, None
        vector = self._create_vector_index(chunks, neo4j_config)
        yield "✅ Vector index created", 0.90, None

        # Step 9: Create QA chain (95%)
        yield "⚙️ Initializing QA chain...", 0.95, None
        qa_chain = self._create_qa_chain(neo4j)

        # Final step: Complete (100%)
        elapsed = time.time() - start_time
        proj_lines = []
        for p in projects_created:
            warn = f" ⚠️ {p.get('warning')}" if "warning" in p else ""
            proj_lines.append(f"- **{p.get('name')}** [{p.get('projectId')}]{warn}")

        final_msg = (
            f"## ✅ Ingestion Complete ({elapsed:.1f}s)\n\n"
            f"**Database:** `{neo4j_config.database}`\n\n"
            f"**Projects found:**\n" + "\n".join(proj_lines) + "\n\n"
            f"**Stats:** {len(chunks)} chunks indexed"
        )

        yield final_msg, 1.0, AppState(
            neo4j=neo4j,
            vector=vector,
            qa_chain=qa_chain,
            llm=self.llm,
        )
src/services/cache.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query result caching for improved performance.
3
+
4
+ Provides in-memory caching with TTL for query results,
5
+ reducing latency and API costs for repeated queries.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import threading
12
+ import time
13
+ from dataclasses import dataclass, field
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from langchain.schema import Document
17
+
18
+
19
@dataclass
class CacheEntry:
    """One cached value plus the bookkeeping needed for TTL expiry."""

    value: Any
    timestamp: float  # time.time() reading taken when the entry was stored
    ttl: float  # lifetime in seconds, measured from ``timestamp``
    hits: int = 0  # number of times access() has been called

    def is_expired(self) -> bool:
        """Report whether the entry has outlived its TTL.

        Returns:
            True if more than ``ttl`` seconds have passed since ``timestamp``.
        """
        age = time.time() - self.timestamp
        return age > self.ttl

    def access(self) -> Any:
        """Return the cached value, counting the access as a hit.

        Returns:
            Cached value.
        """
        self.hits = self.hits + 1
        return self.value
43
+
44
+
45
class QueryCache:
    """In-memory cache for query results with TTL.

    Entries are keyed by an MD5 digest of the normalized query plus an
    optional context identifier. All access to the underlying dict goes
    through a re-entrant lock.
    """

    def __init__(
        self,
        default_ttl: float = 3600,
        max_size: int = 1000,
    ) -> None:
        """Create an empty cache.

        Args:
            default_ttl: Lifetime in seconds for entries stored without an
                explicit TTL.
            max_size: Entry count at which eviction kicks in.
        """
        self._cache: Dict[str, CacheEntry] = {}
        # Re-entrant because set() -> _evict_if_needed() -> cleanup_expired()
        # re-acquires the lock on the same thread.
        self._lock = threading.RLock()
        self.default_ttl = default_ttl
        self.max_size = max_size
        self._total_hits = 0
        self._total_misses = 0

    def _make_key(self, query: str, context_hash: str = "") -> str:
        """Create cache key from query and context.

        The query is lower-cased and stripped so trivially different
        spellings share an entry.

        Args:
            query: Query string.
            context_hash: Optional context identifier.

        Returns:
            MD5 hex digest key (used for bucketing only, not for security).
        """
        combined = f"{query.lower().strip()}:{context_hash}"
        return hashlib.md5(combined.encode()).hexdigest()

    def _evict_if_needed(self) -> None:
        """Make room when the cache is at capacity.

        Expired entries are dropped first; if the cache is still full, the
        oldest 10% of entries (at least one) are removed by timestamp.
        """
        if len(self._cache) < self.max_size:
            return

        self.cleanup_expired()

        if len(self._cache) >= self.max_size:
            oldest_first = sorted(
                self._cache,
                key=lambda k: self._cache[k].timestamp,
            )
            to_remove = max(1, len(oldest_first) // 10)
            for key in oldest_first[:to_remove]:
                del self._cache[key]

    def get(
        self,
        query: str,
        context_hash: str = ""
    ) -> Optional[Any]:
        """Get cached result if exists and not expired.

        Updates the hit/miss counters; an expired entry is deleted and
        counted as a miss.

        Args:
            query: Query string.
            context_hash: Optional context identifier.

        Returns:
            Cached value or None if not found/expired.
        """
        key = self._make_key(query, context_hash)

        with self._lock:
            entry = self._cache.get(key)

            if entry is None:
                self._total_misses += 1
                return None

            if entry.is_expired():
                del self._cache[key]
                self._total_misses += 1
                return None

            self._total_hits += 1
            return entry.access()

    def set(
        self,
        query: str,
        context_hash: str,
        value: Any,
        ttl: Optional[float] = None,
    ) -> None:
        """Cache a result.

        Args:
            query: Query string.
            context_hash: Context identifier.
            value: Value to cache.
            ttl: Optional TTL override in seconds. ``None`` selects
                ``default_ttl``; an explicit ``0`` is honoured.
        """
        key = self._make_key(query, context_hash)
        # Compare against None so that a deliberate ttl of 0 is not replaced
        # by the default.
        effective_ttl = self.default_ttl if ttl is None else ttl

        with self._lock:
            # Overwriting an existing key does not grow the cache, so it must
            # not trigger eviction of unrelated entries.
            if key not in self._cache:
                self._evict_if_needed()
            self._cache[key] = CacheEntry(
                value=value,
                timestamp=time.time(),
                ttl=effective_ttl,
            )

    def invalidate(self, query: str, context_hash: str = "") -> bool:
        """Invalidate a specific cache entry.

        Args:
            query: Query string.
            context_hash: Context identifier.

        Returns:
            True if entry was found and removed.
        """
        key = self._make_key(query, context_hash)

        with self._lock:
            if key in self._cache:
                del self._cache[key]
                return True
            return False

    def invalidate_all(self) -> int:
        """Clear entire cache.

        Returns:
            Number of entries cleared.
        """
        with self._lock:
            count = len(self._cache)
            self._cache.clear()
            return count

    def cleanup_expired(self) -> int:
        """Remove expired entries.

        Returns:
            Number of entries removed.
        """
        with self._lock:
            expired_keys = [
                k for k, v in self._cache.items() if v.is_expired()
            ]
            for key in expired_keys:
                del self._cache[key]
            return len(expired_keys)

    def get_stats(self) -> Dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dictionary with size, max_size, total_hits, total_misses,
            hit_rate (rounded to 3 decimals; 0.0 before any lookup) and
            default_ttl.
        """
        with self._lock:
            total_requests = self._total_hits + self._total_misses
            hit_rate = (
                self._total_hits / total_requests
                if total_requests > 0
                else 0.0
            )

            return {
                "size": len(self._cache),
                "max_size": self.max_size,
                "total_hits": self._total_hits,
                "total_misses": self._total_misses,
                "hit_rate": round(hit_rate, 3),
                "default_ttl": self.default_ttl,
            }
213
+
214
+
215
class AnswerCache(QueryCache):
    """QueryCache specialization that stores complete GraphRAG answers.

    Each stored value is an AnswerEntry bundling the answer text with its
    supporting documents, an optional Cypher result and metadata.
    """

    @dataclass
    class AnswerEntry:
        """Cached answer with supporting documents."""

        answer: str
        documents: List[Document] = field(default_factory=list)
        cypher_result: str = ""
        metadata: Dict[str, Any] = field(default_factory=dict)

    def set_answer(
        self,
        query: str,
        answer: str,
        documents: Optional[List[Document]] = None,
        cypher_result: str = "",
        context_hash: str = "",
        ttl: Optional[float] = None,
    ) -> None:
        """Store an answer and its supporting material under ``query``.

        Args:
            query: User query.
            answer: Generated answer.
            documents: Supporting documents; a missing or empty value is
                stored as an empty list.
            cypher_result: Cypher query result if any.
            context_hash: Context identifier.
            ttl: Optional TTL override.
        """
        supporting = documents if documents else []
        stamped = {"cached_at": time.time()}
        payload = self.AnswerEntry(
            answer=answer,
            documents=supporting,
            cypher_result=cypher_result,
            metadata=stamped,
        )
        self.set(query, context_hash, payload, ttl)

    def get_answer(
        self,
        query: str,
        context_hash: str = ""
    ) -> Optional[AnswerEntry]:
        """Look up a previously cached answer.

        Args:
            query: User query.
            context_hash: Context identifier.

        Returns:
            The AnswerEntry, or None when nothing usable is cached (missing,
            expired, or a value that is not an AnswerEntry).
        """
        found = self.get(query, context_hash)
        return found if isinstance(found, self.AnswerEntry) else None
275
+
276
+
277
# Global cache instances.
# Both start as None and are assigned by get_query_cache() /
# get_answer_cache() on their first call; later calls return the same object.
_query_cache: Optional[QueryCache] = None
_answer_cache: Optional[AnswerCache] = None
280
+
281
+
282
def get_query_cache(
    default_ttl: float = 3600,
    max_size: int = 1000,
) -> QueryCache:
    """Return the module-wide QueryCache, creating it on first use.

    The arguments only take effect on the call that creates the instance;
    once it exists they are ignored.

    Args:
        default_ttl: Default TTL for entries.
        max_size: Maximum cache size.

    Returns:
        QueryCache singleton instance.
    """
    global _query_cache
    if _query_cache is not None:
        return _query_cache
    _query_cache = QueryCache(default_ttl=default_ttl, max_size=max_size)
    return _query_cache
299
+
300
+
301
def get_answer_cache(
    default_ttl: float = 3600,
    max_size: int = 500,
) -> AnswerCache:
    """Return the module-wide AnswerCache, creating it on first use.

    The arguments only take effect on the call that creates the instance;
    once it exists they are ignored.

    Args:
        default_ttl: Default TTL for entries.
        max_size: Maximum cache size.

    Returns:
        AnswerCache singleton instance.
    """
    global _answer_cache
    if _answer_cache is not None:
        return _answer_cache
    _answer_cache = AnswerCache(default_ttl=default_ttl, max_size=max_size)
    return _answer_cache
src/services/cypher_templates.py ADDED
@@ -0,0 +1,1332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pre-validated Cypher query templates for deterministic query routing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import hashlib
8
+ from dataclasses import dataclass, field
9
+ from enum import Enum
10
+ from typing import Any, Dict, List, Optional, Tuple
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ # =============================================================================
17
+ # LLM-BASED INTENT CLASSIFIER
18
+ # =============================================================================
19
+
20
class LLMIntentClassifier:
    """Classifies query intent using a lightweight LLM.

    Uses a small model from Together AI for intent classification.
    Handles synonyms naturally without hardcoding patterns.
    Caches results and falls back to pattern matching if LLM fails.
    """

    # Cheap, fast model for classification
    DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct-Turbo"

    # Classification prompt - designed to be concise for speed
    CLASSIFICATION_PROMPT = """Classify this query into exactly ONE category. For compound queries, pick the combined category.

Categories:
- TIMELINE_LOCATION: Questions about BOTH timeline/schedule AND location/place
- TIMELINE_BUDGET: Questions about BOTH timeline/schedule AND budget/cost
- BUDGET_LOCATION: Questions about BOTH cost/money AND location/place
- CONTACTS: Questions about project manager, owner, engineer, contractor, lead, head, E&C firm, personnel, who is responsible
- TIMELINE: Questions ONLY about schedule, dates, milestones, deadlines, duration, when things happen
- CHALLENGES: Questions about problems, risks, issues, obstacles, delays, failures, difficulties, constraints
- BUDGET: Questions ONLY about cost, money, investment, funding, expenses, price, TIV, financial aspects, spend
- LOCATION: Questions ONLY about where, place, site, city, country, address, geography, region
- TECHNICAL: Questions about capacity, scope, technical details, specifications, requirements, fuel type, labor
- COMPARISON: Generic comparison of ALL aspects of projects (budget, timeline, location, challenges, contacts)
- STATUS: Questions about current state, progress, whether active/cancelled, probability
- OVERVIEW: Questions asking for summary, description, general information, tell me about
- GENERAL: Questions that don't fit above categories or need detailed analysis

Query: "{query}"

Respond with ONLY the category name, nothing else."""

    # Known category names, longest first so that e.g. "BUDGET_LOCATION" is
    # recognised before its substring "BUDGET". A tuple (not a set) keeps the
    # order among equal-length names the same on every run.
    VALID_INTENTS: Tuple[str, ...] = (
        "TIMELINE_LOCATION",
        "BUDGET_LOCATION",
        "TIMELINE_BUDGET",
        "CHALLENGES",
        "COMPARISON",
        "TECHNICAL",
        "CONTACTS",
        "LOCATION",
        "OVERVIEW",
        "TIMELINE",
        "GENERAL",
        "BUDGET",
        "STATUS",
    )

    # Keyword sets used by the pattern fallback (all lowercase).
    _TIMELINE_WORDS: Tuple[str, ...] = (
        "timeline", "schedule", "milestone", "deadline", "when", "date",
        "duration", "start", "finish", "complete", "begin", "end",
    )
    _BUDGET_WORDS: Tuple[str, ...] = (
        "budget", "cost", "investment", "money", "spend", "fund", "price",
        "expense", "tiv", "financial", "dollar", "amount", "funding",
    )
    _LOCATION_WORDS: Tuple[str, ...] = (
        "location", "where", "site", "city", "country", "place", "address",
        "region", "state", "area", "geography", "situated",
    )
    _CHALLENGE_WORDS: Tuple[str, ...] = (
        "challenge", "risk", "issue", "problem", "obstacle", "delay",
        "difficult", "constraint", "failure", "cancelled", "cancel",
    )
    _CONTACT_WORDS: Tuple[str, ...] = (
        "manager", "owner", "engineer", "contractor", "lead", "head",
        "contact", "personnel", "responsible", "e&c", "firm", "who",
    )
    _TECHNICAL_WORDS: Tuple[str, ...] = (
        "capacity", "scope", "technical", "specification", "requirement",
        "fuel", "labor", "megawatt", "mw", "barrel", "bbl", "unit",
    )
    _COMPARISON_WORDS: Tuple[str, ...] = (
        "compare", "comparison", "versus", "vs", "differ", "difference",
    )
    _STATUS_WORDS: Tuple[str, ...] = (
        "status", "progress", "state", "active", "probability",
    )
    _OVERVIEW_WORDS: Tuple[str, ...] = (
        "overview", "summary", "describe", "explain", "tell me", "about",
    )

    def __init__(
        self,
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        use_cache: bool = True,
        fallback_to_patterns: bool = True,
    ):
        """Initialize LLM intent classifier.

        Args:
            model: Together AI model ID. Defaults to Llama-3.2-3B.
            api_key: Together AI API key. Uses env var if not provided.
            use_cache: Whether to cache classification results.
            fallback_to_patterns: Whether to use pattern matching as fallback.
        """
        self.model = model or self.DEFAULT_MODEL
        self.api_key = api_key or os.environ.get("TOGETHER_API_KEY")
        self.use_cache = use_cache
        self.fallback_to_patterns = fallback_to_patterns
        self._cache: Dict[str, str] = {}
        self._client = None

    def _get_client(self):
        """Lazy-load Together AI client.

        Returns:
            The client, or None when the ``together`` package is missing or
            the client could not be constructed (a warning is logged).
        """
        if self._client is None:
            try:
                from together import Together
                self._client = Together(api_key=self.api_key)
            except ImportError:
                logger.warning("together package not installed")
                return None
            except Exception as e:
                logger.warning("Failed to initialize Together client: %s", e)
                return None
        return self._client

    def _cache_key(self, query: str) -> str:
        """Generate cache key for query (case- and edge-whitespace-insensitive)."""
        return hashlib.md5(query.lower().strip().encode()).hexdigest()

    @classmethod
    def _normalize_intent(cls, raw: str) -> str:
        """Map a raw model reply onto a known category name.

        The reply is upper-cased and searched for each name in
        ``VALID_INTENTS`` (longest first), so extra words or punctuation
        around the category are tolerated.

        Args:
            raw: Text returned by the model.

        Returns:
            The matching category name, or "GENERAL" when none is found.
        """
        reply = raw.strip().upper()
        for name in cls.VALID_INTENTS:
            if name in reply:
                return name
        return "GENERAL"

    @staticmethod
    def _has_keyword(text: str, keywords: Tuple[str, ...]) -> bool:
        """Return True when any keyword occurs in *text* at the start of a word.

        A keyword only counts when it is not preceded by a letter, so "end"
        no longer fires inside "spend", "tiv" inside "active" or "city"
        inside "capacity". Suffixes ("milestones", "delayed") and a leading
        digit ("500mw") still match. *text* must already be lowercase.
        """
        return any(
            re.search(r"(?<![a-z])" + re.escape(word), text)
            for word in keywords
        )

    def classify(self, query: str) -> str:
        """Classify query intent using LLM.

        Order of attempts: cache (if enabled), the LLM, then the pattern
        fallback (if enabled). Only LLM results are written to the cache.

        Args:
            query: User query string

        Returns:
            Intent category name (e.g., "TIMELINE", "BUDGET")
        """
        # Check cache first
        cache_key = self._cache_key(query) if self.use_cache else None
        if cache_key is not None and cache_key in self._cache:
            cached = self._cache[cache_key]
            logger.debug("Intent cache hit: %s", cached)
            return cached

        # Try LLM classification
        client = self._get_client()
        if client:
            try:
                response = client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "user", "content": self.CLASSIFICATION_PROMPT.format(query=query)}
                    ],
                    max_tokens=20,  # Only need category name
                    temperature=0,  # Deterministic
                )
                # A missing/None content raises here and is handled below,
                # sending the query to the pattern fallback.
                intent = self._normalize_intent(response.choices[0].message.content)
            except Exception as e:
                # Best-effort boundary around the remote call: any failure
                # degrades to the fallback instead of breaking the request.
                logger.warning("LLM classification failed: %s", e)
            else:
                if cache_key is not None:
                    self._cache[cache_key] = intent
                logger.info("LLM classified query as: %s", intent)
                return intent

        # Fallback to pattern matching
        if self.fallback_to_patterns:
            return self._pattern_fallback(query)

        return "GENERAL"

    def _pattern_fallback(self, query: str) -> str:
        """Simple pattern-based fallback if LLM fails.

        Args:
            query: User query string

        Returns:
            Intent category name; "GENERAL" when no keyword matches.
        """
        q = query.lower()

        has_timeline = self._has_keyword(q, self._TIMELINE_WORDS)
        has_budget = self._has_keyword(q, self._BUDGET_WORDS)
        has_location = self._has_keyword(q, self._LOCATION_WORDS)

        # Check for compound intents first (most specific)
        if has_timeline and has_location:
            return "TIMELINE_LOCATION"
        if has_timeline and has_budget:
            return "TIMELINE_BUDGET"
        if has_budget and has_location:
            return "BUDGET_LOCATION"

        # Single intents - prioritize more specific ones
        if self._has_keyword(q, self._CONTACT_WORDS):
            return "CONTACTS"
        if self._has_keyword(q, self._TECHNICAL_WORDS):
            return "TECHNICAL"
        if has_timeline:
            return "TIMELINE"
        if self._has_keyword(q, self._CHALLENGE_WORDS):
            return "CHALLENGES"
        if has_budget:
            return "BUDGET"
        if has_location:
            return "LOCATION"

        # Generic intents
        if self._has_keyword(q, self._COMPARISON_WORDS):
            return "COMPARISON"
        if self._has_keyword(q, self._STATUS_WORDS):
            return "STATUS"
        if self._has_keyword(q, self._OVERVIEW_WORDS):
            return "OVERVIEW"

        return "GENERAL"

    def clear_cache(self) -> int:
        """Clear the classification cache.

        Returns:
            Number of entries that were removed.
        """
        count = len(self._cache)
        self._cache.clear()
        return count
226
+
227
+
228
class QueryIntent(Enum):
    """Detected query intents for template routing.

    Each member's value is a lowercase label. Note that PROJECT_OVERVIEW and
    PROJECT_STATUS carry the shorter labels "overview" and "status".
    """
    BUDGET = "budget"
    LOCATION = "location"
    BUDGET_LOCATION = "budget_location"  # Combined: budget + location
    TIMELINE = "timeline"
    TIMELINE_LOCATION = "timeline_location"  # Combined: timeline + location
    TIMELINE_BUDGET = "timeline_budget"  # Combined: timeline + budget
    CHALLENGES = "challenges"
    CONTACTS = "contacts"  # Project manager, owner, engineer
    TECHNICAL = "technical"  # Capacity, scope, specifications
    COMPARISON = "comparison"  # Full comparison with all data
    PROJECT_OVERVIEW = "overview"
    PROJECT_STATUS = "status"
    GENERAL = "general"  # Requires RAG fallback
243
+
244
+
245
@dataclass
class CypherTemplate:
    """Pre-validated Cypher query template.

    Attributes:
        intent: The query intent this template handles
        cypher: The Cypher query string
        description: Human-readable description
        required_params: List of required parameter names (if any)
    """
    intent: QueryIntent
    cypher: str
    description: str
    required_params: List[str] = field(default_factory=list)

    def execute(self, graph: Any, params: Optional[Dict[str, Any]] = None) -> List[Dict]:
        """Execute template against the graph.

        This method never raises: a missing required parameter or any error
        from ``graph.query`` is logged as a warning and reported as an empty
        result list.

        Args:
            graph: Neo4j graph instance (LangChain Neo4jGraph)
            params: Optional query parameters

        Returns:
            List of result dictionaries; empty when a required parameter is
            missing or the query fails.
        """
        bound = params or {}

        # Enforce the declared contract up front instead of sending a query
        # that cannot succeed to the database.
        missing = [name for name in self.required_params if name not in bound]
        if missing:
            logger.warning(
                "Template execution skipped, missing required params: %s",
                ", ".join(missing),
            )
            return []

        try:
            return graph.query(self.cypher, bound)
        except Exception as e:
            logger.warning("Template execution failed: %s", e)
            return []
275
+
276
+
277
class CypherTemplateRouter:
    """Routes queries to pre-validated Cypher templates.

    This eliminates LLM Cypher generation for ~70-80% of queries,
    providing deterministic, fast, and reliable results.

    Example:
        >>> router = CypherTemplateRouter()
        >>> results, intent = router.route_query("What is the budget?", graph)
        >>> if results is not None:
        ...     print(f"Used template for {intent.value}")
    """

    # =====================================================================
    # PRE-VALIDATED CYPHER TEMPLATES
    # =====================================================================
    # These queries have been tested against the actual graph schema and
    # are guaranteed to work correctly.

    TEMPLATES = {
        QueryIntent.BUDGET_LOCATION: CypherTemplate(
            intent=QueryIntent.BUDGET_LOCATION,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       b.amount AS budget,
                       b.currency AS currency,
                       l.address AS address,
                       l.city AS city,
                       l.state AS state,
                       l.postal AS postal,
                       l.country AS country,
                       l.zoneCounty AS zoneCounty
                ORDER BY p.name
            """,
            description="Get budget (TIV) and location for all projects",
        ),

        QueryIntent.BUDGET: CypherTemplate(
            intent=QueryIntent.BUDGET,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       b.amount AS budget,
                       b.currency AS currency,
                       b.kind AS budgetType
                ORDER BY b.amount DESC
            """,
            description="Get budget/investment information for all projects",
        ),

        QueryIntent.LOCATION: CypherTemplate(
            intent=QueryIntent.LOCATION,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       l.address AS address,
                       l.city AS city,
                       l.state AS state,
                       l.postal AS postal,
                       l.country AS country,
                       l.zoneCounty AS zone
                ORDER BY p.name
            """,
            description="Get location information for all projects",
        ),

        QueryIntent.TIMELINE: CypherTemplate(
            intent=QueryIntent.TIMELINE,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                WITH p, m
                ORDER BY p.name, m.dateText
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       collect({
                           name: m.name,
                           date: m.dateText,
                           detail: m.sentence
                       }) AS milestones
                ORDER BY p.name
            """,
            description="Get timeline and milestones for all projects",
        ),

        QueryIntent.CHALLENGES: CypherTemplate(
            intent=QueryIntent.CHALLENGES,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_CHALLENGE]->(c:Challenge)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       collect(DISTINCT c.text) AS challenges
                ORDER BY p.name
            """,
            description="Get challenges, constraints, and risks for all projects",
        ),

        QueryIntent.TIMELINE_LOCATION: CypherTemplate(
            intent=QueryIntent.TIMELINE_LOCATION,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                WITH p, l, m
                ORDER BY p.name, m.dateText
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       l.city AS city,
                       l.state AS state,
                       l.country AS country,
                       l.address AS address,
                       collect({
                           name: m.name,
                           date: m.dateText,
                           detail: m.sentence
                       }) AS milestones
                ORDER BY p.name
            """,
            description="Get timeline milestones AND location for all projects",
        ),

        QueryIntent.TIMELINE_BUDGET: CypherTemplate(
            intent=QueryIntent.TIMELINE_BUDGET,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                WITH p, b, m
                ORDER BY p.name, m.dateText
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       b.amount AS budget,
                       b.currency AS currency,
                       collect({
                           name: m.name,
                           date: m.dateText,
                           detail: m.sentence
                       }) AS milestones
                ORDER BY p.name
            """,
            description="Get timeline milestones AND budget for all projects",
        ),

        QueryIntent.CONTACTS: CypherTemplate(
            intent=QueryIntent.CONTACTS,
            cypher="""
                MATCH (p:Project)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.projectManager AS projectManager,
                       p.projectManagerCompany AS projectManagerCompany,
                       p.projectManagerTitle AS projectManagerTitle,
                       p.projectManagerEmail AS projectManagerEmail,
                       p.projectManagerPhone AS projectManagerPhone,
                       p.plantOwner AS plantOwner,
                       p.plantParent AS plantParent,
                       p.plantName AS plantName,
                       p.engineerCompany AS engineerCompany,
                       p.ecFirm AS ecFirm,
                       p.phone AS phone
                ORDER BY p.name
            """,
            description="Get project manager, owner, engineer, and contact information",
        ),

        QueryIntent.TECHNICAL: CypherTemplate(
            intent=QueryIntent.TECHNICAL,
            cypher="""
                MATCH (p:Project)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.industryCode AS industryCode,
                       p.projectType AS projectType,
                       p.sector AS sector,
                       p.sicCode AS sicCode,
                       p.sicProduct AS sicProduct,
                       p.pecTiming AS pecTiming,
                       p.pecActivity AS pecActivity,
                       p.projectCapacity AS projectCapacity,
                       p.scopeText AS scopeText,
                       p.environmental AS environmental,
                       p.constructionLabor AS constructionLabor,
                       p.operationsLabor AS operationsLabor,
                       p.fuelType AS fuelType,
                       p.unitName AS unitName
                ORDER BY p.name
            """,
            description="Get technical details including capacity, scope, and specifications",
        ),

        QueryIntent.COMPARISON: CypherTemplate(
            intent=QueryIntent.COMPARISON,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                OPTIONAL MATCH (p)-[:HAS_MILESTONE]->(m:Milestone)
                OPTIONAL MATCH (p)-[:HAS_CHALLENGE]->(c:Challenge)
                WITH p, b, l, m, c
                ORDER BY p.name, m.dateText
                WITH p, b, l,
                     collect(DISTINCT {name: m.name, date: m.dateText}) AS milestones,
                     collect(DISTINCT c.text) AS challenges
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       p.projectProbability AS projectProbability,
                       p.projectManager AS projectManager,
                       p.projectManagerCompany AS projectManagerCompany,
                       p.projectManagerTitle AS projectManagerTitle,
                       p.plantOwner AS plantOwner,
                       p.plantParent AS plantParent,
                       p.plantName AS plantName,
                       p.engineerCompany AS engineerCompany,
                       p.ecFirm AS ecFirm,
                       p.industryCode AS industryCode,
                       p.projectType AS projectType,
                       p.sector AS sector,
                       p.sicCode AS sicCode,
                       p.pecTiming AS pecTiming,
                       p.pecActivity AS pecActivity,
                       p.projectCapacity AS projectCapacity,
                       p.scopeText AS scopeText,
                       b.amount AS budget,
                       b.currency AS currency,
                       l.city AS city,
                       l.state AS state,
                       l.country AS country,
                       l.address AS address,
                       milestones,
                       challenges
                ORDER BY b.amount DESC
            """,
            description="Compare all projects with full details (budget, location, timeline, challenges, contacts, technical)",
        ),

        QueryIntent.PROJECT_OVERVIEW: CypherTemplate(
            intent=QueryIntent.PROJECT_OVERVIEW,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                OPTIONAL MATCH (p)-[:HAS_REPORT]->(r:Report)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       p.projectProbability AS projectProbability,
                       p.projectManager AS projectManager,
                       p.projectManagerCompany AS projectManagerCompany,
                       p.projectManagerTitle AS projectManagerTitle,
                       p.plantOwner AS plantOwner,
                       p.plantParent AS plantParent,
                       p.plantName AS plantName,
                       p.engineerCompany AS engineerCompany,
                       p.ecFirm AS ecFirm,
                       p.industryCode AS industryCode,
                       p.projectType AS projectType,
                       p.sector AS sector,
                       p.sicCode AS sicCode,
                       p.pecTiming AS pecTiming,
                       p.pecActivity AS pecActivity,
                       p.projectCapacity AS projectCapacity,
                       p.constructionLabor AS constructionLabor,
                       p.operationsLabor AS operationsLabor,
                       p.fuelType AS fuelType,
                       p.unitName AS unitName,
                       b.amount AS budget,
                       b.currency AS currency,
                       l.city AS city,
                       l.state AS state,
                       l.country AS country,
                       l.address AS address,
                       r.lastUpdate AS lastUpdate,
                       r.initialRelease AS initialRelease
                ORDER BY p.name
            """,
            description="Get comprehensive overview of all projects with all attributes",
        ),

        QueryIntent.PROJECT_STATUS: CypherTemplate(
            intent=QueryIntent.PROJECT_STATUS,
            cypher="""
                MATCH (p:Project)
                OPTIONAL MATCH (p)-[:HAS_REPORT]->(r:Report)
                RETURN p.name AS project,
                       p.projectId AS projectId,
                       p.status AS status,
                       p.statusReason AS statusReason,
                       r.lastUpdate AS lastUpdate
                ORDER BY p.name
            """,
            description="Get project status information",
        ),
    }

    # Classifier label -> QueryIntent. Labels are the category names used in
    # the classification prompt; STATUS/OVERVIEW map to the PROJECT_* members.
    _INTENT_BY_LABEL = {
        "BUDGET_LOCATION": QueryIntent.BUDGET_LOCATION,
        "TIMELINE_LOCATION": QueryIntent.TIMELINE_LOCATION,
        "TIMELINE_BUDGET": QueryIntent.TIMELINE_BUDGET,
        "TIMELINE": QueryIntent.TIMELINE,
        "CHALLENGES": QueryIntent.CHALLENGES,
        "CONTACTS": QueryIntent.CONTACTS,
        "TECHNICAL": QueryIntent.TECHNICAL,
        "BUDGET": QueryIntent.BUDGET,
        "LOCATION": QueryIntent.LOCATION,
        "COMPARISON": QueryIntent.COMPARISON,
        "STATUS": QueryIntent.PROJECT_STATUS,
        "OVERVIEW": QueryIntent.PROJECT_OVERVIEW,
        "GENERAL": QueryIntent.GENERAL,
    }

    # Single-intent keyword rules for the no-LLM matcher, in priority order.
    _SINGLE_INTENT_KEYWORDS = (
        ("TIMELINE", ("timeline", "schedule", "milestone", "deadline", "when", "duration")),
        ("CHALLENGES", ("challenge", "risk", "issue", "problem", "obstacle", "delay")),
        ("BUDGET", ("budget", "cost", "investment", "money", "spend", "fund", "price")),
        ("LOCATION", ("location", "where", "site", "city", "country", "place")),
        ("COMPARISON", ("compare", "comparison", "versus", "differ")),
        ("STATUS", ("status", "progress", "state")),
        ("OVERVIEW", ("overview", "summary", "describe", "explain")),
    )

    def __init__(self, use_llm: bool = True) -> None:
        """Initialize the template router.

        Args:
            use_llm: If True, uses LLM for intent classification (handles synonyms).
                     If False, uses simple pattern matching (faster but limited).
        """
        self.use_llm = use_llm
        self._llm_classifier: Optional[LLMIntentClassifier] = None

    def _get_classifier(self) -> LLMIntentClassifier:
        """Lazy-load the LLM classifier."""
        if self._llm_classifier is None:
            self._llm_classifier = LLMIntentClassifier(
                use_cache=True,
                fallback_to_patterns=True,
            )
        return self._llm_classifier

    def classify_intent(self, query: str) -> QueryIntent:
        """Classify query intent using LLM or pattern matching.

        Args:
            query: User query string

        Returns:
            Detected QueryIntent; GENERAL for any unrecognised label.
        """
        if self.use_llm:
            intent_str = self._get_classifier().classify(query)
        else:
            # Fallback to simple pattern matching
            intent_str = self._simple_pattern_match(query)

        return self._INTENT_BY_LABEL.get(intent_str, QueryIntent.GENERAL)

    def _simple_pattern_match(self, query: str) -> str:
        """Simple pattern matching fallback (no LLM).

        A keyword only counts when it is not preceded by a letter, so that
        e.g. "city" does not fire inside "capacity" or "site" inside
        "website". Suffixes ("milestones", "costs") still match.

        Args:
            query: User query string

        Returns:
            Category label; "GENERAL" when no keyword matches.
        """
        q = query.lower()

        def mentions(words) -> bool:
            return any(
                re.search(r"(?<![a-z])" + re.escape(w), q) for w in words
            )

        # Check for combined intents first
        if mentions(("budget", "cost", "money")) and mentions(("location", "where", "site")):
            return "BUDGET_LOCATION"

        # Single intents - check domain keywords in priority order
        for label, words in self._SINGLE_INTENT_KEYWORDS:
            if mentions(words):
                return label

        return "GENERAL"

    def get_template(self, intent: QueryIntent) -> Optional[CypherTemplate]:
        """Get template for a given intent.

        Args:
            intent: Query intent

        Returns:
            CypherTemplate or None if no template for intent
        """
        return self.TEMPLATES.get(intent)

    def route_query(
        self,
        query: str,
        graph: Any,
    ) -> Tuple[Optional[List[Dict]], QueryIntent]:
        """Route query to template or indicate fallback needed.

        Args:
            query: User query string
            graph: Neo4j graph instance

        Returns:
            Tuple of (results or None, detected intent).
            Results is None if the intent is GENERAL, no template exists for
            it, or template execution failed. Results is an empty list when
            the template ran but matched nothing.
        """
        intent = self.classify_intent(query)
        logger.info("Query classified as: %s", intent.value)

        if intent == QueryIntent.GENERAL:
            return None, intent

        template = self.get_template(intent)
        if template is None:
            logger.warning("No template found for intent: %s", intent.value)
            return None, intent

        if template.required_params:
            # The router has no parameter values to supply, so such a
            # template cannot be run from here; let the caller fall back.
            logger.warning(
                "Template for %s requires params %s; cannot route",
                intent.value,
                template.required_params,
            )
            return None, intent

        # Query the graph directly: CypherTemplate.execute() converts every
        # error into an empty list, which would make a failed query
        # indistinguishable from "no data" and defeat the None contract.
        try:
            results = graph.query(template.cypher, {})
        except Exception as e:
            logger.warning("Template execution error: %s", e)
            return None, intent

        if results:
            logger.info("Template returned %d results", len(results))
            return results, intent

        logger.warning("Template returned empty results")
        return [], intent

    def get_all_intents(self) -> List[QueryIntent]:
        """Get list of all supported intents (excluding GENERAL)."""
        return [intent for intent in QueryIntent if intent != QueryIntent.GENERAL]

    def get_template_description(self, intent: QueryIntent) -> str:
        """Get human-readable description of what a template does."""
        template = self.get_template(intent)
        if template:
            return template.description
        return f"No template available for {intent.value}"
731
+
732
+
733
+ # =========================================================================
734
+ # RESULT FORMATTERS
735
+ # =========================================================================
736
+ # These functions format Cypher results into human-readable markdown
737
+ # without requiring LLM synthesis.
738
+
739
+ class TemplateResultFormatter:
740
+ """Formats template results into markdown without LLM."""
741
+
742
+ # Standard message for missing information
743
+ NOT_FOUND_MSG = "I couldn't find this information in the provided documents."
744
+
745
+ @staticmethod
746
+ def format_budget(results: List[Dict]) -> str:
747
+ """Format budget results."""
748
+ if not results:
749
+ return "I couldn't find any budget information in the provided documents."
750
+
751
+ lines = ["## Budget Information\n"]
752
+ for r in results:
753
+ project = r.get('project') or 'Unknown Project'
754
+ budget = r.get('budget')
755
+ currency = r.get('currency') or ''
756
+ status = r.get('status') or ''
757
+
758
+ if budget is not None:
759
+ if isinstance(budget, (int, float)):
760
+ budget_str = f"{budget:,.0f} {currency}".strip()
761
+ else:
762
+ budget_str = f"{budget} {currency}".strip()
763
+ else:
764
+ budget_str = "Not available"
765
+
766
+ status_str = f" ({status})" if status else ""
767
+ lines.append(f"- **{project}**{status_str}: {budget_str}")
768
+
769
+ return "\n".join(lines)
770
+
771
+ @staticmethod
772
+ def format_location(results: List[Dict]) -> str:
773
+ """Format location results."""
774
+ if not results:
775
+ return "I couldn't find any location information in the provided documents."
776
+
777
+ lines = ["## Location Information\n"]
778
+ for r in results:
779
+ project = r.get('project') or 'Unknown Project'
780
+ loc_parts = [
781
+ r.get('address'),
782
+ r.get('city'),
783
+ r.get('state'),
784
+ r.get('country'),
785
+ ]
786
+ loc = ", ".join([p for p in loc_parts if p]) or "Not available"
787
+ lines.append(f"- **{project}**: {loc}")
788
+
789
+ return "\n".join(lines)
790
+
791
+ @staticmethod
792
+ def format_budget_location(results: List[Dict]) -> str:
793
+ """Format combined budget and location results."""
794
+ if not results:
795
+ return "I couldn't find any budget or location information in the provided documents."
796
+
797
+ lines = ["## Budget Allocation and Location\n"]
798
+ for r in results:
799
+ project = r.get('project') or 'Unknown Project'
800
+ status = r.get('status') or ''
801
+
802
+ # Format budget
803
+ budget = r.get('budget')
804
+ currency = r.get('currency') or ''
805
+ if budget is not None:
806
+ if isinstance(budget, (int, float)):
807
+ budget_str = f"{budget:,.0f} {currency}".strip()
808
+ else:
809
+ budget_str = f"{budget} {currency}".strip()
810
+ else:
811
+ budget_str = "Not available"
812
+
813
+ # Format location
814
+ loc_parts = [r.get('city'), r.get('state'), r.get('country')]
815
+ loc = ", ".join([p for p in loc_parts if p]) or "Not available"
816
+
817
+ status_str = f" *({status})*" if status else ""
818
+ lines.append(f"\n### {project}{status_str}")
819
+ lines.append(f"- **Budget (TIV)**: {budget_str}")
820
+ lines.append(f"- **Location**: {loc}")
821
+
822
+ if r.get('address'):
823
+ lines.append(f"- **Address**: {r['address']}")
824
+ if r.get('zoneCounty'):
825
+ lines.append(f"- **Zone/County**: {r['zoneCounty']}")
826
+
827
+ return "\n".join(lines)
828
+
829
+ @staticmethod
830
+ def format_timeline(results: List[Dict]) -> str:
831
+ """Format timeline/milestone results."""
832
+ if not results:
833
+ return "I couldn't find any timeline information in the provided documents."
834
+
835
+ lines = ["## Project Timelines\n"]
836
+ for r in results:
837
+ project = r.get('project') or 'Unknown Project'
838
+ status = r.get('status') or ''
839
+ milestones = r.get('milestones') or []
840
+
841
+ status_str = f" *({status})*" if status else ""
842
+ lines.append(f"\n### {project}{status_str}")
843
+
844
+ # Filter out null milestones
845
+ valid_milestones = [
846
+ m for m in milestones
847
+ if m and (m.get('name') or m.get('date'))
848
+ ]
849
+
850
+ if not valid_milestones:
851
+ lines.append("- No milestones recorded")
852
+ else:
853
+ for m in valid_milestones[:12]: # Limit display
854
+ name = m.get('name') or 'Milestone'
855
+ date = m.get('date') or ''
856
+ detail = m.get('detail') or ''
857
+
858
+ if date:
859
+ lines.append(f"- **{name}**: {date}")
860
+ elif detail:
861
+ lines.append(f"- **{name}**: {detail[:100]}...")
862
+ else:
863
+ lines.append(f"- {name}")
864
+
865
+ return "\n".join(lines)
866
+
867
+ @staticmethod
868
+ def format_challenges(results: List[Dict]) -> str:
869
+ """Format challenges results."""
870
+ if not results:
871
+ return "I couldn't find any challenge or risk information in the provided documents."
872
+
873
+ lines = ["## Project Challenges and Constraints\n"]
874
+ for r in results:
875
+ project = r.get('project') or 'Unknown Project'
876
+ status = r.get('status') or ''
877
+ status_reason = r.get('statusReason') or ''
878
+ challenges = r.get('challenges') or []
879
+
880
+ lines.append(f"\n### {project}")
881
+
882
+ if status:
883
+ lines.append(f"**Status**: {status}")
884
+ if status_reason:
885
+ lines.append(f"**Status Reason**: {status_reason}")
886
+
887
+ # Filter out None/empty challenges
888
+ valid_challenges = [c for c in challenges if c]
889
+
890
+ if valid_challenges:
891
+ lines.append("\n**Identified Challenges:**")
892
+ for ch in valid_challenges[:10]:
893
+ lines.append(f"- {ch}")
894
+ elif status_reason:
895
+ lines.append("\n*Challenges inferred from status reason above.*")
896
+ else:
897
+ lines.append("- No specific challenges recorded")
898
+
899
+ return "\n".join(lines)
900
+
901
+ @staticmethod
902
+ def format_contacts(results: List[Dict]) -> str:
903
+ """Format contact/personnel information results."""
904
+ if not results:
905
+ return "I couldn't find any contact or personnel information in the provided documents."
906
+
907
+ lines = ["## Project Contacts and Personnel\n"]
908
+
909
+ for r in results:
910
+ project = r.get('project') or 'Unknown Project'
911
+ lines.append(f"\n### {project}")
912
+
913
+ has_any_contact = False
914
+
915
+ # Project Manager
916
+ pm_name = r.get('projectManager')
917
+ if pm_name:
918
+ has_any_contact = True
919
+ pm_info = pm_name
920
+ if r.get('projectManagerTitle'):
921
+ pm_info += f", {r['projectManagerTitle']}"
922
+ if r.get('projectManagerCompany'):
923
+ pm_info += f" ({r['projectManagerCompany']})"
924
+ lines.append(f"- **Project Manager**: {pm_info}")
925
+ if r.get('projectManagerEmail'):
926
+ lines.append(f" - Email: {r['projectManagerEmail']}")
927
+ if r.get('projectManagerPhone'):
928
+ lines.append(f" - Phone: {r['projectManagerPhone']}")
929
+
930
+ # Owner
931
+ plant_owner = r.get('plantOwner')
932
+ if plant_owner:
933
+ has_any_contact = True
934
+ owner_info = plant_owner
935
+ if r.get('plantParent'):
936
+ owner_info += f" (Parent: {r['plantParent']})"
937
+ lines.append(f"- **Owner**: {owner_info}")
938
+ if r.get('plantName'):
939
+ lines.append(f" - Plant/Facility: {r['plantName']}")
940
+
941
+ # Engineer
942
+ if r.get('engineerCompany'):
943
+ has_any_contact = True
944
+ lines.append(f"- **Engineer**: {r['engineerCompany']}")
945
+
946
+ # E&C Firm
947
+ if r.get('ecFirm'):
948
+ has_any_contact = True
949
+ lines.append(f"- **E&C Firm**: {r['ecFirm']}")
950
+
951
+ # General phone
952
+ if r.get('phone'):
953
+ has_any_contact = True
954
+ lines.append(f"- **Phone**: {r['phone']}")
955
+
956
+ if not has_any_contact:
957
+ lines.append("- No contact information available")
958
+
959
+ return "\n".join(lines)
960
+
961
+ @staticmethod
962
+ def format_technical(results: List[Dict]) -> str:
963
+ """Format technical details and specifications results."""
964
+ if not results:
965
+ return "I couldn't find any technical specifications in the provided documents."
966
+
967
+ lines = ["## Technical Details and Specifications\n"]
968
+
969
+ for r in results:
970
+ project = r.get('project') or 'Unknown Project'
971
+ lines.append(f"\n### {project}")
972
+
973
+ has_any_technical = False
974
+
975
+ # Classification
976
+ if r.get('industryCode') or r.get('projectType') or r.get('sector'):
977
+ has_any_technical = True
978
+ lines.append("- **Classification**:")
979
+ if r.get('industryCode'):
980
+ lines.append(f" - Industry: {r['industryCode']}")
981
+ if r.get('projectType'):
982
+ lines.append(f" - Type: {r['projectType']}")
983
+ if r.get('sector'):
984
+ lines.append(f" - Sector: {r['sector']}")
985
+ if r.get('sicCode'):
986
+ lines.append(f" - SIC Code: {r['sicCode']}")
987
+ if r.get('sicProduct'):
988
+ lines.append(f" - SIC Product: {r['sicProduct']}")
989
+
990
+ # PEC Stage
991
+ if r.get('pecTiming') or r.get('pecActivity'):
992
+ has_any_technical = True
993
+ pec = f"{r.get('pecTiming', '')} - {r.get('pecActivity', '')}".strip(' -')
994
+ if pec:
995
+ lines.append(f"- **PEC Stage**: {pec}")
996
+
997
+ # Capacity
998
+ if r.get('projectCapacity'):
999
+ has_any_technical = True
1000
+ lines.append(f"- **Project Capacity**: {r['projectCapacity']}")
1001
+
1002
+ # Scope
1003
+ if r.get('scopeText'):
1004
+ has_any_technical = True
1005
+ scope = r['scopeText']
1006
+ if len(scope) > 300:
1007
+ scope = scope[:300] + "..."
1008
+ lines.append(f"- **Scope**: {scope}")
1009
+
1010
+ # Environmental
1011
+ if r.get('environmental'):
1012
+ has_any_technical = True
1013
+ lines.append(f"- **Environmental**: {r['environmental']}")
1014
+
1015
+ # Labor
1016
+ if r.get('constructionLabor') or r.get('operationsLabor'):
1017
+ has_any_technical = True
1018
+ labor_parts = []
1019
+ if r.get('constructionLabor'):
1020
+ labor_parts.append(f"Construction: {r['constructionLabor']}")
1021
+ if r.get('operationsLabor'):
1022
+ labor_parts.append(f"Operations: {r['operationsLabor']}")
1023
+ lines.append(f"- **Labor**: {', '.join(labor_parts)}")
1024
+
1025
+ # Fuel type
1026
+ if r.get('fuelType'):
1027
+ has_any_technical = True
1028
+ lines.append(f"- **Fuel Type**: {r['fuelType']}")
1029
+
1030
+ # Unit
1031
+ if r.get('unitName'):
1032
+ has_any_technical = True
1033
+ lines.append(f"- **Unit**: {r['unitName']}")
1034
+
1035
+ if not has_any_technical:
1036
+ lines.append("- No technical specifications available")
1037
+
1038
+ return "\n".join(lines)
1039
+
1040
+ @staticmethod
1041
+ def format_comparison(results: List[Dict]) -> str:
1042
+ """Format comparison results with comprehensive project details."""
1043
+ if not results:
1044
+ return "I couldn't find any project data for comparison in the provided documents."
1045
+
1046
+ lines = ["## Project Comparison\n"]
1047
+
1048
+ for r in results:
1049
+ project = r.get('project') or 'Unknown'
1050
+ lines.append(f"### {project}")
1051
+
1052
+ # Status section
1053
+ status = r.get('status')
1054
+ if status:
1055
+ lines.append(f"- **Status**: {status}")
1056
+ if r.get('statusReason'):
1057
+ lines.append(f" - Reason: {r['statusReason']}")
1058
+ if r.get('projectProbability'):
1059
+ lines.append(f" - Probability: {r['projectProbability']}")
1060
+
1061
+ # Classification
1062
+ if r.get('industryCode') or r.get('projectType') or r.get('sector'):
1063
+ lines.append("- **Classification**:")
1064
+ if r.get('industryCode'):
1065
+ lines.append(f" - Industry: {r['industryCode']}")
1066
+ if r.get('projectType'):
1067
+ lines.append(f" - Type: {r['projectType']}")
1068
+ if r.get('sector'):
1069
+ lines.append(f" - Sector: {r['sector']}")
1070
+ if r.get('sicCode'):
1071
+ lines.append(f" - SIC Code: {r['sicCode']}")
1072
+
1073
+ # Budget
1074
+ budget = r.get('budget')
1075
+ currency = r.get('currency') or ''
1076
+ if budget is not None and isinstance(budget, (int, float)):
1077
+ if budget >= 1_000_000_000:
1078
+ budget_str = f"{budget/1_000_000_000:.1f}B {currency}".strip()
1079
+ elif budget >= 1_000_000:
1080
+ budget_str = f"{budget/1_000_000:.0f}M {currency}".strip()
1081
+ else:
1082
+ budget_str = f"{budget:,.0f} {currency}".strip()
1083
+ lines.append(f"- **Budget (TIV)**: {budget_str}")
1084
+
1085
+ # Location
1086
+ loc_parts = [r.get('address'), r.get('city'), r.get('state'), r.get('country')]
1087
+ loc_parts = [p for p in loc_parts if p]
1088
+ if loc_parts:
1089
+ lines.append(f"- **Location**: {', '.join(loc_parts)}")
1090
+
1091
+ # Capacity/Technical
1092
+ if r.get('projectCapacity'):
1093
+ lines.append(f"- **Project Capacity**: {r['projectCapacity']}")
1094
+ if r.get('pecTiming') or r.get('pecActivity'):
1095
+ pec = f"{r.get('pecTiming', '')} - {r.get('pecActivity', '')}".strip(' -')
1096
+ if pec:
1097
+ lines.append(f"- **PEC Stage**: {pec}")
1098
+
1099
+ # Contacts section
1100
+ pm_name = r.get('projectManager')
1101
+ pm_company = r.get('projectManagerCompany')
1102
+ pm_title = r.get('projectManagerTitle')
1103
+ plant_owner = r.get('plantOwner')
1104
+ plant_parent = r.get('plantParent')
1105
+ engineer = r.get('engineerCompany')
1106
+ ec_firm = r.get('ecFirm')
1107
+
1108
+ if any([pm_name, plant_owner, engineer, ec_firm]):
1109
+ lines.append("- **Key Contacts**:")
1110
+ if pm_name:
1111
+ pm_info = pm_name
1112
+ if pm_title:
1113
+ pm_info += f", {pm_title}"
1114
+ if pm_company:
1115
+ pm_info += f" ({pm_company})"
1116
+ lines.append(f" - Project Manager: {pm_info}")
1117
+ if plant_owner:
1118
+ owner_info = plant_owner
1119
+ if plant_parent:
1120
+ owner_info += f" (Parent: {plant_parent})"
1121
+ lines.append(f" - Owner: {owner_info}")
1122
+ if engineer:
1123
+ lines.append(f" - Engineer: {engineer}")
1124
+ if ec_firm:
1125
+ lines.append(f" - E&C Firm: {ec_firm}")
1126
+
1127
+ # Plant info
1128
+ if r.get('plantName'):
1129
+ lines.append(f"- **Plant/Facility**: {r['plantName']}")
1130
+
1131
+ # Milestones and Challenges counts
1132
+ ms = r.get('milestones') or []
1133
+ ch = r.get('challenges') or []
1134
+ if isinstance(ms, list):
1135
+ milestone_count = len([m for m in ms if m and m.get('name')])
1136
+ else:
1137
+ milestone_count = 0
1138
+ if isinstance(ch, list):
1139
+ challenge_count = len([c for c in ch if c])
1140
+ else:
1141
+ challenge_count = 0
1142
+
1143
+ lines.append(f"- **Milestones**: {milestone_count}")
1144
+ lines.append(f"- **Challenges**: {challenge_count}")
1145
+ lines.append("")
1146
+
1147
+ return "\n".join(lines)
1148
+
1149
+ @staticmethod
1150
+ def format_overview(results: List[Dict]) -> str:
1151
+ """Format comprehensive project overview results."""
1152
+ if not results:
1153
+ return "I couldn't find any project data in the provided documents."
1154
+
1155
+ lines = ["## Project Overview\n"]
1156
+ for r in results:
1157
+ project = r.get('project') or 'Unknown Project'
1158
+ lines.append(f"\n### {project}")
1159
+
1160
+ # Basic identification
1161
+ if r.get('projectId'):
1162
+ lines.append(f"- **Project ID**: {r['projectId']}")
1163
+
1164
+ # Status section
1165
+ if r.get('status'):
1166
+ lines.append(f"- **Status**: {r['status']}")
1167
+ if r.get('statusReason'):
1168
+ lines.append(f" - Reason: {r['statusReason']}")
1169
+ if r.get('projectProbability'):
1170
+ lines.append(f" - Probability: {r['projectProbability']}")
1171
+
1172
+ # Classification section
1173
+ has_classification = any([r.get('industryCode'), r.get('projectType'),
1174
+ r.get('sector'), r.get('sicCode')])
1175
+ if has_classification:
1176
+ lines.append("- **Classification**:")
1177
+ if r.get('industryCode'):
1178
+ lines.append(f" - Industry: {r['industryCode']}")
1179
+ if r.get('projectType'):
1180
+ lines.append(f" - Type: {r['projectType']}")
1181
+ if r.get('sector'):
1182
+ lines.append(f" - Sector: {r['sector']}")
1183
+ if r.get('sicCode'):
1184
+ lines.append(f" - SIC Code: {r['sicCode']}")
1185
+
1186
+ # Budget
1187
+ if r.get('budget') is not None:
1188
+ budget = r['budget']
1189
+ currency = r.get('currency') or ''
1190
+ if isinstance(budget, (int, float)):
1191
+ if budget >= 1_000_000_000:
1192
+ budget_str = f"{budget/1_000_000_000:.1f}B {currency}".strip()
1193
+ elif budget >= 1_000_000:
1194
+ budget_str = f"{budget/1_000_000:.0f}M {currency}".strip()
1195
+ else:
1196
+ budget_str = f"{budget:,.0f} {currency}".strip()
1197
+ else:
1198
+ budget_str = f"{budget} {currency}".strip()
1199
+ lines.append(f"- **Budget (TIV)**: {budget_str}")
1200
+
1201
+ # Location
1202
+ loc_parts = [r.get('address'), r.get('city'), r.get('state'), r.get('country')]
1203
+ loc_parts = [p for p in loc_parts if p]
1204
+ if loc_parts:
1205
+ lines.append(f"- **Location**: {', '.join(loc_parts)}")
1206
+
1207
+ # Technical details
1208
+ if r.get('projectCapacity'):
1209
+ lines.append(f"- **Project Capacity**: {r['projectCapacity']}")
1210
+ if r.get('pecTiming') or r.get('pecActivity'):
1211
+ pec = f"{r.get('pecTiming', '')} - {r.get('pecActivity', '')}".strip(' -')
1212
+ if pec:
1213
+ lines.append(f"- **PEC Stage**: {pec}")
1214
+ if r.get('fuelType'):
1215
+ lines.append(f"- **Fuel Type**: {r['fuelType']}")
1216
+ if r.get('unitName'):
1217
+ lines.append(f"- **Unit**: {r['unitName']}")
1218
+
1219
+ # Labor information
1220
+ if r.get('constructionLabor') or r.get('operationsLabor'):
1221
+ labor_info = []
1222
+ if r.get('constructionLabor'):
1223
+ labor_info.append(f"Construction: {r['constructionLabor']}")
1224
+ if r.get('operationsLabor'):
1225
+ labor_info.append(f"Operations: {r['operationsLabor']}")
1226
+ lines.append(f"- **Labor**: {', '.join(labor_info)}")
1227
+
1228
+ # Contacts section
1229
+ pm_name = r.get('projectManager')
1230
+ pm_company = r.get('projectManagerCompany')
1231
+ pm_title = r.get('projectManagerTitle')
1232
+ plant_owner = r.get('plantOwner')
1233
+ plant_parent = r.get('plantParent')
1234
+ plant_name = r.get('plantName')
1235
+ engineer = r.get('engineerCompany')
1236
+ ec_firm = r.get('ecFirm')
1237
+
1238
+ if any([pm_name, plant_owner, engineer, ec_firm]):
1239
+ lines.append("- **Key Contacts**:")
1240
+ if pm_name:
1241
+ pm_info = pm_name
1242
+ if pm_title:
1243
+ pm_info += f", {pm_title}"
1244
+ if pm_company:
1245
+ pm_info += f" ({pm_company})"
1246
+ lines.append(f" - Project Manager: {pm_info}")
1247
+ if plant_owner:
1248
+ owner_info = plant_owner
1249
+ if plant_parent:
1250
+ owner_info += f" (Parent: {plant_parent})"
1251
+ lines.append(f" - Owner: {owner_info}")
1252
+ if engineer:
1253
+ lines.append(f" - Engineer: {engineer}")
1254
+ if ec_firm:
1255
+ lines.append(f" - E&C Firm: {ec_firm}")
1256
+
1257
+ # Plant/Facility info
1258
+ if plant_name:
1259
+ lines.append(f"- **Plant/Facility**: {plant_name}")
1260
+
1261
+ # Report dates
1262
+ if r.get('lastUpdate') or r.get('initialRelease'):
1263
+ lines.append("- **Report Info**:")
1264
+ if r.get('lastUpdate'):
1265
+ lines.append(f" - Last Updated: {r['lastUpdate']}")
1266
+ if r.get('initialRelease'):
1267
+ lines.append(f" - Initial Release: {r['initialRelease']}")
1268
+
1269
+ return "\n".join(lines)
1270
+
1271
+ @staticmethod
1272
+ def format_status(results: List[Dict]) -> str:
1273
+ """Format status results."""
1274
+ if not results:
1275
+ return "I couldn't find any project status information in the provided documents."
1276
+
1277
+ lines = ["## Project Status\n"]
1278
+ for r in results:
1279
+ project = r.get('project') or 'Unknown Project'
1280
+ status = r.get('status') or 'Unknown'
1281
+ reason = r.get('statusReason') or ''
1282
+ last_update = r.get('lastUpdate') or ''
1283
+
1284
+ lines.append(f"\n### {project}")
1285
+ lines.append(f"- **Status**: {status}")
1286
+ if reason:
1287
+ lines.append(f"- **Reason**: {reason}")
1288
+ if last_update:
1289
+ lines.append(f"- **Last Updated**: {last_update}")
1290
+
1291
+ return "\n".join(lines)
1292
+
1293
@classmethod
def format(cls, results: List[Dict], intent: QueryIntent) -> str:
    """Dispatch to the formatter registered for ``intent``.

    Args:
        results: Query results
        intent: Detected intent

    Returns:
        Formatted markdown string. Intents without a dedicated formatter
        fall back to a generic ``key: value`` listing of each row.
    """
    dispatch = {
        QueryIntent.BUDGET: cls.format_budget,
        QueryIntent.LOCATION: cls.format_location,
        QueryIntent.BUDGET_LOCATION: cls.format_budget_location,
        QueryIntent.TIMELINE: cls.format_timeline,
        # The combined timeline intents reuse the timeline formatter.
        QueryIntent.TIMELINE_LOCATION: cls.format_timeline,
        QueryIntent.TIMELINE_BUDGET: cls.format_timeline,
        QueryIntent.CHALLENGES: cls.format_challenges,
        QueryIntent.CONTACTS: cls.format_contacts,
        QueryIntent.TECHNICAL: cls.format_technical,
        QueryIntent.COMPARISON: cls.format_comparison,
        QueryIntent.PROJECT_OVERVIEW: cls.format_overview,
        QueryIntent.PROJECT_STATUS: cls.format_status,
    }

    handler = dispatch.get(intent)
    if handler:
        return handler(results)

    # Generic fallback for intents without a dedicated formatter
    if not results:
        return "I couldn't find this information in the provided documents."

    rendered = ["## Query Results\n"]
    rendered.extend(
        "- " + " | ".join(
            f"**{key}**: {value}" for key, value in row.items() if value is not None
        )
        for row in results
    )
    return "\n".join(rendered)
src/services/neo4j_service.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Neo4j database access layer.
3
+
4
+ Provides centralized Neo4j connectivity and data management
5
+ with Aura/hosted instance best practices.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from neo4j import GraphDatabase, Driver
13
+ from neo4j.exceptions import ServiceUnavailable, AuthError
14
+
15
+ # LangChain Neo4j integration
16
+ try:
17
+ from langchain_community.graphs import Neo4jGraph
18
+ except ImportError:
19
+ from langchain.graphs import Neo4jGraph
20
+
21
+ from src.config import get_logger, log_step
22
+ from src.models.project import ProjectRecord, GeoComponents, Milestone
23
+ from src.parsers.project_parser import ProjectReportParser
24
+
25
+ # Module logger
26
+ logger = get_logger(__name__)
27
+
28
+
29
+ class Neo4jConnectionError(Exception):
30
+ """Raised when Neo4j connection fails."""
31
+ pass
32
+
33
+
34
+ class Neo4jService:
35
+ """Neo4j access layer with Aura/hosted best practices.
36
+
37
+ This class centralizes:
38
+ - Driver construction and connectivity validation
39
+ - LangChain Neo4jGraph wrapper configuration
40
+ - Constraints, structured writes, and database cleanup
41
+
42
+ Attributes:
43
+ uri: Neo4j connection URI.
44
+ user: Database username.
45
+ password: Database password.
46
+ database: Database name.
47
+ driver: Low-level Neo4j driver.
48
+ graph: LangChain Neo4jGraph wrapper.
49
+
50
+ Raises:
51
+ Neo4jConnectionError: If connection fails.
52
+
53
+ Example:
54
+ >>> service = Neo4jService(
55
+ ... uri="neo4j+s://xxx.databases.neo4j.io",
56
+ ... user="neo4j",
57
+ ... password="password"
58
+ ... )
59
+ >>> service.ensure_constraints()
60
+ >>> service.close()
61
+ """
62
+
63
+ # Constraint definitions for structured layer
64
+ CONSTRAINTS = [
65
+ "CREATE CONSTRAINT project_id IF NOT EXISTS FOR (p:Project) REQUIRE p.projectId IS UNIQUE",
66
+ "CREATE CONSTRAINT project_name IF NOT EXISTS FOR (p:Project) REQUIRE p.name IS UNIQUE",
67
+ "CREATE CONSTRAINT budget_key IF NOT EXISTS FOR (b:Budget) REQUIRE b.key IS UNIQUE",
68
+ "CREATE CONSTRAINT location_key IF NOT EXISTS FOR (l:Location) REQUIRE l.key IS UNIQUE",
69
+ "CREATE CONSTRAINT milestone_key IF NOT EXISTS FOR (m:Milestone) REQUIRE m.key IS UNIQUE",
70
+ "CREATE CONSTRAINT report_key IF NOT EXISTS FOR (r:Report) REQUIRE r.key IS UNIQUE",
71
+ ]
72
+
73
+ # Performance indexes for faster queries
74
+ INDEXES = [
75
+ "CREATE INDEX project_name_idx IF NOT EXISTS FOR (p:Project) ON (p.name)",
76
+ "CREATE INDEX project_source_idx IF NOT EXISTS FOR (p:Project) ON (p.source)",
77
+ "CREATE INDEX chunk_source_idx IF NOT EXISTS FOR (c:Chunk) ON (c.source)",
78
+ "CREATE INDEX milestone_date_idx IF NOT EXISTS FOR (m:Milestone) ON (m.dateText)",
79
+ "CREATE INDEX location_city_idx IF NOT EXISTS FOR (l:Location) ON (l.city)",
80
+ "CREATE INDEX location_country_idx IF NOT EXISTS FOR (l:Location) ON (l.country)",
81
+ "CREATE INDEX challenge_source_idx IF NOT EXISTS FOR (c:Challenge) ON (c.source)",
82
+ ]
83
+
84
+ # Full-text index for semantic search within graph
85
+ FULLTEXT_INDEX = """
86
+ CREATE FULLTEXT INDEX entity_fulltext IF NOT EXISTS
87
+ FOR (n:Project|Organization|Location|Milestone|Challenge)
88
+ ON EACH [n.name, n.text, n.description]
89
+ """
90
+
91
+ # Cypher template with APOC support
92
+ # Uses CALL subqueries to handle empty lists properly
93
+ CYPHER_UPSERT_WITH_APOC = """
94
+ MERGE (p:Project {projectId: $project_id})
95
+ ON CREATE SET p.name = $project_name
96
+ ON MATCH SET p.name = coalesce(p.name, $project_name)
97
+ SET p.source = $source,
98
+ p.status = $status,
99
+ p.statusReason = $status_reason,
100
+ p.lastUpdate = $last_update,
101
+ p.initialRelease = $initial_release
102
+
103
+ WITH p
104
+ MERGE (b:Budget {key: $bud_key})
105
+ SET b.amount = $tiv_amount,
106
+ b.currency = $tiv_currency,
107
+ b.kind = 'TIV',
108
+ b.source = $source
109
+ MERGE (p)-[:HAS_BUDGET]->(b)
110
+
111
+ WITH p
112
+ MERGE (l:Location {key: $loc_key})
113
+ SET l.address = $address,
114
+ l.city = $city,
115
+ l.state = $state,
116
+ l.postal = $postal,
117
+ l.country = $country,
118
+ l.zoneCounty = $zone_county,
119
+ l.source = $source
120
+ MERGE (p)-[:LOCATED_IN]->(l)
121
+
122
+ WITH p
123
+ MERGE (r:Report {key: $rep_key})
124
+ SET r.source = $source,
125
+ r.lastUpdate = $last_update,
126
+ r.initialRelease = $initial_release
127
+ MERGE (p)-[:HAS_REPORT]->(r)
128
+
129
+ WITH p
130
+ CALL {
131
+ WITH p
132
+ UNWIND CASE WHEN size($challenges) > 0 THEN $challenges ELSE [null] END AS ch
133
+ WITH p, ch WHERE ch IS NOT NULL
134
+ MERGE (c:Challenge {key: p.projectId + '::ch::' + toString(apoc.util.md5(ch))})
135
+ SET c.text = ch, c.source = $source
136
+ MERGE (p)-[:HAS_CHALLENGE]->(c)
137
+ RETURN count(*) AS chCount
138
+ }
139
+
140
+ WITH p
141
+ CALL {
142
+ WITH p
143
+ UNWIND CASE WHEN size($milestones) > 0 THEN $milestones ELSE [null] END AS ms
144
+ WITH p, ms WHERE ms IS NOT NULL
145
+ MERGE (m:Milestone {key: p.projectId + '::ms::' + toString(apoc.util.md5(ms.sentence))})
146
+ SET m.name = ms.name, m.dateText = ms.dateText, m.sentence = ms.sentence, m.source = $source
147
+ MERGE (p)-[:HAS_MILESTONE]->(m)
148
+ RETURN count(*) AS msCount
149
+ }
150
+
151
+ RETURN p.projectId AS projectId, p.name AS name
152
+ """
153
+
154
+ # Cypher template without APOC (fallback)
155
+ # Uses CALL subqueries to handle empty lists properly
156
+ CYPHER_UPSERT_NO_APOC = """
157
+ MERGE (p:Project {projectId: $project_id})
158
+ ON CREATE SET p.name = $project_name
159
+ ON MATCH SET p.name = coalesce(p.name, $project_name)
160
+ SET p.source = $source,
161
+ p.status = $status,
162
+ p.statusReason = $status_reason,
163
+ p.lastUpdate = $last_update,
164
+ p.initialRelease = $initial_release
165
+
166
+ WITH p
167
+ MERGE (b:Budget {key: $bud_key})
168
+ SET b.amount = $tiv_amount,
169
+ b.currency = $tiv_currency,
170
+ b.kind = 'TIV',
171
+ b.source = $source
172
+ MERGE (p)-[:HAS_BUDGET]->(b)
173
+
174
+ WITH p
175
+ MERGE (l:Location {key: $loc_key})
176
+ SET l.address = $address,
177
+ l.city = $city,
178
+ l.state = $state,
179
+ l.postal = $postal,
180
+ l.country = $country,
181
+ l.zoneCounty = $zone_county,
182
+ l.source = $source
183
+ MERGE (p)-[:LOCATED_IN]->(l)
184
+
185
+ WITH p
186
+ MERGE (r:Report {key: $rep_key})
187
+ SET r.source = $source,
188
+ r.lastUpdate = $last_update,
189
+ r.initialRelease = $initial_release
190
+ MERGE (p)-[:HAS_REPORT]->(r)
191
+
192
+ WITH p
193
+ CALL {
194
+ WITH p
195
+ UNWIND CASE WHEN size($challenges) > 0 THEN range(0, size($challenges)-1) ELSE [null] END AS i
196
+ WITH p, i WHERE i IS NOT NULL
197
+ MERGE (c:Challenge {key: p.projectId + '::ch::' + toString(i)})
198
+ SET c.text = $challenges[i], c.source = $source
199
+ MERGE (p)-[:HAS_CHALLENGE]->(c)
200
+ RETURN count(*) AS chCount
201
+ }
202
+
203
+ WITH p
204
+ CALL {
205
+ WITH p
206
+ UNWIND CASE WHEN size($milestones) > 0 THEN range(0, size($milestones)-1) ELSE [null] END AS j
207
+ WITH p, j WHERE j IS NOT NULL
208
+ MERGE (m:Milestone {key: p.projectId + '::ms::' + toString(j)})
209
+ SET m.name = $milestones[j].name, m.dateText = $milestones[j].dateText,
210
+ m.sentence = $milestones[j].sentence, m.source = $source
211
+ MERGE (p)-[:HAS_MILESTONE]->(m)
212
+ RETURN count(*) AS msCount
213
+ }
214
+
215
+ RETURN p.projectId AS projectId, p.name AS name
216
+ """
217
+
218
+ def __init__(
219
+ self,
220
+ uri: str,
221
+ user: str,
222
+ password: str,
223
+ database: str = "neo4j"
224
+ ) -> None:
225
+ """Initialize Neo4j service.
226
+
227
+ Args:
228
+ uri: Neo4j URI (typically neo4j+s://... for Aura).
229
+ user: Neo4j username.
230
+ password: Neo4j password.
231
+ database: Neo4j database name (Aura commonly uses "neo4j").
232
+
233
+ Raises:
234
+ Neo4jConnectionError: If connection or authentication fails.
235
+ """
236
+ self.uri = uri
237
+ self.user = user
238
+ self.password = password
239
+ self.database = database or "neo4j"
240
+
241
+ logger.info(f"Connecting to Neo4j: {uri}")
242
+ try:
243
+ # Low-level driver for constraint management and transactional writes
244
+ logger.substep("Creating driver")
245
+ self.driver: Driver = GraphDatabase.driver(uri, auth=(user, password))
246
+ self.driver.verify_connectivity()
247
+ logger.substep("Driver connectivity verified")
248
+
249
+ # LangChain wrapper for GraphCypherQAChain and graph operations
250
+ logger.substep("Initializing Neo4jGraph wrapper")
251
+ self.graph: Neo4jGraph = Neo4jGraph(
252
+ url=uri,
253
+ username=user,
254
+ password=password,
255
+ database=self.database
256
+ )
257
+ logger.info(f"Connected to Neo4j database: {self.database}")
258
+ except ServiceUnavailable as e:
259
+ logger.error(f"Service unavailable: {e}")
260
+ raise Neo4jConnectionError(
261
+ f"Could not connect to Neo4j at {uri}. "
262
+ f"Ensure the URI is correct and the database is running. "
263
+ f"Error: {e}"
264
+ ) from e
265
+ except AuthError as e:
266
+ logger.error(f"Authentication failed: {e}")
267
+ raise Neo4jConnectionError(
268
+ f"Authentication failed for Neo4j. "
269
+ f"Check username and password. Error: {e}"
270
+ ) from e
271
+ except Exception as e:
272
+ logger.error(f"Connection failed: {e}")
273
+ raise Neo4jConnectionError(
274
+ f"Failed to connect to Neo4j: {e}"
275
+ ) from e
276
+
277
+ self._parser = ProjectReportParser()
278
+
279
+ def close(self) -> None:
280
+ """Close the underlying Neo4j driver."""
281
+ logger.debug("Closing Neo4j driver")
282
+ try:
283
+ self.driver.close()
284
+ logger.debug("Neo4j driver closed")
285
+ except Exception as e:
286
+ logger.warning(f"Error closing driver: {e}")
287
+
288
+ def ensure_constraints(self) -> None:
289
+ """Create constraints for the structured layer.
290
+
291
+ Notes:
292
+ Some Aura tiers or policies may restrict certain DDL operations.
293
+ Failures are logged but swallowed to keep ingestion operational.
294
+ """
295
+ with log_step(logger, "Create database constraints"):
296
+ success_count = 0
297
+ with self.driver.session(database=self.database) as session:
298
+ for stmt in self.CONSTRAINTS:
299
+ try:
300
+ session.run(stmt)
301
+ success_count += 1
302
+ except Exception as e:
303
+ logger.debug(f"Constraint skipped: {e}")
304
+ logger.info(f"Constraints created: {success_count}/{len(self.CONSTRAINTS)}")
305
+
306
+ # Also create performance indexes
307
+ self.ensure_indexes()
308
+
309
+ def ensure_indexes(self) -> None:
310
+ """Create performance indexes for faster queries.
311
+
312
+ Creates indexes on frequently queried properties and
313
+ optionally a full-text index for semantic search.
314
+ """
315
+ with log_step(logger, "Create performance indexes"):
316
+ success_count = 0
317
+ with self.driver.session(database=self.database) as session:
318
+ for stmt in self.INDEXES:
319
+ try:
320
+ session.run(stmt)
321
+ success_count += 1
322
+ except Exception as e:
323
+ logger.debug(f"Index skipped: {e}")
324
+
325
+ # Try to create full-text index (may not be available on all tiers)
326
+ try:
327
+ session.run(self.FULLTEXT_INDEX)
328
+ logger.substep("Full-text index created")
329
+ except Exception as e:
330
+ logger.debug(f"Full-text index skipped: {e}")
331
+
332
+ logger.info(f"Indexes created: {success_count}/{len(self.INDEXES)}")
333
+
334
+ def get_statistics(self) -> Dict[str, Any]:
335
+ """Get database statistics for monitoring.
336
+
337
+ Returns:
338
+ Dictionary with node/relationship counts and other stats.
339
+ """
340
+ stats: Dict[str, Any] = {}
341
+
342
+ queries = {
343
+ "node_count": "MATCH (n) RETURN count(n) AS count",
344
+ "relationship_count": "MATCH ()-[r]->() RETURN count(r) AS count",
345
+ "project_count": "MATCH (p:Project) RETURN count(p) AS count",
346
+ "chunk_count": "MATCH (c:Chunk) RETURN count(c) AS count",
347
+ "entity_count": "MATCH (e) WHERE NOT e:Chunk AND NOT e:Project RETURN count(e) AS count",
348
+ }
349
+
350
+ for name, query in queries.items():
351
+ try:
352
+ result = self.graph.query(query)
353
+ stats[name] = result[0]["count"] if result else 0
354
+ except Exception:
355
+ stats[name] = -1
356
+
357
+ return stats
358
+
359
+ def clear(self) -> None:
360
+ """Delete all nodes and relationships from the database."""
361
+ logger.info("Clearing all nodes and relationships from database")
362
+ self.graph.query("MATCH (n) DETACH DELETE n")
363
+ logger.info("Database cleared")
364
+
365
def upsert_structured_project(
    self,
    record: ProjectRecord
) -> Dict[str, Any]:
    """Upsert structured nodes/relationships for a single project record.

    Writes, via ``MERGE``, one ``Project`` node together with its ``Budget``,
    ``Location`` and ``Report`` nodes, then one ``Challenge`` node per derived
    challenge and one ``Milestone`` node per extracted milestone, each linked
    to the project.

    This function is the reliability backbone for:
    - Budget allocation & location questions
    - Timeline comparison questions
    - Challenges questions (derived from reason/details/schedule heuristics)

    Challenge and milestone nodes are keyed by their position
    (``<projectId>::ch::<i>`` / ``<projectId>::ms::<i>``), so re-ingesting a
    record overwrites the node at the same index.

    Args:
        record: Parsed ProjectRecord.

    Returns:
        Dictionary with {"projectId": ..., "name": ...}.
    """
    # Loop-invariant Cypher, defined once instead of once per iteration.
    challenge_query = """
    MATCH (p:Project {projectId: $project_id})
    MERGE (c:Challenge {key: $ch_key})
    SET c.text = $ch_text, c.source = $source
    MERGE (p)-[:HAS_CHALLENGE]->(c)
    """
    milestone_query = """
    MATCH (p:Project {projectId: $project_id})
    MERGE (m:Milestone {key: $ms_key})
    SET m.name = $ms_name, m.dateText = $ms_date, m.sentence = $ms_sentence, m.source = $source
    MERGE (p)-[:HAS_MILESTONE]->(m)
    """

    project_name = record.project_name or record.source
    logger.debug(f"Upserting project: {project_name}")

    project_key = record.get_unique_key()
    loc_key = f"{project_key}::loc"
    bud_key = f"{project_key}::tiv"
    rep_key = f"{project_key}::report::{record.last_update or ''}"

    # Parse geographic components
    geo = self._parser.parse_city_state_country(record.city_state_line)

    # Derive challenges and milestones
    challenges = self._parser.derive_challenges(record)
    milestones = self._parser.extract_milestones(record.schedule_text)
    milestone_dicts = [m.to_dict() for m in milestones]

    logger.substep(f"Extracted {len(challenges)} challenges, {len(milestones)} milestones")
    if milestones:
        for ms in milestones:
            logger.substep(f"  Milestone: {ms.name} -> {ms.date_text}")
    else:
        logger.warning(f"No milestones extracted from schedule_text: {record.schedule_text[:100] if record.schedule_text else 'None'}...")

    params = {
        # Identification
        "source": record.source,
        "project_id": record.project_id or record.project_name or record.source,
        "project_name": project_name,
        # Classification
        "industry_code": record.industry_code,
        "project_type": record.project_type,
        "sector": record.sector,
        "sic_code": record.sic_code,
        # Financial
        "bud_key": bud_key,
        "tiv_amount": record.tiv_amount,
        "tiv_currency": record.tiv_currency,
        # Status
        "status": record.status,
        "status_reason": record.status_reason,
        "project_probability": record.project_probability,
        # Timeline
        "last_update": record.last_update,
        "initial_release": record.initial_release,
        "pec_timing": record.pec_timing,
        "pec_activity": record.pec_activity,
        # Location
        "loc_key": loc_key,
        "address": record.address,
        "city": geo.city,
        "state": geo.state,
        "postal": geo.postal,
        "country": geo.country,
        "zone_county": record.zone_county,
        "phone": record.phone,
        # Plant Info
        "plant_owner": record.plant_owner,
        "plant_parent": record.plant_parent,
        "plant_name": record.plant_name,
        "plant_id": record.plant_id,
        "unit_name": record.unit_name,
        # Contacts
        "project_manager": record.project_manager,
        "project_manager_company": record.project_manager_company,
        "project_manager_email": record.project_manager_email,
        "engineer_company": record.engineer_company,
        "ec_firm": record.ec_firm,
        # Technical
        "scope_text": record.scope_text,
        "project_capacity": record.project_capacity,
        "environmental": record.environmental,
        "construction_labor": record.construction_labor,
        "fuel_type": record.fuel_type,
        # Report
        "rep_key": rep_key,
        # Derived
        "challenges": challenges,
        "milestones": milestone_dicts,
    }

    with self.driver.session(database=self.database) as session:
        # Step 1: Upsert base project with all fields
        base_query = """
        MERGE (p:Project {projectId: $project_id})
        ON CREATE SET p.name = $project_name
        ON MATCH SET p.name = coalesce(p.name, $project_name)
        SET p.source = $source,
            // Classification
            p.industryCode = $industry_code,
            p.projectType = $project_type,
            p.sector = $sector,
            p.sicCode = $sic_code,
            // Status
            p.status = $status,
            p.statusReason = $status_reason,
            p.projectProbability = $project_probability,
            // Timeline
            p.lastUpdate = $last_update,
            p.initialRelease = $initial_release,
            p.pecTiming = $pec_timing,
            p.pecActivity = $pec_activity,
            // Plant Info
            p.plantOwner = $plant_owner,
            p.plantParent = $plant_parent,
            p.plantName = $plant_name,
            p.plantId = $plant_id,
            p.unitName = $unit_name,
            p.phone = $phone,
            // Contacts
            p.projectManager = $project_manager,
            p.projectManagerCompany = $project_manager_company,
            p.projectManagerEmail = $project_manager_email,
            p.engineerCompany = $engineer_company,
            p.ecFirm = $ec_firm,
            // Technical
            p.scopeText = $scope_text,
            p.projectCapacity = $project_capacity,
            p.environmental = $environmental,
            p.constructionLabor = $construction_labor,
            p.fuelType = $fuel_type

        WITH p
        MERGE (b:Budget {key: $bud_key})
        SET b.amount = $tiv_amount, b.currency = $tiv_currency, b.kind = 'TIV', b.source = $source
        MERGE (p)-[:HAS_BUDGET]->(b)

        WITH p
        MERGE (l:Location {key: $loc_key})
        SET l.address = $address, l.city = $city, l.state = $state,
            l.postal = $postal, l.country = $country, l.zoneCounty = $zone_county, l.source = $source
        MERGE (p)-[:LOCATED_IN]->(l)

        WITH p
        MERGE (r:Report {key: $rep_key})
        SET r.source = $source, r.lastUpdate = $last_update, r.initialRelease = $initial_release
        MERGE (p)-[:HAS_REPORT]->(r)

        RETURN p.projectId AS projectId, p.name AS name
        """
        logger.substep("Executing base project upsert")
        row = session.run(base_query, params).single()

        if row is None:
            logger.warning("Base project upsert returned no result")
            return {"projectId": params["project_id"], "name": params["project_name"]}

        project_id = row["projectId"]
        project_name = row["name"]
        logger.substep(f"Project created: {project_name}")

        # Step 2: Add challenges (separate query per challenge)
        if challenges:
            for i, ch in enumerate(challenges):
                # consume() drains the result immediately so that a failed
                # write is raised at this statement instead of being deferred
                # to a later run() or to session teardown.
                session.run(challenge_query, {
                    "project_id": project_id,
                    "ch_key": f"{project_id}::ch::{i}",
                    "ch_text": ch,
                    "source": record.source
                }).consume()
            logger.substep(f"Added {len(challenges)} challenges")

        # Step 3: Add milestones (separate query per milestone)
        if milestone_dicts:
            for i, ms in enumerate(milestone_dicts):
                session.run(milestone_query, {
                    "project_id": project_id,
                    "ms_key": f"{project_id}::ms::{i}",
                    "ms_name": ms.get("name", ""),
                    "ms_date": ms.get("dateText", ""),
                    "ms_sentence": ms.get("sentence", ""),
                    "source": record.source
                }).consume()
            logger.substep(f"Added {len(milestone_dicts)} milestones")

    return {"projectId": project_id, "name": project_name}
569
+
570
def query(self, cypher: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Run a Cypher statement through ``self.graph`` and hand back its rows.

    Args:
        cypher: Cypher query string.
        params: Optional query parameters; a falsy value (``None`` or an
            empty dict) is replaced by a fresh empty dict.

    Returns:
        List of result dictionaries, exactly as returned by
        ``self.graph.query``.
    """
    bound_params = params if params else {}
    rows = self.graph.query(cypher, bound_params)
    return rows
581
+
582
def __enter__(self) -> "Neo4jService":
    """Enter the ``with`` block, yielding this service instance itself."""
    service = self
    return service
585
+
586
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    """Leave the ``with`` block by calling ``self.close()``.

    The exception triple is ignored and ``None`` is returned, so an
    exception raised inside the block keeps propagating.
    """
    self.close()
    return None
src/services/reranker.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Cross-encoder reranker for document retrieval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+ import logging
7
+
8
+ from langchain.schema import Document
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ # Lazy import to avoid loading model at import time
13
+ _cross_encoder = None
14
+ _cross_encoder_model_name = None
15
+
16
+
17
# Token limit the cached model was built with. Tracked next to the
# module-level ``_cross_encoder`` / ``_cross_encoder_model_name`` cache so a
# request for a different limit reloads the model instead of silently reusing
# one configured for another length.
_cross_encoder_max_length = None


def _get_cross_encoder(
    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
    max_length: int = 512,
):
    """Lazy load the cross-encoder model.

    The loaded model is cached in module globals and reused until a different
    ``model_name`` or ``max_length`` is requested.

    Args:
        model_name: HuggingFace model identifier
        max_length: Maximum sequence length (in tokens) the model is
            constructed with. Defaults to 512, the value that was previously
            hard-coded.

    Returns:
        CrossEncoder instance, or None when ``sentence-transformers`` is not
        installed or the model fails to load.
    """
    global _cross_encoder, _cross_encoder_model_name, _cross_encoder_max_length

    cache_is_stale = (
        _cross_encoder is None
        or _cross_encoder_model_name != model_name
        or _cross_encoder_max_length != max_length
    )

    if cache_is_stale:
        try:
            from sentence_transformers import CrossEncoder
            logger.info(f"Loading cross-encoder model: {model_name}")
            _cross_encoder = CrossEncoder(model_name, max_length=max_length)
            _cross_encoder_model_name = model_name
            _cross_encoder_max_length = max_length
        except ImportError:
            logger.warning(
                "sentence-transformers not installed. "
                "Run: pip install sentence-transformers"
            )
            return None
        except Exception as e:
            logger.warning(f"Failed to load cross-encoder: {e}")
            return None

    return _cross_encoder


class FastCrossEncoderReranker:
    """Cross-encoder reranker using sentence-transformers.

    Runs locally and is faster than LLM-based reranking.
    """

    MODEL_OPTIONS = {
        "fast": "cross-encoder/ms-marco-MiniLM-L-6-v2",
        "balanced": "cross-encoder/ms-marco-MiniLM-L-12-v2",
        "tiny": "cross-encoder/ms-marco-TinyBERT-L-2-v2",
    }

    def __init__(
        self,
        model_name: str = "fast",
        max_length: int = 512,
        batch_size: int = 16,
    ) -> None:
        """Initialize cross-encoder reranker.

        Args:
            model_name: One of "fast", "balanced", "tiny", or a HuggingFace model ID
            max_length: Maximum sequence length (tokens) for encoding; passed
                to the model loader
            batch_size: Batch size for scoring (higher = faster but more memory)
        """
        # Resolve model name alias; unknown names are used verbatim.
        self.model_name = self.MODEL_OPTIONS.get(model_name, model_name)
        self.max_length = max_length
        self.batch_size = batch_size
        self._model = None

    def _ensure_model(self) -> bool:
        """Ensure model is loaded.

        Returns:
            True if model is available, False otherwise
        """
        if self._model is None:
            self._model = _get_cross_encoder(
                self.model_name, max_length=self.max_length
            )
        return self._model is not None

    def _score_documents(
        self,
        query: str,
        documents: List[Document],
    ) -> List[Tuple[Document, float]]:
        """Score every document against the query, best match first.

        The full document text is passed to the model. Truncation to the
        token limit is left to the model (which is constructed with
        ``max_length``) rather than slicing the text to ``max_length``
        characters, which would discard most of each passage.

        Scores are converted to built-in ``float``. Ties keep their input
        order because the sort is stable.
        """
        pairs = [(query, self._get_text(doc)) for doc in documents]

        # Score all pairs (batched for efficiency)
        raw_scores = self._model.predict(
            pairs,
            batch_size=self.batch_size,
            show_progress_bar=False,
        )

        scored = [(doc, float(score)) for doc, score in zip(documents, raw_scores)]
        scored.sort(key=lambda item: item[1], reverse=True)
        return scored

    def rerank(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Document]:
        """Rerank documents by relevance to query.

        Falls back to the original order (limited to ``top_k``) when the
        model is unavailable or scoring raises.

        Args:
            query: User query
            documents: Documents to rerank
            top_k: Number of top documents to return

        Returns:
            Reranked documents (most relevant first)
        """
        if not documents:
            return []

        if len(documents) <= 1:
            # Nothing to order; still honour top_k like every other path.
            return documents[:top_k]

        if not self._ensure_model():
            logger.warning("Cross-encoder not available, returning original order")
            return documents[:top_k]

        try:
            ranked = self._score_documents(query, documents)
            return [doc for doc, _ in ranked[:top_k]]
        except Exception as e:
            # Deliberate best-effort: reranking must never break retrieval.
            logger.warning(f"Reranking failed: {e}, returning original order")
            return documents[:top_k]

    def rerank_with_scores(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Tuple[Document, float]]:
        """Rerank documents and return with scores.

        When the model is unavailable or scoring raises, documents keep their
        original order and receive placeholder scores ``1.0 - i * 0.1``.

        Args:
            query: User query
            documents: Documents to rerank
            top_k: Number of top documents to return

        Returns:
            List of (document, score) tuples, sorted by score descending
        """
        if not documents:
            return []

        if len(documents) <= 1:
            return [(doc, 1.0) for doc in documents[:top_k]]

        if not self._ensure_model():
            return [(doc, 1.0 - i * 0.1) for i, doc in enumerate(documents[:top_k])]

        try:
            return self._score_documents(query, documents)[:top_k]
        except Exception as e:
            logger.warning(f"Reranking failed: {e}")
            return [(doc, 1.0 - i * 0.1) for i, doc in enumerate(documents[:top_k])]

    def _get_text(self, doc: Document) -> str:
        """Extract text content from document.

        Args:
            doc: LangChain Document

        Returns:
            ``doc.page_content`` when the attribute exists, otherwise
            ``str(doc)``
        """
        if hasattr(doc, 'page_content'):
            return doc.page_content
        return str(doc)
202
+
203
+
204
class NoOpReranker:
    """Pass-through reranker that keeps the incoming document order.

    Intended as the fallback for when the cross-encoder cannot be used.
    """

    def rerank(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Document]:
        """Return the first ``top_k`` documents, order untouched."""
        kept = documents[:top_k]
        return kept

    def rerank_with_scores(
        self,
        query: str,
        documents: List[Document],
        top_k: int = 6,
    ) -> List[Tuple[Document, float]]:
        """Pair the first ``top_k`` documents with placeholder scores.

        The score starts at 1.0 and drops by 0.05 per position.
        """
        scored = []
        for position, doc in enumerate(documents[:top_k]):
            placeholder = 1.0 - position * 0.05
            scored.append((doc, placeholder))
        return scored
227
+
228
+
229
def get_reranker(
    model_name: str = "fast",
    fallback_to_noop: bool = True,
) -> FastCrossEncoderReranker | NoOpReranker:
    """Factory function to get a reranker instance.

    Tries to build a ``FastCrossEncoderReranker`` and eagerly load its model.
    If that raises, or the model is reported unavailable, the function either
    falls back to ``NoOpReranker`` or raises, depending on
    ``fallback_to_noop``.

    Args:
        model_name: Model name or alias
        fallback_to_noop: If True, return NoOpReranker when cross-encoder fails

    Returns:
        A ``FastCrossEncoderReranker`` whose model loaded successfully, or a
        ``NoOpReranker`` when falling back.

    Raises:
        RuntimeError: If the cross-encoder is unavailable and
            ``fallback_to_noop`` is False.
    """
    try:
        reranker = FastCrossEncoderReranker(model_name)
        # Test model loading up front so callers never get a reranker that
        # can only fall back.
        if reranker._ensure_model():
            return reranker
    except Exception as e:
        logger.warning(f"Failed to create cross-encoder reranker: {e}")

    if fallback_to_noop:
        logger.info("Using no-op reranker as fallback")
        return NoOpReranker()

    raise RuntimeError("Cross-encoder reranker not available")
src/services/retriever.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Optimized retriever with pattern-based expansion and cross-encoder reranking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import re
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ from langchain.schema import Document
10
+
11
+ from src.config import get_logger, log_step
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
class OptimizedRetriever:
    """Fast retriever without LLM calls for expansion/reranking.

    Uses pattern-based query expansion and cross-encoder reranking
    instead of LLM calls for faster retrieval.
    """

    # keyword found in the question -> candidate replacement terms
    EXPANSION_PATTERNS = {
        "budget": ["cost", "investment", "TIV", "capex", "funding", "allocation", "financial"],
        "location": ["site", "address", "city", "country", "region", "plant", "facility"],
        "timeline": ["schedule", "milestone", "deadline", "completion", "duration", "phase"],
        "challenge": ["risk", "issue", "constraint", "problem", "delay", "obstacle", "barrier"],
        "project": ["plant", "facility", "refinery", "station", "development"],
        "status": ["progress", "state", "condition", "update"],
    }

    def __init__(
        self,
        vector_store: Any,
        reranker: Optional[Any] = None,
        k_initial: int = 12,
        k_final: int = 6,
        use_expansion: bool = True,
        use_reranking: bool = True,
        use_cache: bool = True,
    ) -> None:
        """Configure the retriever.

        Args:
            vector_store: Object exposing ``similarity_search_with_score``
                or ``similarity_search`` (both called as ``(query, k=...)``).
            reranker: Optional object with a ``rerank(query, docs, top_k)``
                method; when omitted one is loaded lazily on first use.
            k_initial: Number of candidates fetched per query variation and
                kept after fusion.
            k_final: Number of documents returned.
            use_expansion: Whether to generate query variations.
            use_reranking: Whether to rerank the fused candidates.
            use_cache: Whether to memoize results per normalized question.
        """
        self.vector_store = vector_store
        self.k_initial = k_initial
        self.k_final = k_final
        self.use_expansion = use_expansion
        self.use_reranking = use_reranking
        self.use_cache = use_cache
        self._cache: Dict[str, List[Document]] = {}
        self._reranker = reranker
        # True once a load has been attempted (or a reranker was injected),
        # so a failed load is not retried on every call.
        self._reranker_loaded = reranker is not None

    def _get_reranker(self) -> Optional[Any]:
        """Return the reranker, loading it on first use.

        Returns:
            The reranker instance, or None if loading raised.
        """
        if self._reranker_loaded:
            return self._reranker

        try:
            from src.services.reranker import get_reranker
            self._reranker = get_reranker("fast")
            self._reranker_loaded = True
            logger.info("Loaded cross-encoder reranker")
        except Exception as e:
            logger.warning(f"Could not load reranker: {e}")
            self._reranker = None
            self._reranker_loaded = True

        return self._reranker

    def _cache_key(self, query: str) -> str:
        """Return the cache key: MD5 hex digest of the lowercased, stripped query."""
        return hashlib.md5(query.lower().strip().encode()).hexdigest()

    def _expand_query_fast(self, query: str) -> List[str]:
        """Build up to three query variations by swapping known keywords.

        For every pattern keyword contained in the query, the first of its
        two leading expansion terms that is not already in the query replaces
        the keyword (whole-word, case-insensitive). At most one variation is
        produced per keyword.

        Returns:
            The original query followed by its variations, capped at three
            entries in total.
        """
        queries = [query]
        query_lower = query.lower()

        for keyword, expansions in self.EXPANSION_PATTERNS.items():
            if keyword not in query_lower:
                continue
            # Escape so a keyword containing regex metacharacters is matched
            # literally.
            pattern = rf'\b{re.escape(keyword)}\b'
            for exp in expansions[:2]:
                if exp.lower() in query_lower:
                    continue
                variation = re.sub(pattern, exp, query, flags=re.IGNORECASE)
                if variation != query and variation not in queries:
                    queries.append(variation)
                    break

        return queries[:3]

    def _reciprocal_rank_fusion(
        self,
        result_lists: List[List[Tuple[Document, float]]],
        k: int = 60,
    ) -> List[Document]:
        """Merge several ranked result lists with reciprocal rank fusion.

        Each occurrence of a document at zero-based position ``rank`` adds
        ``1 / (k + rank + 1)`` to its fused score; the incoming similarity
        scores are ignored.

        Documents are identified by a hash of their *entire* ``page_content``.
        Hashing only a prefix would merge distinct chunks that merely share a
        common header.

        Args:
            result_lists: One ``(document, score)`` list per query variation.
            k: Rank-smoothing constant.

        Returns:
            Unique documents, highest fused score first.
        """
        doc_scores: Dict[str, Dict[str, Any]] = {}

        for results in result_lists:
            for rank, (doc, _) in enumerate(results):
                doc_id = hashlib.md5(doc.page_content.encode()).hexdigest()

                if doc_id not in doc_scores:
                    doc_scores[doc_id] = {"doc": doc, "score": 0}

                doc_scores[doc_id]["score"] += 1.0 / (k + rank + 1)

        sorted_items = sorted(
            doc_scores.values(),
            key=lambda x: x["score"],
            reverse=True,
        )

        return [item["doc"] for item in sorted_items]

    def retrieve(self, question: str) -> List[Document]:
        """Retrieve the most relevant documents for a question.

        Pipeline: cache lookup, optional query expansion, one vector search
        per variation, rank fusion, optional reranking, truncation to
        ``k_final``, and a cache store.

        Args:
            question: User question.

        Returns:
            Up to ``k_final`` documents; an empty list when every vector
            search raised.
        """
        with log_step(logger, "Optimized retrieval"):
            cache_key = self._cache_key(question) if self.use_cache else None
            if cache_key is not None and cache_key in self._cache:
                logger.info("Cache hit - returning cached results")
                # Hand out a copy so callers cannot mutate the cached list.
                return list(self._cache[cache_key])

            if self.use_expansion:
                queries = self._expand_query_fast(question)
                logger.substep(f"Expanded to {len(queries)} queries")
            else:
                queries = [question]

            all_results: List[List[Tuple[Document, float]]] = []

            for i, query in enumerate(queries):
                try:
                    if hasattr(self.vector_store, 'similarity_search_with_score'):
                        results = self.vector_store.similarity_search_with_score(
                            query, k=self.k_initial
                        )
                    else:
                        docs = self.vector_store.similarity_search(
                            query, k=self.k_initial
                        )
                        # No scores available: synthesize descending ones.
                        results = [(doc, 1.0 - j * 0.01) for j, doc in enumerate(docs)]

                    all_results.append(results)
                except Exception as e:
                    # Best-effort: one failed variation must not sink the rest.
                    logger.warning(f"Query {i+1} failed: {e}")

            if not all_results:
                logger.warning("No results from any query")
                return []

            if len(all_results) > 1:
                fused_docs = self._reciprocal_rank_fusion(all_results)
            else:
                fused_docs = [doc for doc, _ in all_results[0]]

            fused_docs = fused_docs[:self.k_initial]
            logger.substep(f"Fused to {len(fused_docs)} documents")

            # Reranking only matters when there is something to cut.
            if self.use_reranking and len(fused_docs) > self.k_final:
                reranker = self._get_reranker()
                if reranker:
                    with log_step(logger, "Cross-encoder reranking"):
                        fused_docs = reranker.rerank(question, fused_docs, self.k_final)

            final_docs = fused_docs[:self.k_final]

            if cache_key is not None:
                self._cache[cache_key] = list(final_docs)

            logger.info(f"Returning {len(final_docs)} documents")
            return final_docs

    def clear_cache(self) -> None:
        """Drop all cached retrieval results."""
        self._cache.clear()

    def get_cache_stats(self) -> Dict[str, int]:
        """Return ``{"cached_queries": <number of cached questions>}``."""
        return {"cached_queries": len(self._cache)}
src/ui/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """User interface components."""
2
+
3
+ from src.ui.gradio_app import GradioApp
4
+
5
+ __all__ = ["GradioApp"]
src/ui/gradio_app.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio web interface for Project Intelligence Hub."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, List
6
+
7
+ import gradio as gr
8
+
9
+ from src.config.settings import Settings, Neo4jConfig, TogetherAIConfig
10
+ from src.models.state import AppState
11
+ from src.services.builder import GraphRAGBuilder
12
+ from src.services.answerer import QueryAnswerer
13
+ from src.services.neo4j_service import Neo4jService, Neo4jConnectionError
14
+
15
+
16
class GradioApp:
    """Gradio controller for ingestion and query-time interactions."""

    TITLE = "Project Intelligence Hub"
    DESCRIPTION = """
# Project Intelligence Hub

Transform unstructured PDF reports into a queryable knowledge graph.

1. **Ingest** — Upload documents to extract entities and relationships
2. **Index** — Build vector embeddings and graph structure
3. **Query** — Retrieve answers via hybrid graph + semantic search
"""

    # Read-only Cypher used by the "Graph Explorer" panel.
    GRAPH_EXPLORER_QUERIES = {
        "node_labels": """
            CALL db.labels() YIELD label
            CALL { WITH label MATCH (n) WHERE label IN labels(n) RETURN count(n) AS cnt }
            RETURN label, cnt ORDER BY cnt DESC
        """,
        "relationship_types": """
            CALL db.relationshipTypes() YIELD relationshipType
            CALL { WITH relationshipType MATCH ()-[r]->() WHERE type(r) = relationshipType RETURN count(r) AS cnt }
            RETURN relationshipType, cnt ORDER BY cnt DESC
        """,
        "sample_projects": """
            MATCH (p:Project)
            OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
            OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
            RETURN p.name AS project, b.amount AS budget, b.currency AS currency,
                   l.city AS city, l.country AS country
            LIMIT 10
        """,
    }

    def __init__(self, settings: Settings | None = None) -> None:
        """Create the app.

        Args:
            settings: Application settings; loaded with
                ``Settings.from_env()`` when omitted.
        """
        self.settings = settings or Settings.from_env()
        self.answerer = QueryAnswerer()
        self._validate_settings()

    def _validate_settings(self) -> None:
        """Print a warning for each required setting that is empty.

        Never raises: the UI still starts so the problem is visible there.
        """
        issues = []
        if not self.settings.together_ai.api_key:
            issues.append("TOGETHER_API_KEY not set in .env")
        if not self.settings.neo4j.uri:
            issues.append("NEO4J_URI not set in .env")
        if not self.settings.neo4j.password:
            issues.append("NEO4J_PASSWORD not set in .env")

        if issues:
            print("Configuration warnings:")
            for issue in issues:
                print(f"  - {issue}")

    def _open_neo4j(self) -> Neo4jService:
        """Build a Neo4jService from the configured connection settings."""
        return Neo4jService(
            uri=self.settings.neo4j.uri,
            user=self.settings.neo4j.username,
            password=self.settings.neo4j.password,
            database=self.settings.neo4j.database,
        )

    def _ingest_action(self, pdf_files: List[Any], clear_db: str):
        """Run the ingestion pipeline, streaming progress to the UI.

        Args:
            pdf_files: Uploaded PDF files from the file widget.
            clear_db: "Yes" to reset the graph before ingesting.

        Yields:
            ``(status_markdown, progress_bar_update, state)`` triples; state
            is None on validation errors and failures.
        """
        clear_db_bool = clear_db == "Yes"

        if not pdf_files:
            yield "No documents provided. Upload at least one PDF.", gr.update(value=0, visible=True), None
            return

        if not self.settings.together_ai.api_key:
            yield "Missing API credentials: TOGETHER_API_KEY", gr.update(value=0, visible=True), None
            return

        if not self.settings.neo4j.uri or not self.settings.neo4j.password:
            yield "Missing database credentials: NEO4J_URI or NEO4J_PASSWORD", gr.update(value=0, visible=True), None
            return

        together_config = TogetherAIConfig(
            api_key=self.settings.together_ai.api_key,
            chat_model=self.settings.together_ai.chat_model,
            embedding_model=self.settings.together_ai.embedding_model,
        )

        neo4j_config = Neo4jConfig(
            uri=self.settings.neo4j.uri,
            username=self.settings.neo4j.username,
            password=self.settings.neo4j.password,
            database=self.settings.neo4j.database,
        )

        try:
            builder = GraphRAGBuilder(together_config=together_config)

            final_state = None
            for status, progress, state in builder.ingest_with_progress(
                pdf_files=pdf_files,
                neo4j_config=neo4j_config,
                clear_db=clear_db_bool,
                skip_llm_extraction=True,
            ):
                yield status, gr.update(value=progress, visible=True), state
                # Remember the latest non-None state for the final message.
                if state is not None:
                    final_state = state

            if final_state:
                yield "Pipeline complete. Ready for queries.", gr.update(value=1.0, visible=False), final_state

        except ValueError as e:
            yield f"Configuration error: {e}", gr.update(value=0, visible=True), None
        except Exception as e:
            # UI boundary: print the traceback and report instead of crashing.
            import traceback
            traceback.print_exc()
            yield f"Pipeline failed: {e}", gr.update(value=0, visible=True), None

    def _clear_action(self) -> str:
        """Delete everything in the graph database and report the outcome."""
        if not self.settings.neo4j.uri or not self.settings.neo4j.password:
            return "Database credentials not configured."

        try:
            with self._open_neo4j() as neo4j:
                neo4j.clear()
                return "Graph database cleared. All nodes and relationships removed."
        except Neo4jConnectionError as e:
            return f"Connection error: {e}"
        except Exception as e:
            return f"Operation failed: {e}"

    def _ask_action(self, question: str, state: AppState) -> str:
        """Answer a question by delegating to ``self.answerer.answer``."""
        return self.answerer.answer(question, state)

    def _explore_graph_action(self) -> str:
        """Render node, relationship and sample-project tables as Markdown.

        Each of the three sections is fetched independently; a failing query
        yields an "(unable to fetch)" row instead of aborting the report.
        """
        if not self.settings.neo4j.uri or not self.settings.neo4j.password:
            return "Database credentials not configured."

        try:
            with self._open_neo4j() as neo4j:
                output = []

                # Node counts by label
                output.append("### Node Distribution\n")
                output.append("| Label | Count |")
                output.append("|-------|-------|")
                try:
                    results = neo4j.query(self.GRAPH_EXPLORER_QUERIES["node_labels"])
                    for row in results:
                        output.append(f"| {row['label']} | {row['cnt']:,} |")
                except Exception:
                    output.append("| (unable to fetch) | - |")

                # Relationship counts
                output.append("\n### Relationship Distribution\n")
                output.append("| Type | Count |")
                output.append("|------|-------|")
                try:
                    results = neo4j.query(self.GRAPH_EXPLORER_QUERIES["relationship_types"])
                    for row in results:
                        output.append(f"| {row['relationshipType']} | {row['cnt']:,} |")
                except Exception:
                    output.append("| (unable to fetch) | - |")

                # Sample projects
                output.append("\n### Sample Projects\n")
                output.append("| Project | Budget | Location |")
                output.append("|---------|--------|----------|")
                try:
                    results = neo4j.query(self.GRAPH_EXPLORER_QUERIES["sample_projects"])
                    if not results:
                        output.append("| (no projects found) | - | - |")
                    for row in results:
                        name = row.get('project') or '-'
                        budget = f"{row.get('budget') or '-'} {row.get('currency') or ''}".strip()
                        location = f"{row.get('city') or ''}, {row.get('country') or ''}".strip(", ")
                        output.append(f"| {name} | {budget} | {location or '-'} |")
                except Exception:
                    output.append("| (unable to fetch) | - | - |")

                return "\n".join(output)

        except Neo4jConnectionError as e:
            return f"Connection error: {e}"
        except Exception as e:
            return f"Failed to fetch graph data: {e}"

    def build(self) -> gr.Blocks:
        """Assemble the Gradio layout and wire the button callbacks.

        Returns:
            The constructed ``gr.Blocks`` app (not yet launched).
        """
        with gr.Blocks(title=self.TITLE) as demo:
            gr.Markdown(self.DESCRIPTION)

            # Holds the state object yielded by ingestion; None until then.
            state = gr.State(value=None)

            with gr.Group():
                pdfs = gr.File(
                    label="Document Source",
                    file_types=[".pdf"],
                    file_count="multiple",
                )

                with gr.Row():
                    clear_toggle = gr.Radio(
                        label="Reset graph before ingestion",
                        choices=["Yes", "No"],
                        value="Yes",
                        scale=1,
                    )

                with gr.Row():
                    ingest_btn = gr.Button("Run Ingestion Pipeline", variant="primary", scale=2)
                    clear_btn = gr.Button("Reset Graph", variant="secondary", scale=1)

                progress_bar = gr.Slider(
                    label="Progress",
                    minimum=0,
                    maximum=1,
                    value=0,
                    interactive=False,
                    visible=False,
                )

                ingest_status = gr.Markdown()

            gr.Markdown("---")

            with gr.Group():
                gr.Markdown("### Query Interface")
                question = gr.Textbox(
                    label="Natural Language Query",
                    placeholder="e.g., Compare budget allocations and milestone timelines across projects",
                    lines=2,
                )
                ask_btn = gr.Button("Execute Query", variant="primary")
                answer = gr.Markdown(label="Response")

            with gr.Accordion("Graph Explorer", open=False):
                gr.Markdown("View database contents without direct access to credentials.")
                explore_btn = gr.Button("Load Graph Statistics", variant="secondary")
                graph_stats = gr.Markdown()

            with gr.Accordion("System Configuration", open=False):
                gr.Markdown(self._get_config_status())

            ingest_btn.click(
                fn=self._ingest_action,
                inputs=[pdfs, clear_toggle],
                outputs=[ingest_status, progress_bar, state],
            )

            clear_btn.click(
                fn=self._clear_action,
                inputs=[],
                outputs=[ingest_status],
            )

            ask_btn.click(
                fn=self._ask_action,
                inputs=[question, state],
                outputs=[answer],
            )

            explore_btn.click(
                fn=self._explore_graph_action,
                inputs=[],
                outputs=[graph_stats],
            )

        return demo

    def _get_config_status(self) -> str:
        """Return a Markdown table summarizing which services are configured.

        NOTE(review): "Connected" only means the corresponding setting is
        non-empty; no connection is attempted here.
        """
        def status(value: str) -> str:
            return "Connected" if value else "Not configured"

        return f"""
| Component | Status |
|-----------|--------|
| LLM Provider (Together AI) | {status(self.settings.together_ai.api_key)} |
| Graph Database (Neo4j) | {status(self.settings.neo4j.uri)} |
"""

    def launch(self, **kwargs) -> None:
        """Build the UI and start the Gradio server.

        Host, port and theme default to the app settings and the Soft theme.
        Any keyword argument supplied by the caller, including ``theme``,
        overrides the corresponding default; previously a caller-supplied
        ``theme`` clashed with the hard-coded one and raised ``TypeError``.

        Args:
            **kwargs: Extra options forwarded to ``demo.launch``.
        """
        demo = self.build()
        launch_options = {
            "server_name": self.settings.app.host,
            "server_port": self.settings.app.port,
            "theme": gr.themes.Soft(),
        }
        launch_options.update(kwargs)
        demo.launch(**launch_options)