# File size: 16,917 Bytes
# 8c35759
"""Query answering service with hybrid strategy."""

from __future__ import annotations

from typing import Any, Dict, List, Optional, Set, Tuple

from langchain.schema import Document

from src.config import get_logger, trace_flow, log_step
from src.models.state import AppState
from src.services.retriever import OptimizedRetriever
from src.services.cache import AnswerCache, get_answer_cache
from src.services.cypher_templates import (
    CypherTemplateRouter,
    TemplateResultFormatter,
    QueryIntent,
)

# Module-level logger shared by every method of QueryAnswerer below.
# It is obtained from the project's get_logger helper; besides the usual
# info/warning calls, the code in this file also relies on its `substep` method.
logger = get_logger(__name__)


class QueryAnswerer:
    """Answers user questions using an optimized hybrid strategy.

    The public entry point is :meth:`answer`; :meth:`clear_cache` and
    :meth:`get_cache_stats` expose the optional answer cache.

    Strategy:
        1) Template-first routing: Pattern matching classifies intent and
           executes pre-validated Cypher templates for most queries.
           This is deterministic, fast, and reliable.

        2) For general queries: GraphRAG with optimized retrieval:
           - Pattern-based query expansion (no LLM)
           - Cross-encoder reranking (faster than LLM)
           - Single LLM call for synthesis only

    NOTE(review): the helpers ``_budget_location``, ``_timelines`` and
    ``_challenges`` read ``self.CYPHER_BUDGET_LOCATION``,
    ``self.CYPHER_TIMELINES`` and ``self.CYPHER_CHALLENGES``. None of these
    constants is defined in this class body, and ``answer`` does not call
    those helpers; presumably they are legacy code or expect a subclass to
    provide the queries -- confirm before relying on them.
    """

    # Default number of chunks to retrieve when the caller does not pass `k`.
    DEFAULT_K = 6

    # Prompt template for the single synthesis LLM call. It is filled with
    # str.format using the fields `question`, `context` and `graph_context`;
    # the trailing .strip() removes surrounding whitespace from the literal.
    SYNTHESIS_PROMPT = """You are an expert analyst for industrial project reports.

## Question
{question}

## Retrieved Document Excerpts
{context}

## Graph Database Context
{graph_context}

## Instructions
1. Answer directly and concisely based on the evidence
2. If information is incomplete, acknowledge what's missing
3. For comparison questions, structure answer by project
4. Use citations like [1], [2] to reference sources
5. For challenges/risks, consider: cancellation reasons, delays, funding issues, permitting

Answer:""".strip()

    def __init__(
        self,
        k: int = DEFAULT_K,
        use_optimized_retrieval: bool = True,
        use_caching: bool = True,
        cache_ttl: float = 3600,
        use_reranking: bool = True,
    ) -> None:
        """Set up the answerer's configuration, template router and cache.

        Args:
            k: Number of chunks to retrieve for similarity search.
            use_optimized_retrieval: If True, general questions go through
                the OptimizedRetriever. If False, a plain vector-store
                similarity search with ``k`` results is used instead.
            use_caching: If True, an answer cache is obtained and answers
                are stored in / served from it.
            cache_ttl: Value forwarded as ``default_ttl`` when the cache is
                obtained (presumably seconds -- TODO confirm against
                ``get_answer_cache``).
            use_reranking: Forwarded to the OptimizedRetriever as its
                ``use_reranking`` flag.
        """
        # Plain configuration values, kept exactly as supplied.
        self.k = k
        self.use_reranking = use_reranking
        self.use_caching = use_caching
        self.use_optimized_retrieval = use_optimized_retrieval

        # The retriever needs the vector store from AppState, so it is
        # created on first use rather than here.
        self._retriever: Optional[OptimizedRetriever] = None

        # Router used by answer() to classify intent and run Cypher templates.
        self._template_router = CypherTemplateRouter()

        # Only obtain a cache when caching was requested; otherwise stay None.
        self._cache: Optional[AnswerCache] = (
            get_answer_cache(default_ttl=cache_ttl) if use_caching else None
        )

    def _format_citations(self, docs: List[Document]) -> str:
        """Format unique citations from retrieved chunk documents.

        Args:
            docs: List of retrieved documents.

        Returns:
            Formatted citation string.
        """
        seen: Set[Tuple[str, Optional[int]]] = set()
        lines: List[str] = []

        for doc in docs:
            src = doc.metadata.get("source", "")
            page = doc.metadata.get("page", None)
            key = (src, page)

            if key in seen:
                continue
            seen.add(key)

            if page is not None:
                lines.append(f"- {src} p.{page}")
            else:
                lines.append(f"- {src}")

        return "\n".join(lines)

    def _format_budget_value(
        self,
        budget: Optional[Any],
        currency: Optional[str]
    ) -> str:
        """Format budget value for display.

        Args:
            budget: Budget amount (may be None or numeric).
            currency: Currency code.

        Returns:
            Formatted budget string.
        """
        if isinstance(budget, (int, float)) and currency:
            return f"{budget:,.0f} {currency}"
        elif budget:
            return str(budget)
        return "—"

    def _format_location(self, row: Dict[str, Any]) -> str:
        """Format location components into a string.

        Args:
            row: Query result row with location fields.

        Returns:
            Formatted location string.
        """
        loc_parts = [
            x for x in [
                row.get("address"),
                row.get("city"),
                row.get("state"),
                row.get("postal"),
                row.get("country"),
            ] if x
        ]
        return ", ".join(loc_parts) if loc_parts else "—"

    def _budget_location(self, graph: Any) -> str:
        """Deterministic answer for budget allocation and location.

        Args:
            graph: Neo4jGraph instance.

        Returns:
            Formatted budget and location answer.
        """
        rows = graph.query(self.CYPHER_BUDGET_LOCATION)

        if not rows:
            return "No structured budget/location data found in the graph yet."

        out = ["**Budget allocation (TIV) and location**"]
        for row in rows:
            budget_str = self._format_budget_value(
                row.get("budget"),
                row.get("currency"),
            )
            loc = self._format_location(row)
            out.append(f"- **{row.get('project')}**: {budget_str}; {loc}")

        return "\n".join(out)

    def _timelines(self, graph: Any) -> str:
        """Build a per-project Markdown listing of extracted milestones.

        NOTE(review): ``self.CYPHER_TIMELINES`` is not defined in the visible
        class body, so this raises AttributeError unless the attribute is
        supplied elsewhere -- confirm whether this helper is still in use.

        Args:
            graph: Object with a ``query(cypher)`` method (a Neo4jGraph
                instance according to the original documentation).

        Returns:
            A heading, then one ``###`` section per row with up to 14
            milestone bullets, or a fixed "no data" message when the query
            yields nothing.
        """
        records = graph.query(self.CYPHER_TIMELINES)
        logger.info(f"Timeline query returned {len(records) if records else 0} rows")

        if not records:
            return "No structured timeline data found in the graph yet."

        lines = ["**Timelines (milestones extracted from Schedule)**"]
        for record in records:
            title = record.get('project') or 'Unknown Project'
            lines.append(f"\n### {title}")

            raw = record.get("milestones") or []
            logger.info(f"Project '{title}': {len(raw)} milestones raw")

            # Drop empty entries and entries without a name (the original
            # comment attributes these to OPTIONAL MATCH returning nulls).
            usable = [ms for ms in raw if ms and ms.get("name")]
            logger.info(f"Project '{title}': {len(usable)} valid milestones")

            if not usable:
                lines.append("- No milestones extracted")
                continue

            # Cap the listing at 14 milestones per project.
            for ms in usable[:14]:
                when = (ms.get("dateText") or "").strip()
                label = (ms.get("name") or "Milestone").strip()
                # Without a date, fall back to the first 100 characters of
                # the milestone's sentence.
                detail = when if when else (ms.get('sentence') or '')[:100]
                lines.append(f"- {label}: {detail}")

        rendered = "\n".join(lines)
        logger.info(f"Timeline result: {len(rendered)} chars")
        return rendered

    def _challenges(self, graph: Any) -> str:
        """Deterministic challenges listing from structured Challenge nodes.

        Args:
            graph: Neo4jGraph instance.

        Returns:
            Formatted challenges answer.
        """
        rows = graph.query(self.CYPHER_CHALLENGES)

        if not rows:
            return "No structured challenges found yet."

        out = [
            "**Potential challenges / constraints "
            "(from Status reason + Details + schedule heuristics)**"
        ]
        for row in rows:
            out.append(f"\n### {row['project']}")
            challenges = [x for x in (row.get("challenges") or []) if x]

            if not challenges:
                out.append("- —")
            else:
                for ch in challenges[:14]:  # Limit display
                    out.append(f"- {ch}")

        return "\n".join(out)

    def _get_graph_context(self, question: str, graph: Any) -> str:
        """Get relevant graph context without LLM Cypher generation.

        Uses simple pattern matching to find related entities.

        Args:
            question: User question
            graph: Neo4j graph instance

        Returns:
            Formatted graph context string
        """
        import re

        # Extract potential project names from question
        potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)

        if not potential_names:
            return ""

        context_parts = []

        for name in potential_names[:2]:
            try:
                results = graph.query("""
                    MATCH (p:Project)
                    WHERE toLower(p.name) CONTAINS toLower($name)
                    OPTIONAL MATCH (p)-[:HAS_BUDGET]->(b:Budget)
                    OPTIONAL MATCH (p)-[:LOCATED_IN]->(l:Location)
                    RETURN p.name AS project,
                           p.status AS status,
                           b.amount AS budget,
                           b.currency AS currency,
                           l.city AS city,
                           l.country AS country
                    LIMIT 3
                """, {"name": name.lower()})

                for r in results:
                    parts = [f"**{r['project']}**"]
                    if r.get('status'):
                        parts.append(f"Status: {r['status']}")
                    if r.get('budget'):
                        parts.append(f"Budget: {r['budget']:,.0f} {r.get('currency', '')}")
                    if r.get('city'):
                        parts.append(f"Location: {r['city']}, {r.get('country', '')}")
                    context_parts.append(" | ".join(parts))

            except Exception:
                pass

        return "\n".join(context_parts) if context_parts else ""

    def _get_retriever(self, state: AppState) -> OptimizedRetriever:
        """Get or create the optimized retriever.

        Args:
            state: Application state with vector store.

        Returns:
            OptimizedRetriever instance (fast pattern-based + cross-encoder).
        """
        if self._retriever is None:
            self._retriever = OptimizedRetriever(
                vector_store=state.vector,
                k_initial=self.k * 2,  # Retrieve more initially for reranking
                k_final=self.k,
                use_expansion=True,
                use_reranking=self.use_reranking,
                use_cache=True,
            )
        return self._retriever

    def _format_context(self, docs: List[Document]) -> str:
        """Format retrieved documents into context string.

        Args:
            docs: List of retrieved documents.

        Returns:
            Formatted context string with source attribution.
        """
        context_parts = []
        for i, doc in enumerate(docs, 1):
            source = doc.metadata.get('source', 'Unknown')
            page = doc.metadata.get('page', '?')
            section = doc.metadata.get('section', '')

            header = f"[{i}] Source: {source}, Page {page}"
            if section:
                header += f", Section: {section}"

            context_parts.append(f"{header}\n{doc.page_content}")

        return "\n\n---\n\n".join(context_parts)

    def _graphrag_answer(
        self,
        question: str,
        state: AppState,
    ) -> str:
        """Generate an answer from retrieved chunks plus graph context.

        Flow:
        1. Retrieve chunks, via the OptimizedRetriever when
           ``use_optimized_retrieval`` is set, otherwise via a plain
           similarity search on ``state.vector``.
        2. Look up graph context with ``_get_graph_context`` (no LLM).
        3. Fill ``SYNTHESIS_PROMPT`` and make a single ``state.llm.invoke``
           call.
        4. Store the answer in the cache when caching is active.

        Args:
            question: User question.
            state: Application state providing ``vector``, ``llm`` and
                ``get_graph()``.

        Returns:
            The ``content`` attribute of the LLM response, or ``str()`` of
            the response when it has no such attribute.
        """
        with log_step(logger, "GraphRAG answer generation"):
            # Retrieve relevant chunks
            with log_step(logger, "Retrieve relevant chunks"):
                if self.use_optimized_retrieval:
                    logger.substep("Using optimized retrieval (pattern expansion + cross-encoder)")
                    retriever = self._get_retriever(state)
                    docs = retriever.retrieve(question)
                else:
                    logger.substep("Using simple similarity search")
                    docs = state.vector.similarity_search(question, k=self.k)
                logger.info(f"Retrieved {len(docs)} chunks")

            # Get graph context (fast, no LLM)
            with log_step(logger, "Get graph context"):
                graph = state.get_graph()
                graph_context = self._get_graph_context(question, graph)
                if graph_context:
                    logger.substep("Found graph context")
                else:
                    logger.substep("No direct graph context found")

            # Format context
            context = self._format_context(docs)

            # Single LLM call for synthesis
            with log_step(logger, "Synthesize answer"):
                logger.substep("Invoking LLM for synthesis")
                synthesis_prompt = self.SYNTHESIS_PROMPT.format(
                    question=question,
                    context=context,
                    # The prompt always gets some text in the graph section.
                    graph_context=graph_context or "(No structured data found)",
                )

                resp = state.llm.invoke(synthesis_prompt)
                answer = getattr(resp, "content", str(resp))

            # Cache the answer. Compare against None explicitly: a cache
            # object that is falsy while empty (e.g. one defining __len__)
            # would otherwise never receive its first entry.
            if self._cache is not None and self.use_caching:
                logger.substep("Caching answer")
                self._cache.set_answer(
                    query=question,
                    answer=answer,
                    documents=docs,
                    cypher_result=graph_context,
                )

        return answer

    def clear_cache(self) -> int:
        """Clear the answer cache.

        Returns:
            Number of cached entries cleared.
        """
        if self._cache:
            return self._cache.invalidate_all()
        return 0

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dictionary with cache metrics.
        """
        if self._cache:
            return self._cache.get_stats()
        return {"caching_enabled": False}

    @trace_flow("Query Processing")
    def answer(self, question: str, state: AppState) -> str:
        """Answer a user question using the hybrid template / RAG approach.

        Flow:
        1. Return early with a hint when ``state`` is missing or not ready.
        2. Serve the answer from the cache when caching is active and the
           question is found there.
        3. Route the question through the template router; for any intent
           other than ``QueryIntent.GENERAL`` with non-None results, format
           those results (no LLM), cache them and return.
        4. Otherwise fall back to ``_graphrag_answer`` (retrieval +
           single LLM synthesis call).

        Args:
            question: Natural language user query.
            state: AppState; must report ``is_ready()`` before questions
                can be answered.

        Returns:
            Markdown response suitable for display.
        """
        logger.info(f"Processing question: {question[:80]}...")

        if not state or not state.is_ready():
            logger.warning("State not ready - PDFs not ingested")
            return "Please ingest PDFs first."

        # Compare against None explicitly: a cache object that is falsy
        # while empty (e.g. one defining __len__) would otherwise be skipped
        # for both reads and writes and could never be populated.
        caching_active = self._cache is not None and self.use_caching

        # Check cache first
        if caching_active:
            with log_step(logger, "Check cache"):
                cached = self._cache.get_answer(question)
                if cached:
                    logger.info("Cache hit")
                    return cached.answer

        graph = state.get_graph()

        # Try template routing first
        with log_step(logger, "Template routing"):
            results, intent = self._template_router.route_query(question, graph)

            if intent != QueryIntent.GENERAL and results is not None:
                # Format template results (no LLM needed)
                answer = TemplateResultFormatter.format(results, intent)

                # Cache the answer; only a short preview of the rows (the
                # first three) is stored alongside it.
                if caching_active:
                    self._cache.set_answer(
                        query=question,
                        answer=answer,
                        documents=[],
                        cypher_result=str(results[:3]) if results else "",
                    )

                logger.info(f"Template answer (intent: {intent.value})")
                return answer

            logger.info(f"Intent: {intent.value} - using RAG fallback")

        # GraphRAG fallback for general queries
        answer = self._graphrag_answer(question, state)
        logger.info("RAG answer generated")
        return answer