niloydebbarma committed on
Commit
9e5bc69
·
verified ·
1 Parent(s): c9efcfc

Upload 8 files

Browse files
query_graph_functions/__init__.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query Graph Functions Package
3
+
4
+ Core modules for graph-based retrieval augmentation implementation.
5
+
6
+ Package contents:
7
+ - setup.py: Initialization and connection functionality (Phase A: Steps 1-2)
8
+ - query_preprocessing.py: Query analysis, routing, and vectorization (Phase B: Steps 3-5)
9
+ - knowledge_retrieval.py: Community search and data extraction (Phase C: Steps 6-8)
10
+ - follow_up_search.py: Follow-up search and entity extraction (Phase D: Steps 9-12)
11
+ - vector_augmentation.py: Vector search enhancement (Phase E: Steps 13-14)
12
+ - answer_synthesis.py: Final answer generation (Phase F: Steps 15-16)
13
+ - response_management.py: Metadata generation and file persistence (Phase G: Steps 17-20)
14
+ """
15
+
16
+ # Phase A: Initialization (Steps 1-2)
17
+ from .setup import GraphRAGSetup, create_graphrag_setup
18
+
19
+ # Phase B: Query Preprocessing (Steps 3-5)
20
+ from .query_preprocessing import (
21
+ QueryAnalyzer,
22
+ DriftRouter,
23
+ QueryVectorizer,
24
+ QueryPreprocessor,
25
+ create_query_preprocessor,
26
+ preprocess_query_pipeline,
27
+ QueryAnalysis,
28
+ DriftRoutingResult,
29
+ VectorizedQuery,
30
+ QueryType,
31
+ SearchStrategy
32
+ )
33
+
34
+ # Phase C: Knowledge Retrieval (Steps 6-8)
35
+ from .knowledge_retrieval import (
36
+ CommunitySearchEngine,
37
+ CommunityResult,
38
+ EntityResult,
39
+ RelationshipResult
40
+ )
41
+
42
+ # Phase D: Follow-up Search (Steps 9-12)
43
+ from .follow_up_search import (
44
+ FollowUpSearch,
45
+ FollowUpQuestion,
46
+ LocalSearchResult,
47
+ IntermediateAnswer
48
+ )
49
+
50
+ # Phase E: Vector Search Augmentation (Steps 13-14)
51
+ from .vector_augmentation import (
52
+ VectorAugmentationEngine,
53
+ VectorSearchResult,
54
+ AugmentationResult
55
+ )
56
+
57
+ # Phase F: Answer Synthesis (Steps 15-16)
58
+ from .answer_synthesis import (
59
+ AnswerSynthesisEngine,
60
+ SynthesisResult,
61
+ SourceEvidence
62
+ )
63
+
64
+ # Phase G: Response Management (Steps 17-20)
65
+ from .response_management import (
66
+ ResponseManager,
67
+ ResponseMetadata
68
+ )
69
+
70
+ __version__ = "1.3.0"
71
+ __author__ = "AllyCat GraphRAG Team"
72
+ __description__ = "Graph-based retrieval augmentation implementation for AllyCat"
73
+
74
+ # Export main classes and functions
75
+ __all__ = [
76
+ # Phase A: Initialization
77
+ "GraphRAGSetup",
78
+ "create_graphrag_setup",
79
+ # Phase B: Query Preprocessing
80
+ "QueryAnalyzer",
81
+ "DriftRouter",
82
+ "QueryVectorizer",
83
+ "QueryPreprocessor",
84
+ "create_query_preprocessor",
85
+ "preprocess_query_pipeline",
86
+ "QueryAnalysis",
87
+ "DriftRoutingResult",
88
+ "VectorizedQuery",
89
+ "QueryType",
90
+ "SearchStrategy",
91
+ # Phase C: Knowledge Retrieval
92
+ "CommunitySearchEngine",
93
+ "CommunityResult",
94
+ "EntityResult",
95
+ "RelationshipResult",
96
+ # Phase D: Follow-up Search
97
+ "FollowUpSearch",
98
+ "FollowUpQuestion",
99
+ "LocalSearchResult",
100
+ "IntermediateAnswer",
101
+ # Phase E: Vector Augmentation
102
+ "VectorAugmentationEngine",
103
+ "VectorSearchResult",
104
+ "AugmentationResult",
105
+ # Phase F: Answer Synthesis
106
+ "AnswerSynthesisEngine",
107
+ "SynthesisResult",
108
+ "SourceEvidence",
109
+ # Phase G: Response Management
110
+ "ResponseManager",
111
+ "ResponseMetadata"
112
+ ]
query_graph_functions/answer_synthesis.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Answer synthesis module for final response generation. - Phase F (Steps 15-16)"""
2
+
3
+ import logging
4
+ import json
5
+ from typing import Dict, List, Any, Optional
6
+ from dataclasses import dataclass
7
+ from datetime import datetime
8
+
9
+ from .setup import GraphRAGSetup
10
+ from .query_preprocessing import DriftRoutingResult, QueryAnalysis
11
+ from .vector_augmentation import AugmentationResult
12
+
13
+
14
@dataclass
class SourceEvidence:
    """A single piece of evidence feeding answer synthesis, with attribution.

    Instances are built in Step 15 from Phase C/D/E outputs and ranked by
    ``confidence`` before being handed to the LLM prompt.
    """
    source_type: str  # 'community', 'entity', 'relationship', 'vector_doc'
    source_id: str  # identifier of the originating community/answer/document
    content: str  # text excerpt used in the synthesis prompt
    confidence: float  # similarity/confidence score; presumably in [0, 1] — confirm upstream
    phase: str  # originating pipeline phase: 'C', 'D', 'E'
22
+
23
+
24
@dataclass
class SynthesisResult:
    """Phase F synthesis result with comprehensive answer.

    Produced by ``AnswerSynthesisEngine.execute_answer_synthesis_phase``;
    a fallback instance (empty evidence, zero confidence) is returned on failure.
    """
    final_answer: str  # formatted markdown answer shown to the user
    confidence_score: float  # combined evidence/coverage/diversity score, capped at 1.0
    source_evidence: List[SourceEvidence]  # ranked evidence actually used (top 15)
    synthesis_strategy: str  # 'comprehensive_drift' on success, 'fallback' on error
    coverage_assessment: Dict[str, float]  # per-phase coverage ratios plus overall confidence
    execution_time: float  # wall-clock seconds for the whole phase
    metadata: Dict[str, Any]  # bookkeeping: source counts, phase coverage, error info
34
+
35
+
36
class AnswerSynthesisEngine:
    """
    Answer synthesis engine implementing Phase F (Steps 15-16).

    Handles final answer generation process:
    - Context assembly and evidence ranking (Step 15)
    - Final answer generation with confidence scoring (Step 16)

    All public entry points degrade gracefully: on any internal failure a
    fallback answer/result is returned instead of raising.
    """

    def __init__(self, setup: GraphRAGSetup):
        """Bind shared resources (LLM, config) from the GraphRAG setup object."""
        self.setup = setup
        self.llm = setup.llm
        self.config = setup.config
        self.logger = logging.getLogger(self.__class__.__name__)

        # Synthesis parameters (tunable thresholds, currently fixed)
        self.min_confidence_threshold = 0.7
        self.max_synthesis_length = 2000

    async def execute_answer_synthesis_phase(self,
                                             analysis: QueryAnalysis,
                                             routing: DriftRoutingResult,
                                             community_results: Dict[str, Any],
                                             follow_up_results: Dict[str, Any],
                                             augmentation_results: AugmentationResult) -> SynthesisResult:
        """
        Execute answer synthesis phase with comprehensive integration.

        Args:
            analysis: Query analysis results
            routing: Routing decision parameters
            community_results: Community search results (Phase C)
            follow_up_results: Follow-up search results (Phase D)
            augmentation_results: Vector augmentation results (Phase E)

        Returns:
            Synthesis result with final answer; a fallback SynthesisResult
            (confidence 0.0, no evidence) if any step raises.
        """
        start_time = datetime.now()

        try:
            # Step 15: gather and rank evidence from Phases C, D, E
            self.logger.info("Starting Step 15: Context Assembly and Ranking")
            assembled_context = await self._assemble_and_rank_context(
                analysis, community_results, follow_up_results, augmentation_results
            )

            # Step 16: LLM-guided final answer generation
            self.logger.info("Starting Step 16: Final Answer Generation")
            final_answer, confidence = await self._generate_final_answer(
                analysis, routing, assembled_context
            )

            execution_time = (datetime.now() - start_time).total_seconds()

            synthesis_result = SynthesisResult(
                final_answer=final_answer,
                confidence_score=confidence,
                source_evidence=assembled_context['evidence'],
                synthesis_strategy='comprehensive_drift',
                coverage_assessment=assembled_context['coverage'],
                execution_time=execution_time,
                metadata={
                    'sources_integrated': len(assembled_context['evidence']),
                    'phase_coverage': assembled_context['phase_coverage'],
                    'synthesis_method': 'llm_guided',
                    'phase': 'answer_synthesis',
                    'step_range': '15-16'
                }
            )

            self.logger.info(f"Phase F completed: confidence {confidence:.3f}, {len(assembled_context['evidence'])} sources integrated")
            return synthesis_result

        except Exception as e:
            self.logger.error(f"Answer synthesis phase failed: {e}")
            # Return fallback synthesis on failure
            return self._create_fallback_synthesis(
                community_results, follow_up_results,
                (datetime.now() - start_time).total_seconds(), str(e)
            )

    async def _assemble_and_rank_context(self,
                                         analysis: QueryAnalysis,
                                         community_results: Dict[str, Any],
                                         follow_up_results: Dict[str, Any],
                                         augmentation_results: AugmentationResult) -> Dict[str, Any]:
        """
        Step 15: Assemble and rank all context from Phases C, D, and E.

        Prioritizes information by relevance, confidence, and source diversity.
        Returns a dict with 'evidence' (top-15 ranked SourceEvidence),
        'coverage' (per-phase ratios) and 'phase_coverage' (per-phase counts).
        """
        evidence_sources = []

        # Phase C: one evidence item per retrieved community summary
        if 'communities' in community_results:
            for community in community_results['communities']:
                evidence_sources.append(SourceEvidence(
                    source_type='community',
                    source_id=community.community_id,
                    content=community.summary,
                    confidence=community.similarity_score,
                    phase='C'
                ))

        # Phase D: one evidence item per intermediate Q&A pair
        if 'intermediate_answers' in follow_up_results:
            for answer in follow_up_results['intermediate_answers']:
                evidence_sources.append(SourceEvidence(
                    source_type='entity_search',
                    source_id=f"followup_{len(evidence_sources)}",
                    content=f"Q: {answer.question}\nA: {answer.answer}",
                    confidence=answer.confidence,
                    phase='D'
                ))

        # Phase E: one evidence item per vector search hit
        if augmentation_results and augmentation_results.vector_results:
            for i, vector_result in enumerate(augmentation_results.vector_results):
                evidence_sources.append(SourceEvidence(
                    source_type='vector_doc',
                    source_id=f"vector_{i}",
                    content=vector_result.content,
                    confidence=vector_result.similarity_score,
                    phase='E'
                ))

        # Rank all evidence globally by confidence, highest first
        ranked_evidence = sorted(evidence_sources, key=lambda x: x.confidence, reverse=True)

        # Coverage ratios: evidence captured per phase vs. items available.
        # max(1, ...) guards the division when a phase produced nothing.
        coverage = {
            'community_coverage': len([e for e in ranked_evidence if e.phase == 'C']) / max(1, len(community_results.get('communities', []))),
            'entity_coverage': len([e for e in ranked_evidence if e.phase == 'D']) / max(1, len(follow_up_results.get('intermediate_answers', []))),
            'vector_coverage': len([e for e in ranked_evidence if e.phase == 'E']) / max(1, len(augmentation_results.vector_results) if augmentation_results else 1),
            'overall_confidence': sum(e.confidence for e in ranked_evidence) / max(1, len(ranked_evidence))
        }

        phase_coverage = {
            'phase_c': len([e for e in ranked_evidence if e.phase == 'C']),
            'phase_d': len([e for e in ranked_evidence if e.phase == 'D']),
            'phase_e': len([e for e in ranked_evidence if e.phase == 'E'])
        }

        return {
            'evidence': ranked_evidence[:15],  # Top 15 pieces of evidence
            'coverage': coverage,
            'phase_coverage': phase_coverage
        }

    async def _generate_final_answer(self,
                                     analysis: QueryAnalysis,
                                     routing: DriftRoutingResult,
                                     assembled_context: Dict[str, Any]) -> tuple[str, float]:
        """
        Step 16: Generate comprehensive final answer using LLM synthesis.

        Returns (formatted_answer, confidence). Falls back to a plain
        evidence summary with confidence 0.5 if the LLM call fails.
        """
        try:
            # Build the synthesis prompt from the top-ranked evidence
            synthesis_prompt = self._create_synthesis_prompt(
                routing.original_query,
                assembled_context['evidence']
            )

            # Generate answer (synchronous LLM call; str() normalizes the response object)
            response = self.llm.complete(synthesis_prompt)
            final_answer = str(response).strip()

            # Score the synthesis from evidence quality and coverage
            synthesis_confidence = self._calculate_synthesis_confidence(
                assembled_context['evidence'], assembled_context['coverage']
            )

            # Wrap the raw LLM text with headers and attribution footer
            formatted_answer = self._format_final_answer(
                final_answer, assembled_context['evidence'], synthesis_confidence
            )

            return formatted_answer, synthesis_confidence

        except Exception as e:
            self.logger.error(f"Final answer generation failed: {e}")
            return self._create_fallback_answer(assembled_context['evidence']), 0.5

    def _create_synthesis_prompt(self, original_query: str, evidence: List[SourceEvidence]) -> str:
        """Create comprehensive synthesis prompt for LLM (query + top-10 sources + instructions)."""
        prompt_parts = [
            f"# Query: {original_query}",
            "",
            "You are an expert synthesizing information from multiple sources.",
            "Create a comprehensive, accurate answer using the following evidence:",
            "",
            "## Evidence Sources:",
            ""
        ]

        for i, source in enumerate(evidence[:10], 1):  # Top 10 sources
            prompt_parts.extend([
                f"### Source {i} ({source.phase} - {source.source_type}, confidence: {source.confidence:.3f})",
                # Truncate long evidence to keep the prompt bounded
                source.content[:500] + ("..." if len(source.content) > 500 else ""),
                ""
            ])

        prompt_parts.extend([
            "## Instructions:",
            "1. Synthesize a comprehensive answer addressing the original query",
            "2. Prioritize high-confidence sources (>0.8)",
            "3. Include specific details and examples from the evidence",
            "4. Structure the response clearly with sections if appropriate",
            "5. Do not mention source IDs or technical details",
            "6. Focus on factual accuracy and completeness",
            "",
            "## Comprehensive Answer:"
        ])

        return "\n".join(prompt_parts)

    def _calculate_synthesis_confidence(self, evidence: List[SourceEvidence], coverage: Dict[str, float]) -> float:
        """Calculate overall synthesis confidence based on evidence quality and coverage.

        Weighted blend: 50% mean evidence confidence, 30% mean coverage,
        20% phase diversity (how many of C/D/E contributed). Capped at 1.0.
        """
        if not evidence:
            return 0.0

        evidence_confidence = sum(e.confidence for e in evidence) / len(evidence)
        coverage_score = sum(coverage.values()) / len(coverage)

        # Diversity bonus: fraction of the 3 phases represented in the evidence
        phase_diversity = len(set(e.phase for e in evidence)) / 3.0  # 3 phases max

        synthesis_confidence = (evidence_confidence * 0.5) + (coverage_score * 0.3) + (phase_diversity * 0.2)

        return min(synthesis_confidence, 1.0)

    def _format_final_answer(self, answer: str, evidence: List[SourceEvidence], confidence: float) -> str:
        """Format the final answer with proper structure and attribution footer."""
        formatted_parts = [
            "# Comprehensive Answer",
            "",
            answer,
            "",
            "---",
            "",
            f"**Answer Confidence**: {confidence:.1%}",
            f"**Sources Integrated**: {len(evidence)} evidence sources",
            f"**Multi-Phase Coverage**: {len(set(e.phase for e in evidence))} phases (C: Community, D: Entity, E: Vector)",
            ""
        ]

        return "\n".join(formatted_parts)

    def _create_fallback_answer(self, evidence: List[SourceEvidence]) -> str:
        """Create fallback answer when LLM synthesis fails: concatenate top-3 evidence."""
        if not evidence:
            return "Unable to generate answer due to insufficient evidence."

        fallback_parts = [
            "# Answer Summary",
            "",
            "Based on available evidence:",
            ""
        ]

        for i, source in enumerate(evidence[:3], 1):
            fallback_parts.extend([
                f"## Source {i} (Confidence: {source.confidence:.2f})",
                source.content[:300] + ("..." if len(source.content) > 300 else ""),
                ""
            ])

        return "\n".join(fallback_parts)

    def _create_fallback_synthesis(self, community_results: Dict, follow_up_results: Dict,
                                   execution_time: float, error: str) -> SynthesisResult:
        """Create fallback synthesis result when the whole phase fails."""
        return SynthesisResult(
            # BUGFIX: removed stray leading space from the user-facing message
            final_answer="Response failed due to technical error. Please try again.",
            confidence_score=0.0,
            source_evidence=[],
            synthesis_strategy='fallback',
            coverage_assessment={'overall_confidence': 0.0},
            execution_time=execution_time,
            metadata={'error': error, 'fallback': True}
        )

    def combine_phase_results(self,
                              phase_c_answer: str,
                              follow_up_results: Dict[str, Any],
                              augmentation_results=None) -> str:
        """
        Combine Phase C, D, and E results into enhanced answer.

        Creates a comprehensive markdown response by integrating results from
        multiple phases. Returns the Phase C answer unchanged when there are
        no intermediate answers or when combination fails.
        """
        try:
            intermediate_answers = follow_up_results.get('intermediate_answers', [])

            if not intermediate_answers:
                return phase_c_answer

            # Start with Phase C answer
            enhanced_parts = [
                "## Global Context (Phase C)",
                phase_c_answer.strip(),
                "",
                "## Detailed Information (Phase D)"
            ]

            # Add intermediate answers from Phase D
            for i, answer in enumerate(intermediate_answers, 1):
                enhanced_parts.extend([
                    f"**{i}. {answer.question}**",
                    answer.answer,
                    f"*Confidence: {answer.confidence:.2f}*",
                    ""
                ])

            # Add Phase E vector augmentation if available
            if augmentation_results and hasattr(augmentation_results, 'vector_results') and augmentation_results.vector_results:
                enhanced_parts.extend([
                    "## Vector Augmentation (Phase E)",
                    f"**Semantic Enhancement** (Confidence: {augmentation_results.augmentation_confidence:.2f})",
                    ""
                ])

                # Add top vector results
                for i, vector_result in enumerate(augmentation_results.vector_results[:3], 1):
                    enhanced_parts.extend([
                        f"**Vector Result {i}** (Similarity: {vector_result.similarity_score:.3f})",
                        vector_result.content,  # Show full content without truncation
                        ""
                    ])

            # Add supporting evidence if available
            if intermediate_answers:
                # BUGFIX: sort the de-duplicated entity names so the output
                # is deterministic (joining a raw set gave arbitrary order)
                unique_entities = sorted(set(
                    entity for answer in intermediate_answers
                    for entity in answer.supporting_entities[:3]
                ))
                enhanced_parts.extend([
                    "## Supporting Evidence",
                    "**Key Entities Found:** " + ", ".join(unique_entities),
                    ""
                ])

            return "\n".join(enhanced_parts)

        except Exception as e:
            self.logger.error(f"Failed to combine phase results: {e}")
            return phase_c_answer

    def generate_error_response(self, error_message: str) -> Dict[str, Any]:
        """
        Generate standardized error response.

        Creates consistent error format for failed synthesis operations.
        """
        return {
            "answer": f"Sorry, I encountered an error during answer synthesis: {error_message}",
            "metadata": {
                "status": "synthesis_error",
                "error_message": error_message,
                "synthesis_stage": "failed",
                "confidence_score": 0.0,
                "timestamp": datetime.now().isoformat()
            }
        }
405
+
406
+
407
+ # Exports
408
+ __all__ = ['AnswerSynthesisEngine', 'SynthesisResult', 'SourceEvidence']
query_graph_functions/follow_up_search.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Follow-up search module for local graph traversal. - Phase D (Steps 9-12)"""
2
+
3
+ import logging
4
+ from typing import Dict, List, Any
5
+ from dataclasses import dataclass
6
+ import re
7
+ from datetime import datetime
8
+
9
+ # Project imports
10
+ from .setup import GraphRAGSetup
11
+ from .query_preprocessing import DriftRoutingResult
12
+ from .knowledge_retrieval import CommunityResult, EntityResult, RelationshipResult
13
+
14
+
15
@dataclass
class FollowUpQuestion:
    """A follow-up question carried over from Phase C, with extracted keywords.

    Built in Step 9; ``extracted_entities`` seeds the Step 10 graph traversal.
    """
    question: str  # original follow-up question text from Phase C
    question_id: int  # 1-based position within the batch of questions
    extracted_entities: List[str]  # up to 3 capitalized keywords pulled from the question
    query_type: str  # currently always 'search' — see _process_follow_up_questions
    confidence: float  # fixed at 0.8 by the current implementation
23
+
24
+
25
@dataclass
class LocalSearchResult:
    """Results from local graph traversal (Step 10) for one follow-up question."""
    seed_entities: List[EntityResult]  # entities matched directly from question keywords
    traversed_entities: List[EntityResult]  # seeds plus every neighbor reached during hops
    traversed_relationships: List[RelationshipResult]  # edges crossed during traversal
    search_depth: int  # number of hops actually performed (<= max_traversal_depth)
    total_nodes_visited: int  # size of the visited-node set, seeds included
33
+
34
+
35
@dataclass
class IntermediateAnswer:
    """Intermediate answer for a follow-up question (Step 12 output).

    Consumed by Phase F, which turns each instance into 'entity_search'
    evidence for the final synthesis.
    """
    question_id: int  # matches FollowUpQuestion.question_id
    question: str  # the follow-up question being answered
    answer: str  # current implementation: a comma-joined list of entity names
    confidence: float  # fixed at 0.8 by the current implementation
    reasoning: str  # short human-readable justification
    supporting_entities: List[str]  # names of entities backing the answer (up to 10)
    supporting_evidence: List[str]  # currently always empty — reserved for future use
45
+
46
+
47
class FollowUpSearch:
    """Follow-up search module for local graph traversal (Phase D, Steps 9-12).

    Pipeline: wrap Phase C follow-up questions (Step 9), traverse the graph
    locally from keyword-matched seed entities (Step 10), rank the entities
    found (Step 11), and emit one IntermediateAnswer per question (Step 12).
    """

    def __init__(self, setup: GraphRAGSetup):
        """Bind the shared Neo4j connection from the GraphRAG setup object."""
        self.setup = setup
        self.neo4j_conn = setup.neo4j_conn
        self.logger = logging.getLogger(__name__)

        # Traversal configuration (fixed tuning knobs)
        self.max_traversal_depth = 2
        self.max_entities_per_hop = 20
        self.min_entity_confidence = 0.7
        self.min_relationship_confidence = 0.6

    async def execute_follow_up_phase(self,
                                      phase_c_results: Dict[str, Any],
                                      routing_result: DriftRoutingResult) -> Dict[str, Any]:
        """
        Execute follow-up search pipeline based on initial results.

        Args:
            phase_c_results: Results from community search with follow-up questions
            routing_result: Routing configuration parameters

        Returns:
            Dictionary with intermediate answers and entity information;
            on failure, ``{'error': ..., 'intermediate_answers': []}``.
        """
        try:
            self.logger.info("Starting Follow-up Search (Steps 9-12)")

            # Step 9: wrap raw question strings into FollowUpQuestion objects
            self.logger.info("Starting Step 9: Follow-up Question Processing")
            follow_up_questions = await self._process_follow_up_questions(
                phase_c_results.get('initial_answer', {}).get('follow_up_questions', []),
                routing_result
            )
            self.logger.info(f"Step 9 completed: {len(follow_up_questions)} questions processed")

            # Step 10: local graph traversal, one result per question (aligned)
            self.logger.info("Starting Step 10: Local Graph Traversal")
            local_search_results = await self._execute_local_traversal(
                follow_up_questions,
                phase_c_results.get('communities', []),
                routing_result
            )
            self.logger.info(f"Step 10 completed: {len(local_search_results)} searches performed")

            # Step 11: merge and rank entities across all searches
            self.logger.info("Starting Step 11: Detailed Entity Extraction")
            detailed_entities = await self._extract_detailed_entities(
                local_search_results,
                routing_result
            )
            self.logger.info(f"Step 11 completed: {len(detailed_entities)} detailed entities extracted")

            # Step 12: one intermediate answer per question
            self.logger.info("Starting Step 12: Intermediate Answer Generation")
            intermediate_answers = await self._generate_intermediate_answers(
                follow_up_questions,
                local_search_results,
                detailed_entities,
                routing_result
            )
            self.logger.info(f"Step 12 completed: {len(intermediate_answers)} intermediate answers generated")

            # Compile results
            phase_d_results = {
                'follow_up_questions': follow_up_questions,
                'local_search_results': local_search_results,
                'detailed_entities': detailed_entities,
                'intermediate_answers': intermediate_answers,
                'execution_stats': {
                    'questions_processed': len(follow_up_questions),
                    'local_searches_executed': len(local_search_results),
                    'entities_extracted': len(detailed_entities),
                    'answers_generated': len(intermediate_answers),
                    'timestamp': datetime.now().isoformat()
                }
            }

            self.logger.info(f"Phase D completed: {len(intermediate_answers)} detailed answers generated")
            return phase_d_results

        except Exception as e:
            self.logger.error(f"Phase D execution failed: {e}")
            return {'error': str(e), 'intermediate_answers': []}

    async def _process_follow_up_questions(self,
                                           questions: List[str],
                                           routing_result: DriftRoutingResult) -> List[FollowUpQuestion]:
        """Step 9: wrap questions in FollowUpQuestion objects with extracted keywords."""
        processed_questions = []

        for i, question in enumerate(questions):
            # Heuristic keyword extraction: capitalized words and acronyms,
            # minus common interrogatives
            keywords = re.findall(r'\b[A-Z][a-z]+\b|\b[A-Z]{2,}\b', question)
            keywords = [k for k in keywords if k not in ['What', 'Which', 'Who', 'How', 'Are', 'The']]

            follow_up = FollowUpQuestion(
                question=question,
                question_id=i + 1,
                extracted_entities=keywords[:3],  # Top 3 keywords
                query_type='search',
                confidence=0.8
            )

            processed_questions.append(follow_up)
            self.logger.info(f"Question {i+1}: {question} -> Keywords: {keywords[:3]}")

        return processed_questions

    async def _execute_local_traversal(self,
                                       questions: List[FollowUpQuestion],
                                       communities: List[CommunityResult],
                                       routing_result: DriftRoutingResult) -> List[LocalSearchResult]:
        """
        Step 10: Execute local graph traversal for each follow-up question.

        Returns exactly one LocalSearchResult per question, in question order.
        BUGFIX: the original skipped questions with no seed entities
        (``continue``), which shifted the positional pairing that
        _generate_intermediate_answers relies on; an empty result is now
        appended instead so results stay aligned with questions.
        """
        local_results = []

        for question in questions:
            try:
                # Find seed entities matching the question keywords
                seed_entities = await self._find_seed_entities(
                    question.extracted_entities,
                    communities
                )

                if not seed_entities:
                    self.logger.warning(f"No seed entities found for question: {question.question}")
                    # Keep alignment with `questions` via an empty placeholder
                    local_results.append(LocalSearchResult(
                        seed_entities=[],
                        traversed_entities=[],
                        traversed_relationships=[],
                        search_depth=0,
                        total_nodes_visited=0
                    ))
                    continue

                # Multi-hop traversal from the seeds
                traversal_result = await self._multi_hop_traversal(
                    seed_entities,
                    question,
                    routing_result
                )

                local_results.append(traversal_result)
                self.logger.info(f" Traversal for Q{question.question_id}: {traversal_result.total_nodes_visited} nodes visited")

            except Exception as e:
                self.logger.error(f"Local traversal failed for question {question.question_id}: {e}")
                # Preserve alignment even on failure
                local_results.append(LocalSearchResult(
                    seed_entities=[],
                    traversed_entities=[],
                    traversed_relationships=[],
                    search_depth=0,
                    total_nodes_visited=0
                ))

        return local_results

    async def _find_seed_entities(self,
                                  entity_names: List[str],
                                  communities: List[CommunityResult]) -> List[EntityResult]:
        """Search the graph for entities whose name contains any of the keywords.

        SECURITY FIX: keywords are now passed as a Cypher parameter instead of
        being interpolated into the query string, which allowed Cypher
        injection via crafted question text.
        """
        if not entity_names:
            return []

        # Parameterized search query: ANY(...) replaces the f-string OR chain
        query = """
        MATCH (n)
        WHERE n.name IS NOT NULL
          AND ANY(term IN $terms WHERE n.name CONTAINS term)
        RETURN n.id as entity_id, n.name as name, n.content as content,
               n.confidence as confidence,
               n.degree_centrality as degree_centrality,
               n.betweenness_centrality as betweenness_centrality,
               n.closeness_centrality as closeness_centrality,
               labels(n) as node_types
        ORDER BY n.degree_centrality DESC
        LIMIT 20
        """

        try:
            results = self.neo4j_conn.execute_query(query, {'terms': entity_names})
            entities = []

            for record in results:
                entity = EntityResult(
                    entity_id=record['entity_id'],
                    name=record['name'],
                    content=record['content'],
                    confidence=record['confidence'],
                    degree_centrality=record['degree_centrality'],
                    betweenness_centrality=record['betweenness_centrality'],
                    closeness_centrality=record['closeness_centrality'],
                    # Community membership is not resolved at this point
                    community_id='found',
                    node_type=', '.join(record['node_types']) if record['node_types'] else 'Entity'
                )
                entities.append(entity)

            return entities

        except Exception as e:
            self.logger.error(f"Search failed: {e}")
            return []

    async def _multi_hop_traversal(self,
                                   seed_entities: List[EntityResult],
                                   question: FollowUpQuestion,
                                   routing_result: DriftRoutingResult) -> LocalSearchResult:
        """Execute multi-hop graph traversal from seed entities.

        Expands outward up to ``max_traversal_depth`` hops, collecting
        unvisited neighbors that pass the entity/relationship confidence
        thresholds. BUGFIX: the loop variable ``hop`` was read after the
        loop and would be unbound if the depth were 0; depth is now tracked
        explicitly.
        """
        all_entities = list(seed_entities)
        all_relationships = []
        visited_node_ids = {entity.entity_id for entity in seed_entities}

        current_entities = seed_entities
        depth_reached = 0  # hops actually completed

        for hop in range(self.max_traversal_depth):
            if not current_entities:
                break

            depth_reached = hop + 1
            # Frontier node IDs for this hop
            current_ids = [entity.entity_id for entity in current_entities]

            # Parameterized one-hop expansion query
            traversal_query = """
            MATCH (seed)-[r]-(neighbor)
            WHERE seed.id IN $current_ids
              AND NOT (neighbor.id IN $visited_ids)
              AND r.confidence >= $min_rel_confidence
              AND neighbor.confidence >= $min_entity_confidence
              AND neighbor.name IS NOT NULL
              AND neighbor.content IS NOT NULL
            RETURN DISTINCT
                seed.id as seed_id,
                neighbor.id as neighbor_id,
                neighbor.name as neighbor_name,
                neighbor.content as neighbor_content,
                neighbor.confidence as neighbor_confidence,
                neighbor.degree_centrality as degree_centrality,
                neighbor.betweenness_centrality as betweenness_centrality,
                neighbor.closeness_centrality as closeness_centrality,
                labels(neighbor) as neighbor_types,
                type(r) as relationship_type,
                r.confidence as relationship_confidence
            ORDER BY neighbor.degree_centrality DESC, r.confidence DESC
            LIMIT $max_results
            """

            try:
                results = self.neo4j_conn.execute_query(
                    traversal_query,
                    {
                        'current_ids': current_ids,
                        'visited_ids': list(visited_node_ids),
                        'min_rel_confidence': self.min_relationship_confidence,
                        'min_entity_confidence': self.min_entity_confidence,
                        'max_results': self.max_entities_per_hop
                    }
                )

                next_hop_entities = []

                for record in results:
                    neighbor_id = record['neighbor_id']

                    if neighbor_id not in visited_node_ids:
                        # New neighbor: record entity, mark visited, queue for next hop
                        entity = EntityResult(
                            entity_id=neighbor_id,
                            name=record['neighbor_name'],
                            content=record['neighbor_content'],
                            confidence=record['neighbor_confidence'],
                            degree_centrality=record['degree_centrality'] or 0.0,
                            betweenness_centrality=record['betweenness_centrality'] or 0.0,
                            closeness_centrality=record['closeness_centrality'] or 0.0,
                            # Community membership is unknown for traversed nodes
                            community_id='unknown',
                            node_type=', '.join(record['neighbor_types']) if record['neighbor_types'] else 'Entity'
                        )

                        all_entities.append(entity)
                        next_hop_entities.append(entity)
                        visited_node_ids.add(neighbor_id)

                        # Record the edge that led here
                        relationship = RelationshipResult(
                            start_node=record['seed_id'],
                            end_node=neighbor_id,
                            relationship_type=record['relationship_type'],
                            confidence=record['relationship_confidence']
                        )

                        all_relationships.append(relationship)

                current_entities = next_hop_entities
                self.logger.info(f" Hop {depth_reached}: Found {len(next_hop_entities)} new entities")

            except Exception as e:
                self.logger.error(f"Multi-hop traversal failed at hop {depth_reached}: {e}")
                break

        return LocalSearchResult(
            seed_entities=seed_entities,
            traversed_entities=all_entities,
            traversed_relationships=all_relationships,
            search_depth=depth_reached,
            total_nodes_visited=len(visited_node_ids)
        )

    async def _extract_detailed_entities(self,
                                         local_results: List[LocalSearchResult],
                                         routing_result: DriftRoutingResult) -> List[EntityResult]:
        """
        Step 11: Extract detailed entity information from local search results.

        Combines entities from all searches, scores each by a weighted blend of
        confidence and centralities, then ranks by (appearance count, score).
        """
        all_entities = []
        entity_scores = {}

        # Collect all entities and calculate importance scores
        for search_result in local_results:
            for entity in search_result.traversed_entities:
                if entity.entity_id not in entity_scores:
                    # Weighted importance: confidence dominates, centralities refine
                    importance_score = (
                        0.4 * entity.confidence +
                        0.3 * entity.degree_centrality +
                        0.2 * entity.betweenness_centrality +
                        0.1 * entity.closeness_centrality
                    )

                    entity_scores[entity.entity_id] = {
                        'entity': entity,
                        'importance_score': importance_score,
                        'appearance_count': 1
                    }
                    all_entities.append(entity)
                else:
                    # Entity seen by multiple searches — boost its rank
                    entity_scores[entity.entity_id]['appearance_count'] += 1

        # Rank: cross-search frequency first, then importance score
        sorted_entities = sorted(
            entity_scores.values(),
            key=lambda x: (x['appearance_count'], x['importance_score']),
            reverse=True
        )

        # Return top entities (cap configurable via routing parameters)
        max_entities = routing_result.parameters.get('max_detailed_entities', 50)
        detailed_entities = [item['entity'] for item in sorted_entities[:max_entities]]

        self.logger.info(f"Extracted {len(detailed_entities)} detailed entities from {len(all_entities)} total")
        return detailed_entities

    async def _generate_intermediate_answers(self,
                                             questions: List[FollowUpQuestion],
                                             local_results: List[LocalSearchResult],
                                             detailed_entities: List[EntityResult],
                                             routing_result: DriftRoutingResult) -> List[IntermediateAnswer]:
        """Step 12: build one IntermediateAnswer per question listing found entities.

        Relies on ``local_results`` being positionally aligned with
        ``questions`` (guaranteed by _execute_local_traversal).
        """
        answers = []

        for i, question in enumerate(questions):
            # Entities from the matching search result, if present
            entities = local_results[i].traversed_entities if i < len(local_results) else []
            entity_names = [e.name for e in entities[:10]]

            # Simple answer with entity names
            answer_text = f"Found entities: {', '.join(entity_names)}" if entity_names else "No specific entities found."

            answer = IntermediateAnswer(
                question_id=question.question_id,
                question=question.question,
                answer=answer_text,
                confidence=0.8,
                reasoning=f"Found {len(entity_names)} entities matching the search criteria.",
                supporting_entities=entity_names,
                supporting_evidence=[]
            )
            answers.append(answer)

        return answers
425
+
426
+
427
+ # Exports
428
+ __all__ = ['FollowUpSearch', 'FollowUpQuestion', 'LocalSearchResult', 'IntermediateAnswer']
429
+
query_graph_functions/knowledge_retrieval.py ADDED
@@ -0,0 +1,843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Knowledge Retrieval Module - Phase C (Steps 6-8)
3
+
4
+ Performs community search and data extraction using graph database structures.
5
+ Handles community retrieval, data extraction, and initial answer generation.
6
+ """
7
+
8
+ import logging
9
+ import numpy as np
10
+ import json
11
+ from typing import Dict, List, Tuple, Any
12
+ from dataclasses import dataclass
13
+ from datetime import datetime
14
+
15
+ from .setup import GraphRAGSetup
16
+ from .query_preprocessing import DriftRoutingResult
17
+
18
+
19
@dataclass
class CommunityResult:
    """Enhanced community result with comprehensive properties.

    Aggregates everything Phase C retrieves about one community: its
    similarity to the query, summary text, membership, and quality metrics.
    """
    community_id: str  # Neo4j Community node id
    similarity_score: float  # query-vs-HyDE similarity (modularity-weighted upstream)
    summary: str  # LLM-generated community summary text
    key_entities: List[str]  # names of the community's most salient entities
    member_ids: List[str]  # Direct member access (node ids belonging to the community)
    modularity_score: float  # Community quality (global modularity from metadata)
    level: int  # hierarchy level of the community
    internal_edges: int  # number of edges fully inside the community
    member_count: int  # size of member_ids (0 when unknown)
    centrality_stats: Dict[str, float]  # Aggregated centrality measures (e.g. avg_degree, density)
    confidence_score: float  # selection confidence (defaults to 0.5 when not provided)
    search_index: str  # Optimized search key; currently left empty by the fetcher
    termination_criteria: Dict[str, Any]  # criteria applied during selection; currently empty
35
+
36
+
37
@dataclass
class EntityResult:
    """Entity result with attributes from graph database.

    One graph node plus the centrality measures stored on it.
    """
    entity_id: str  # node id in Neo4j
    name: str  # display name of the entity
    content: str  # textual description stored on the node
    confidence: float  # extraction confidence stored on the node
    degree_centrality: float  # pre-computed degree centrality
    betweenness_centrality: float  # pre-computed betweenness centrality
    closeness_centrality: float  # pre-computed closeness centrality
    community_id: str  # id of the community this entity was extracted for
    node_type: str  # first Neo4j label of the node ('Unknown' if none)
49
+
50
+
51
@dataclass
class RelationshipResult:
    """Relationship result with graph database attributes.

    A single directed edge between two entities.
    """
    start_node: str  # id of the source node
    end_node: str  # id of the target node
    relationship_type: str  # Neo4j relationship type
    confidence: float  # confidence stored on the relationship
58
+
59
+
60
+ class CommunitySearchEngine:
61
+ """Knowledge retrieval engine for community search and entity extraction."""
62
+
63
    def __init__(self, setup: GraphRAGSetup):
        """Bind shared connections and configuration from the setup object.

        Args:
            setup: Initialized GraphRAGSetup providing the Neo4j connection,
                configuration, and (elsewhere) the LLM client.
        """
        self.setup = setup
        self.neo4j_conn = setup.neo4j_conn  # shared Neo4j connection wrapper
        self.config = setup.config
        self.logger = logging.getLogger(self.__class__.__name__)

        # Initialize search optimization caches.
        # community_search_index is filled lazily by _load_community_search_index.
        self.community_search_index = {}
        # NOTE(review): centrality_cache is never populated in this module —
        # confirm whether it is still needed.
        self.centrality_cache = {}
72
+
73
    async def execute_primer_phase(self,
                                 query_embedding: List[float],
                                 routing_result: DriftRoutingResult) -> Dict[str, Any]:
        """Execute community search and knowledge retrieval.

        Runs Phase C end to end (steps 6-8): community retrieval, data
        extraction, and initial answer generation.

        Args:
            query_embedding: Dense embedding of the user query.
            routing_result: DRIFT routing decision carrying search parameters.

        Returns:
            Dict with 'communities', 'extracted_data', 'initial_answer',
            'execution_time' (seconds), and a 'metadata' summary.

        Raises:
            Exception: Any failure is logged and re-raised to the caller.
        """
        start_time = datetime.now()

        try:
            # Step 6: Community retrieval
            self.logger.info("Starting community retrieval")
            communities = await self._retrieve_communities_enhanced(
                query_embedding, routing_result
            )

            # Step 7: Data extraction
            self.logger.info("Starting data extraction")
            extracted_data = await self._extract_community_data_enhanced(communities)

            # Step 8: Answer generation
            self.logger.info("Starting answer generation")
            initial_answer = await self._generate_initial_answer_enhanced(
                extracted_data, routing_result
            )

            execution_time = (datetime.now() - start_time).total_seconds()

            return {
                'communities': communities,
                'extracted_data': extracted_data,
                'initial_answer': initial_answer,
                'execution_time': execution_time,
                'metadata': {
                    'communities_retrieved': len(communities),
                    'entities_extracted': len(extracted_data.get('entities', [])),
                    'relationships_extracted': len(extracted_data.get('relationships', [])),
                    'phase': 'primer',
                    'step_range': '6-8'
                }
            }

        except Exception as e:
            self.logger.error(f"Primer phase execution failed: {e}")
            raise
115
+
116
+ async def _retrieve_communities_enhanced(self,
117
+ query_embedding: List[float],
118
+ routing_result: DriftRoutingResult) -> List[CommunityResult]:
119
+ """
120
+ Step 6: Enhanced community retrieval using comprehensive properties.
121
+
122
+ Retrieves relevant communities based on query embedding similarity.
123
+ """
124
+ try:
125
+ # Retrieve HyDE embeddings
126
+ hyde_embeddings = await self._retrieve_hyde_embeddings_enhanced()
127
+
128
+ if not hyde_embeddings:
129
+ self.logger.warning("No HyDE embeddings found")
130
+ return []
131
+
132
+ # Compute similarities
133
+ similarities = self._compute_hyde_similarities_enhanced(
134
+ query_embedding, hyde_embeddings
135
+ )
136
+
137
+ # Rank communities
138
+ ranked_communities = self._rank_communities_enhanced(
139
+ similarities, routing_result
140
+ )
141
+
142
+ # Apply criteria
143
+ filtered_communities = self._apply_termination_criteria(
144
+ ranked_communities, routing_result
145
+ )
146
+
147
+ # Fetch community details
148
+ community_results = await self._fetch_community_details_enhanced(
149
+ filtered_communities
150
+ )
151
+
152
+ self.logger.info(f"Retrieved {len(community_results)} enhanced communities")
153
+ return community_results
154
+
155
+ except Exception as e:
156
+ self.logger.error(f"Enhanced community retrieval failed: {e}")
157
+ return []
158
+
159
+ async def _load_community_search_index(self):
160
+ """Load optimized community search index from Neo4j."""
161
+ try:
162
+ query = """
163
+ MATCH (meta:DriftMetadata)
164
+ WHERE meta.community_search_index IS NOT NULL
165
+ RETURN meta.community_search_index as search_index,
166
+ meta.total_communities as total_communities
167
+ """
168
+
169
+ results = self.neo4j_conn.execute_query(query)
170
+
171
+ for record in results:
172
+ # The search index is a nested JSON structure with community IDs as keys
173
+ search_index_data = record['search_index']
174
+ if isinstance(search_index_data, dict):
175
+ # Each community in the search index
176
+ for community_id, community_data in search_index_data.items():
177
+ self.community_search_index[community_id] = community_data
178
+ else:
179
+ self.logger.warning(f"Unexpected search index format: {type(search_index_data)}")
180
+
181
+ self.logger.info(f"Loaded search index for {len(self.community_search_index)} communities")
182
+
183
+ except Exception as e:
184
+ self.logger.error(f"Failed to load community search index: {e}")
185
+
186
+ async def _retrieve_hyde_embeddings_enhanced(self) -> Dict[str, Dict[str, Any]]:
187
+ """Retrieve HyDE embeddings and metadata."""
188
+ try:
189
+ # Retrieve community embeddings
190
+ query = """
191
+ MATCH (c:Community)
192
+ WHERE c.hyde_embeddings IS NOT NULL
193
+ OPTIONAL MATCH (meta:CommunitiesMetadata)
194
+ RETURN c.id as community_id,
195
+ c.hyde_embeddings as hyde_embeddings,
196
+ c.summary as summary,
197
+ c.key_entities as key_entities,
198
+ c.member_ids as member_ids,
199
+ size(c.hyde_embeddings) as embedding_size,
200
+ meta.modularity_score as global_modularity_score
201
+ """
202
+
203
+ results = self.neo4j_conn.execute_query(query)
204
+ hyde_embeddings = {}
205
+
206
+ for record in results:
207
+ community_id = record['community_id']
208
+ embeddings_data = record.get('hyde_embeddings')
209
+
210
+ if embeddings_data and community_id:
211
+ hyde_embeddings[community_id] = {
212
+ 'embeddings': embeddings_data,
213
+ 'summary': record.get('summary', ''),
214
+ 'key_entities': record.get('key_entities', []),
215
+ 'member_ids': record.get('member_ids', []),
216
+ 'embedding_size': record.get('embedding_size', 0),
217
+ 'global_modularity_score': record.get('global_modularity_score', 0.0),
218
+ 'embedding_type': 'hyde'
219
+ }
220
+
221
+ self.logger.info(f"Retrieved enhanced HyDE embeddings for {len(hyde_embeddings)} communities")
222
+ return hyde_embeddings
223
+
224
+ except Exception as e:
225
+ self.logger.error(f"Failed to retrieve enhanced HyDE embeddings: {e}")
226
+ # Retry logic for embeddings
227
+ self.logger.info("Attempting retry for HyDE embeddings...")
228
+ try:
229
+ import time
230
+ time.sleep(2) # Brief delay before retry
231
+ results = self.neo4j_conn.execute_query(query)
232
+ hyde_embeddings = {}
233
+
234
+ for record in results:
235
+ community_id = record['community_id']
236
+ embeddings_data = record.get('hyde_embeddings')
237
+
238
+ if embeddings_data and community_id:
239
+ hyde_embeddings[community_id] = {
240
+ 'embeddings': embeddings_data,
241
+ 'summary': record.get('summary', ''),
242
+ 'key_entities': record.get('key_entities', []),
243
+ 'member_ids': record.get('member_ids', []),
244
+ 'embedding_size': record.get('embedding_size', 0),
245
+ 'global_modularity': record.get('global_modularity_score', 0.0)
246
+ }
247
+
248
+ self.logger.info(f"Retry successful: Retrieved enhanced HyDE embeddings for {len(hyde_embeddings)} communities")
249
+ return hyde_embeddings
250
+
251
+ except Exception as retry_error:
252
+ self.logger.error(f"Retry also failed: {retry_error}")
253
+ return {}
254
+
255
+ def _compute_hyde_similarities_enhanced(self,
256
+ query_embedding: List[float],
257
+ hyde_embeddings: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
258
+ """
259
+ Enhanced similarity computation with global modularity weighting.
260
+
261
+ Calculates similarity scores between query embedding and community embeddings.
262
+ """
263
+ similarities = {}
264
+ query_vec = np.array(query_embedding)
265
+ query_norm = np.linalg.norm(query_vec)
266
+
267
+ if query_norm == 0:
268
+ self.logger.warning("Query embedding has zero norm")
269
+ return similarities
270
+
271
+ for community_id, embedding_data in hyde_embeddings.items():
272
+ embeddings_list = embedding_data['embeddings']
273
+ global_modularity = embedding_data.get('global_modularity_score', 0.0)
274
+
275
+ max_similarity = 0.0
276
+
277
+ # Compute similarity
278
+ try:
279
+ # Parse embedding string
280
+ if isinstance(embeddings_list, str):
281
+ embeddings_list = json.loads(embeddings_list)
282
+
283
+ # Process embeddings
284
+ if isinstance(embeddings_list, list) and len(embeddings_list) > 0:
285
+ # Use first embedding
286
+ hyde_vec = np.array(embeddings_list[0] if isinstance(embeddings_list[0], list) else embeddings_list)
287
+ else:
288
+ hyde_vec = np.array(embeddings_list)
289
+
290
+ hyde_norm = np.linalg.norm(hyde_vec)
291
+
292
+ if hyde_norm > 0:
293
+ # Calculate similarity
294
+ base_similarity = np.dot(query_vec, hyde_vec) / (query_norm * hyde_norm)
295
+
296
+ # Apply weighting
297
+ weighted_similarity = base_similarity * (1 + 0.2 * global_modularity)
298
+ max_similarity = weighted_similarity
299
+
300
+ except Exception as e:
301
+ self.logger.warning(f"Error computing similarity for community {community_id}: {e}")
302
+ continue
303
+
304
+ similarities[community_id] = {
305
+ 'similarity': max_similarity,
306
+ 'global_modularity_score': global_modularity,
307
+ 'embedding_size': embedding_data.get('embedding_size', 0)
308
+ }
309
+
310
+ self.logger.info(f"Computed enhanced similarities for {len(similarities)} communities")
311
+ return similarities
312
+
313
+ def _rank_communities_enhanced(self,
314
+ similarities: Dict[str, Dict[str, float]],
315
+ routing_result: DriftRoutingResult) -> List[Tuple[str, Dict[str, float]]]:
316
+ """
317
+ Enhanced ranking using global modularity and similarity.
318
+
319
+ Ranks communities based on a weighted combination of similarity score and modularity.
320
+ """
321
+
322
+ # Rank primarily by similarity, with modularity as secondary factor
323
+
324
+ def ranking_score(item):
325
+ _, scores = item
326
+ similarity = scores['similarity']
327
+ global_modularity = scores['global_modularity_score']
328
+
329
+ # Weighted combination (similarity is primary)
330
+ return 0.8 * similarity + 0.2 * global_modularity
331
+
332
+ # Sort by combined ranking score
333
+ ranked = sorted(similarities.items(), key=ranking_score, reverse=True)
334
+
335
+ # Apply similarity threshold
336
+ similarity_threshold = routing_result.parameters.get('similarity_threshold', 0.7)
337
+ filtered_ranked = [
338
+ (cid, scores) for cid, scores in ranked
339
+ if scores['similarity'] >= similarity_threshold
340
+ ]
341
+
342
+ self.logger.info(f"Enhanced ranking: {len(filtered_ranked)} communities above threshold {similarity_threshold}")
343
+ return filtered_ranked
344
+
345
+ def _apply_termination_criteria(self,
346
+ ranked_communities: List[Tuple[str, Dict[str, float]]],
347
+ routing_result: DriftRoutingResult) -> List[Tuple[str, Dict[str, float]]]:
348
+ """
349
+ Apply termination criteria for community selection.
350
+
351
+ Limits the number of communities selected based on threshold parameters.
352
+ """
353
+
354
+ # Get termination criteria from routing or defaults
355
+ max_communities = routing_result.parameters.get('max_communities', 3)
356
+ min_global_modularity = routing_result.parameters.get('min_global_modularity', 0.3)
357
+
358
+ # Apply criteria
359
+ filtered = []
360
+ for community_id, scores in ranked_communities:
361
+ if len(filtered) >= max_communities:
362
+ break
363
+
364
+ # Check global modularity threshold
365
+ if scores['global_modularity_score'] >= min_global_modularity:
366
+ filtered.append((community_id, scores))
367
+
368
+ self.logger.info(f"Applied termination criteria: {len(filtered)} communities selected")
369
+ return filtered
370
+
371
    async def _fetch_community_details_enhanced(self,
                                              ranked_communities: List[Tuple[str, Dict[str, float]]]) -> List[CommunityResult]:
        """
        Fetch comprehensive community details with all properties.

        For each selected community, queries its Community node (plus global
        metadata) and builds a CommunityResult. A community whose lookup
        fails is logged and skipped rather than aborting the whole batch.

        Args:
            ranked_communities: (community_id, score-dict) pairs that survived
                ranking and termination criteria.

        Returns:
            CommunityResult objects in the same order as the input (minus
            any failed lookups).
        """
        community_results = []

        for community_id, scores in ranked_communities:
            try:
                # Query the Community node directly by ID (since embedding communities have id=community_id)
                detail_query = """
                MATCH (c:Community)
                WHERE c.id = $community_id AND c.hyde_embeddings IS NOT NULL
                OPTIONAL MATCH (meta:CommunitiesMetadata)
                RETURN c.summary as summary,
                       c.key_entities as key_entities,
                       c.member_ids as member_ids,
                       c.internal_edges as internal_edges,
                       c.density as density,
                       c.avg_degree as avg_degree,
                       c.level as level,
                       meta.modularity_score as modularity_score,
                       CASE WHEN c.member_ids IS NOT NULL THEN size(c.member_ids) ELSE 0 END as member_count,
                       c.id as id
                LIMIT 1
                """

                results = self.neo4j_conn.execute_query(
                    detail_query,
                    {'community_id': community_id}
                )

                if results:
                    record = results[0]

                    # Create enhanced community result with actual available data from Neo4j.
                    # NOTE(review): record.get(...) defaults only apply when a key is
                    # absent — if the driver returns explicit nulls these fields may
                    # be None; confirm against the connection wrapper's behavior.
                    community_result = CommunityResult(
                        community_id=community_id,
                        similarity_score=scores['similarity'],
                        summary=record.get('summary', ''),
                        key_entities=record.get('key_entities', []),
                        member_ids=record.get('member_ids', []),
                        modularity_score=record.get('modularity_score', 0.0),
                        level=record.get('level', 1),
                        internal_edges=record.get('internal_edges', 0),
                        member_count=record.get('member_count', 0),
                        # 'confidence_score' is not produced by the similarity step,
                        # so this currently always falls back to 0.5.
                        confidence_score=scores.get('confidence_score', 0.5),
                        search_index='',
                        termination_criteria={},
                        centrality_stats={
                            'avg_degree': record.get('avg_degree', 0.0),
                            'density': record.get('density', 0.0)
                        }
                    )

                    community_results.append(community_result)

            except Exception as e:
                self.logger.error(f"Failed to fetch details for community {community_id}: {e}")
                continue

        self.logger.info(f"Fetched enhanced details for {len(community_results)} communities")
        return community_results
437
+
438
+ async def _extract_community_data_enhanced(self,
439
+ communities: List[CommunityResult]) -> Dict[str, Any]:
440
+ """
441
+ Step 7: Enhanced data extraction with centrality measures.
442
+
443
+ Extracts:
444
+ - Entities with degree/betweenness/closeness centrality
445
+ - Relationships with confidence scores
446
+ - Community statistics and properties
447
+ """
448
+ try:
449
+ all_entities = []
450
+ all_relationships = []
451
+ community_stats = []
452
+
453
+ for community in communities:
454
+ # Extract entities with centrality measures
455
+ entities = await self._extract_entities_with_centrality(community)
456
+ all_entities.extend(entities)
457
+
458
+ # Extract relationships with properties
459
+ relationships = await self._extract_relationships_enhanced(community)
460
+ all_relationships.extend(relationships)
461
+
462
+ # Collect community statistics
463
+ community_stats.append({
464
+ 'community_id': community.community_id,
465
+ 'member_count': community.member_count,
466
+ 'modularity_score': community.modularity_score,
467
+ 'confidence_score': community.confidence_score,
468
+ 'centrality_stats': community.centrality_stats
469
+ })
470
+
471
+ extracted_data = {
472
+ 'entities': all_entities,
473
+ 'relationships': all_relationships,
474
+ 'community_stats': community_stats,
475
+ 'extraction_metadata': {
476
+ 'communities_processed': len(communities),
477
+ 'entities_extracted': len(all_entities),
478
+ 'relationships_extracted': len(all_relationships),
479
+ 'timestamp': datetime.now().isoformat()
480
+ }
481
+ }
482
+
483
+ self.logger.info(f"Enhanced extraction completed: {len(all_entities)} entities, {len(all_relationships)} relationships")
484
+ return extracted_data
485
+
486
+ except Exception as e:
487
+ self.logger.error(f"Enhanced data extraction failed: {e}")
488
+ return {'entities': [], 'relationships': [], 'community_stats': []}
489
+
490
    async def _extract_entities_with_centrality(self,
                                              community: CommunityResult) -> List[EntityResult]:
        """
        Extract entities with comprehensive centrality measures.

        When the community carries ``member_ids``, entities are fetched
        directly by id; otherwise a broad fallback query is used.

        Args:
            community: The community whose entities should be extracted.

        Returns:
            EntityResult objects ordered by degree centrality (descending);
            empty list on failure.
        """
        try:
            # Use member_ids for direct access if available
            member_ids = community.member_ids if community.member_ids else []

            if member_ids:
                # Direct member access query based on actual schema
                entity_query = """
                MATCH (n)
                WHERE n.id IN $member_ids
                AND n.name IS NOT NULL
                AND n.content IS NOT NULL
                RETURN n.id as entity_id,
                       n.name as name,
                       n.content as content,
                       n.confidence as confidence,
                       n.degree_centrality as degree_centrality,
                       n.betweenness_centrality as betweenness_centrality,
                       n.closeness_centrality as closeness_centrality,
                       labels(n) as node_types
                ORDER BY n.degree_centrality DESC
                """

                results = self.neo4j_conn.execute_query(
                    entity_query,
                    {'member_ids': member_ids}
                )
            else:
                # Fallback: find entities using community_id pattern matching.
                # NOTE(review): this fallback matches nodes from ANY community
                # (it only checks community_id IS NOT NULL), capped at 20 —
                # confirm whether it should filter on this community's id.
                entity_query = """
                MATCH (n)
                WHERE n.community_id IS NOT NULL
                AND n.name IS NOT NULL
                AND n.content IS NOT NULL
                RETURN n.id as entity_id,
                       n.name as name,
                       n.content as content,
                       n.confidence as confidence,
                       n.degree_centrality as degree_centrality,
                       n.betweenness_centrality as betweenness_centrality,
                       n.closeness_centrality as closeness_centrality,
                       labels(n) as node_types
                ORDER BY n.degree_centrality DESC
                LIMIT 20
                """

                results = self.neo4j_conn.execute_query(entity_query)

            entities = []
            for record in results:
                entity = EntityResult(
                    entity_id=record['entity_id'],
                    name=record.get('name', ''),
                    content=record.get('content', ''),
                    confidence=record.get('confidence', 0.0),
                    degree_centrality=record.get('degree_centrality', 0.0),
                    betweenness_centrality=record.get('betweenness_centrality', 0.0),
                    closeness_centrality=record.get('closeness_centrality', 0.0),
                    # Tag the entity with the community we extracted it for.
                    community_id=community.community_id,
                    node_type=record.get('node_types', ['Unknown'])[0] if record.get('node_types') else 'Unknown'
                )
                entities.append(entity)

            return entities

        except Exception as e:
            self.logger.error(f"Failed to extract entities for community {community.community_id}: {e}")
            return []
564
+
565
    async def _extract_relationships_enhanced(self,
                                            community: CommunityResult) -> List[RelationshipResult]:
        """
        Extract relationships with enhanced properties.

        Retrieves up to 50 intra-community edges (both endpoints tagged with
        this community's id) whose confidence exceeds 0.5, ordered by
        confidence descending.

        Args:
            community: The community whose internal relationships to fetch.

        Returns:
            RelationshipResult objects; empty list on failure.
        """
        try:
            relationship_query = """
            MATCH (a)-[r]->(b)
            WHERE a.community_id = $community_id
            AND b.community_id = $community_id
            AND r.confidence > 0.5
            RETURN startNode(r).id as start_node,
                   endNode(r).id as end_node,
                   type(r) as relationship_type,
                   r.confidence as confidence
            ORDER BY r.confidence DESC
            LIMIT 50
            """

            results = self.neo4j_conn.execute_query(
                relationship_query,
                {'community_id': community.community_id}
            )

            relationships = []
            for record in results:
                relationship = RelationshipResult(
                    start_node=record['start_node'],
                    end_node=record['end_node'],
                    relationship_type=record['relationship_type'],
                    confidence=record.get('confidence', 0.0)
                )
                relationships.append(relationship)

            return relationships

        except Exception as e:
            self.logger.error(f"Failed to extract relationships for community {community.community_id}: {e}")
            return []
606
+
607
    async def _generate_initial_answer_enhanced(self,
                                              extracted_data: Dict[str, Any],
                                              routing_result: DriftRoutingResult) -> Dict[str, Any]:
        """
        Step 8: Context-aware initial answer generation.

        Ranks entities by centrality, keeps high-confidence relationships,
        builds an LLM context string, and asks the configured LLM for an
        initial answer with follow-up questions.

        Args:
            extracted_data: Output of _extract_community_data_enhanced.
            routing_result: DRIFT routing decision (query text, strategy).

        Returns:
            Dict with 'content', 'llm_context', 'context_used',
            'confidence_metrics', 'follow_up_questions', and 'reasoning'.
            On failure: {'content': ..., 'error': ...} only.
        """
        try:
            entities = extracted_data['entities']
            relationships = extracted_data['relationships']
            community_stats = extracted_data['community_stats']

            # Rank entities by importance: mean of degree and betweenness
            # centrality, keep the top 10.
            important_entities = sorted(
                entities,
                key=lambda e: (e.degree_centrality + e.betweenness_centrality) / 2,
                reverse=True
            )[:10]

            # Select high-confidence relationships (>= 0.7) as strong evidence.
            strong_relationships = [
                r for r in relationships
                if r.confidence >= 0.7
            ]

            # Prepare context for LLM
            llm_context = self._prepare_llm_context_enhanced(
                important_entities, strong_relationships, community_stats, routing_result
            )

            # Generate initial answer using configured LLM
            llm_response = await self._generate_llm_answer(llm_context, routing_result)

            initial_answer = {
                'content': llm_response['answer'],
                'llm_context': llm_context,
                'context_used': {
                    'important_entities': len(important_entities),
                    'strong_relationships': len(strong_relationships),
                    'communities_analyzed': len(community_stats)
                },
                'confidence_metrics': {
                    # Averages guard against empty inputs to avoid np.mean([]) warnings.
                    'avg_entity_centrality': np.mean([e.degree_centrality for e in important_entities]) if important_entities else 0,
                    'avg_relationship_confidence': np.mean([r.confidence for r in strong_relationships]) if strong_relationships else 0,
                    'avg_community_modularity': np.mean([c['modularity_score'] for c in community_stats]) if community_stats else 0,
                    'llm_confidence': llm_response['confidence']
                },
                'follow_up_questions': llm_response['follow_up_questions'],
                'reasoning': llm_response['reasoning']
            }

            self.logger.info("Enhanced initial answer generated with comprehensive context")
            return initial_answer

        except Exception as e:
            self.logger.error(f"Enhanced answer generation failed: {e}")
            # NOTE(review): this fallback dict lacks the keys the success path
            # provides (e.g. 'follow_up_questions') — confirm callers tolerate that.
            return {'content': 'Error generating initial answer', 'error': str(e)}
668
+
669
+ def _prepare_llm_context_enhanced(self,
670
+ entities: List[EntityResult],
671
+ relationships: List[RelationshipResult],
672
+ community_stats: List[Dict[str, Any]],
673
+ routing_result: DriftRoutingResult) -> str:
674
+ """Prepare enhanced context for LLM with comprehensive information."""
675
+
676
+ context_parts = [
677
+ f"Query: {routing_result.original_query}",
678
+ f"Search Strategy: {routing_result.search_strategy.value}",
679
+ "",
680
+ "=== IMPORTANT ENTITIES (Use these specific names in your answer) ===",
681
+ ]
682
+
683
+ for i, entity in enumerate(entities[:10], 1): # Show more entities
684
+ context_parts.append(
685
+ f"{i}. NAME: '{entity.name}' | Description: {entity.content[:100]}... "
686
+ f"| Centrality: {entity.degree_centrality:.3f} | Confidence: {entity.confidence:.3f}"
687
+ )
688
+
689
+ context_parts.extend([
690
+ "",
691
+ "=== KEY RELATIONSHIPS (Use these connections in your answer) ===",
692
+ ])
693
+
694
+ for i, rel in enumerate(relationships[:8], 1): # Show more relationships
695
+ context_parts.append(
696
+ f"{i}. '{rel.start_node}' --[{rel.relationship_type}]--> '{rel.end_node}' "
697
+ f"| Confidence: {rel.confidence:.3f}"
698
+ )
699
+
700
+ # Add quick reference list of all entity names
701
+ entity_names = [entity.name for entity in entities[:15]]
702
+ context_parts.extend([
703
+ "",
704
+ "=== ENTITY NAMES FOR REFERENCE ===",
705
+ f"Available entities: {', '.join(entity_names)}",
706
+ "",
707
+ "=== COMMUNITY STATISTICS ===",
708
+ ])
709
+
710
+ for stat in community_stats:
711
+ context_parts.append(
712
+ f"Community {stat['community_id']}: {stat['member_count']} members, "
713
+ f"modularity: {stat['modularity_score']:.3f}"
714
+ )
715
+
716
+ context_parts.extend([
717
+ "",
718
+ "REMEMBER: Use the specific entity names listed above in your answer!"
719
+ ])
720
+
721
+ return "\n".join(context_parts)
722
+
723
    async def _generate_llm_answer(self,
                                 context: str,
                                 routing_result: DriftRoutingResult) -> Dict[str, Any]:
        """
        Generate actual LLM response using the configured LLM.

        Sends the prepared graph context to ``self.setup.llm`` and parses the
        structured reply (answer / confidence / reasoning / follow-ups).

        Args:
            context: Context string from _prepare_llm_context_enhanced.
            routing_result: DRIFT routing decision (currently unused here
                beyond being part of the call signature).

        Returns:
            Dict with 'answer', 'confidence', 'reasoning',
            'follow_up_questions'; a low-confidence fallback dict on failure.
        """
        try:
            # Construct comprehensive prompt for LLM
            prompt = f"""
You are an expert knowledge analyst. Answer the user's query using SPECIFIC NAMES and information from the graph data provided below.

IMPORTANT: Use the actual entity names, organization names, and relationship details from the graph data. Do not give generic answers.

GRAPH DATA CONTEXT:
{context}

INSTRUCTIONS:
1. Answer using SPECIFIC ENTITY NAMES from the "IMPORTANT ENTITIES" section above
2. Reference actual relationships and organizations mentioned in the graph data
3. If the query asks for members/organizations, LIST THE ACTUAL NAMES from the entities
4. Use confidence scores and centrality measures as evidence strength indicators
5. Generate follow-up questions based on the specific entities found

RESPONSE FORMAT:
Answer: [Use specific names and details from the graph data above]
Confidence: [0.0-1.0]
Reasoning: [Why these specific entities answer the query]
Follow-up Questions:
1. [Specific question about entities found]
2. [Question about relationships discovered]
3. [Question about community connections]
4. [Question for deeper exploration]
5. [Question about related entities]
"""

            # Call the configured LLM (async completion).
            llm_response = await self.setup.llm.acomplete(prompt)
            response_text = llm_response.text

            # Parse the structured reply into its components.
            parsed_response = self._parse_llm_response(response_text)

            self.logger.info(f"LLM generated answer with confidence: {parsed_response['confidence']}")
            return parsed_response

        except Exception as e:
            self.logger.error(f"LLM answer generation failed: {e}")
            # Fallback response keeps the same shape as the success path.
            return {
                'answer': f"Based on the graph analysis, I found relevant information but encountered an issue generating the full response: {str(e)}",
                'confidence': 0.3,
                'reasoning': "LLM generation encountered an error, providing basic analysis from graph data.",
                'follow_up_questions': [
                    "What specific aspects would you like me to explore further?",
                    "Are there particular entities or relationships of interest?",
                    "Should I focus on a specific community or time period?"
                ]
            }
783
+
784
+ def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
785
+ """Parse structured LLM response into components."""
786
+ try:
787
+ lines = response_text.strip().split('\n')
788
+
789
+ answer = ""
790
+ confidence = 0.5
791
+ reasoning = ""
792
+ follow_up_questions = []
793
+
794
+ current_section = None
795
+
796
+ for line in lines:
797
+ line = line.strip()
798
+
799
+ if line.startswith("Answer:"):
800
+ current_section = "answer"
801
+ answer = line.replace("Answer:", "").strip()
802
+ elif line.startswith("Confidence:"):
803
+ confidence_text = line.replace("Confidence:", "").strip()
804
+ try:
805
+ confidence = float(confidence_text)
806
+ except (ValueError, TypeError):
807
+ confidence = 0.5
808
+ elif line.startswith("Reasoning:"):
809
+ current_section = "reasoning"
810
+ reasoning = line.replace("Reasoning:", "").strip()
811
+ elif line.startswith("Follow-up Questions:"):
812
+ current_section = "questions"
813
+ elif current_section == "answer" and line:
814
+ answer += " " + line
815
+ elif current_section == "reasoning" and line:
816
+ reasoning += " " + line
817
+ elif current_section == "questions" and line.startswith(("1.", "2.", "3.", "4.", "5.")):
818
+ question = line[2:].strip() # Remove "1. " etc.
819
+ follow_up_questions.append(question)
820
+
821
+ return {
822
+ 'answer': answer.strip() if answer else "Unable to generate answer from available context.",
823
+ 'confidence': max(0.0, min(1.0, confidence)), # Clamp between 0-1
824
+ 'reasoning': reasoning.strip() if reasoning else "Analysis based on graph structure and entity relationships.",
825
+ 'follow_up_questions': follow_up_questions if follow_up_questions else [
826
+ "What additional information would be helpful?",
827
+ "Are there specific aspects to explore further?",
828
+ "Should I analyze different communities or relationships?"
829
+ ]
830
+ }
831
+
832
+ except Exception as e:
833
+ self.logger.error(f"Failed to parse LLM response: {e}")
834
+ return {
835
+ 'answer': response_text[:500] if response_text else "No response generated.",
836
+ 'confidence': 0.4,
837
+ 'reasoning': "Direct LLM output due to parsing issues.",
838
+ 'follow_up_questions': ["What would you like to know more about?"]
839
+ }
840
+
841
+
842
# Exports: the public API of this module.
__all__ = ['CommunitySearchEngine', 'CommunityResult', 'EntityResult', 'RelationshipResult']
query_graph_functions/query_preprocessing.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Query preprocessing for analysis, routing, and vectorization - Phase B (Steps 3-5)."""
2
+
3
+ import logging
4
+ from typing import Dict, List, Any, Tuple, Optional
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ import re
8
+
9
+ # System imports
10
+ import sys
11
+ import os
12
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
13
+
14
+ from my_config import MY_CONFIG
15
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
16
+
17
+
18
+ class QueryType(Enum):
19
+ """Query type classifications for DRIFT routing."""
20
+ SPECIFIC_ENTITY = "specific_entity"
21
+ RELATIONSHIP_QUERY = "relationship_query"
22
+ BROAD_THEMATIC = "broad_thematic"
23
+ COMPARATIVE = "comparative"
24
+ COMPLEX_REASONING = "complex_reasoning"
25
+ FACTUAL_LOOKUP = "factual_lookup"
26
+
27
+
28
+ class SearchStrategy(Enum):
29
+ """Search strategy determined by DRIFT routing."""
30
+ LOCAL_SEARCH = "local_search"
31
+ GLOBAL_SEARCH = "global_search"
32
+ HYBRID_SEARCH = "hybrid_search"
33
+
34
+
35
+ @dataclass
36
+ class QueryAnalysis:
37
+ """Results of query analysis step."""
38
+ query_type: QueryType
39
+ complexity_score: float # 0.0 to 1.0
40
+ entities_mentioned: List[str]
41
+ key_concepts: List[str]
42
+ intent_description: str
43
+ context_requirements: Dict[str, Any]
44
+ estimated_scope: str # "narrow", "moderate", "broad"
45
+
46
+
47
+ @dataclass
48
+ @dataclass
49
+ class DriftRoutingResult:
50
+ """Results of DRIFT routing decision."""
51
+ search_strategy: SearchStrategy
52
+ reasoning: str
53
+ confidence: float # 0.0 to 1.0
54
+ parameters: Dict[str, Any]
55
+ original_query: str # Added to fix answer generation
56
+ fallback_strategy: Optional[SearchStrategy] = None
57
+
58
+
59
+ @dataclass
60
+ class VectorizedQuery:
61
+ """Results of query vectorization."""
62
+ embedding: List[float]
63
+ embedding_model: str
64
+ normalized_query: str
65
+ semantic_keywords: List[str]
66
+ similarity_threshold: float
67
+
68
+
69
+ class QueryAnalyzer:
70
+ """Handles Step 3: Query Analysis with intent detection and complexity assessment."""
71
+
72
+ def __init__(self, config: Any):
73
+ self.config = config
74
+ self.logger = logging.getLogger('graphrag_query')
75
+
76
+ # Entity extraction patterns
77
+ self.entity_patterns = [
78
+ r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper nouns
79
+ r'\b(?:company|organization|person|place|event)\s+(?:named|called)?\s*["\']?([^"\']+)["\']?',
80
+ r'\bwho\s+is\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
81
+ r'\bwhat\s+is\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)',
82
+ ]
83
+
84
+ # Complexity indicators
85
+ self.complexity_indicators = {
86
+ 'high': ['compare', 'analyze', 'evaluate', 'relationship', 'impact', 'why', 'how'],
87
+ 'medium': ['describe', 'explain', 'summarize', 'list', 'identify'],
88
+ 'low': ['who', 'what', 'when', 'where', 'is', 'are']
89
+ }
90
+
91
+ self.logger.info("QueryAnalyzer initialized for Step 3 processing")
92
+
93
+ async def analyze_query(self, query: str) -> QueryAnalysis:
94
+ """Analyze query for intent, complexity, and entities."""
95
+ self.logger.info(f"Starting Step 3: Query Analysis for: {query[:100]}...")
96
+
97
+ try:
98
+ # Extract entities and concepts
99
+ entities = self._extract_entities(query)
100
+ concepts = self._extract_key_concepts(query)
101
+ query_type = self._classify_query_type(query, entities, concepts)
102
+ complexity = self._calculate_complexity(query, query_type)
103
+ intent = self._determine_intent(query, query_type)
104
+ scope = self._estimate_scope(query, entities, concepts, complexity)
105
+
106
+ # Build context
107
+ context_reqs = self._analyze_context_requirements(query, query_type, entities)
108
+
109
+ analysis = QueryAnalysis(
110
+ query_type=query_type,
111
+ complexity_score=complexity,
112
+ entities_mentioned=entities,
113
+ key_concepts=concepts,
114
+ intent_description=intent,
115
+ context_requirements=context_reqs,
116
+ estimated_scope=scope
117
+ )
118
+
119
+ self.logger.info(f"Step 3 completed: Query type={query_type.value}, "
120
+ f"complexity={complexity:.2f}, entities={len(entities)}, scope={scope}")
121
+
122
+ return analysis
123
+
124
+ except Exception as e:
125
+ self.logger.error(f"Step 3 Query Analysis failed: {e}")
126
+ raise
127
+
128
+ def _extract_entities(self, query: str) -> List[str]:
129
+ """Extract named entities from query text."""
130
+ entities = set()
131
+
132
+ for pattern in self.entity_patterns:
133
+ matches = re.findall(pattern, query, re.IGNORECASE)
134
+ entities.update(matches)
135
+
136
+ # Filter entities
137
+ filtered_entities = [
138
+ entity.strip() for entity in entities
139
+ if len(entity.strip()) > 2 and entity.lower() not in
140
+ {'the', 'and', 'are', 'is', 'was', 'were', 'this', 'that', 'what', 'who', 'how'}
141
+ ]
142
+
143
+ return list(set(filtered_entities))
144
+
145
+ def _extract_key_concepts(self, query: str) -> List[str]:
146
+ """Extract key conceptual terms from query."""
147
+ # Extract concepts
148
+ concepts = []
149
+
150
+ # Find domain terms
151
+ domain_terms = [
152
+ 'revenue', 'profit', 'growth', 'market', 'strategy', 'technology',
153
+ 'product', 'service', 'customer', 'partnership', 'acquisition',
154
+ 'investment', 'research', 'development', 'innovation', 'competition'
155
+ ]
156
+
157
+ query_lower = query.lower()
158
+ for term in domain_terms:
159
+ if term in query_lower:
160
+ concepts.append(term)
161
+
162
+ return concepts
163
+
164
+ def _classify_query_type(self, query: str, entities: List[str], concepts: List[str]) -> QueryType:
165
+ """Classify the type of query for routing decisions."""
166
+ query_lower = query.lower()
167
+
168
+ # Check patterns
169
+ if any(word in query_lower for word in ['compare', 'versus', 'vs', 'difference']):
170
+ return QueryType.COMPARATIVE
171
+
172
+ if any(word in query_lower for word in ['relationship', 'connect', 'related', 'between']):
173
+ return QueryType.RELATIONSHIP_QUERY
174
+
175
+ if len(entities) > 0 and any(word in query_lower for word in ['who is', 'what is', 'about']):
176
+ return QueryType.SPECIFIC_ENTITY
177
+
178
+ if any(word in query_lower for word in ['analyze', 'evaluate', 'why', 'how', 'impact']):
179
+ return QueryType.COMPLEX_REASONING
180
+
181
+ if len(concepts) > 2 or any(word in query_lower for word in ['overall', 'general', 'trend']):
182
+ return QueryType.BROAD_THEMATIC
183
+
184
+ return QueryType.FACTUAL_LOOKUP
185
+
186
+ def _calculate_complexity(self, query: str, query_type: QueryType) -> float:
187
+ """Calculate query complexity score (0.0 to 1.0)."""
188
+ base_score = 0.3
189
+ query_lower = query.lower()
190
+
191
+ # Base complexity
192
+ type_scores = {
193
+ QueryType.FACTUAL_LOOKUP: 0.2,
194
+ QueryType.SPECIFIC_ENTITY: 0.3,
195
+ QueryType.RELATIONSHIP_QUERY: 0.6,
196
+ QueryType.BROAD_THEMATIC: 0.7,
197
+ QueryType.COMPARATIVE: 0.8,
198
+ QueryType.COMPLEX_REASONING: 0.9
199
+ }
200
+
201
+ base_score = type_scores.get(query_type, 0.5)
202
+
203
+ # Adjust complexity
204
+ for level, indicators in self.complexity_indicators.items():
205
+ count = sum(1 for indicator in indicators if indicator in query_lower)
206
+ if level == 'high':
207
+ base_score += count * 0.2
208
+ elif level == 'medium':
209
+ base_score += count * 0.1
210
+ else:
211
+ base_score -= count * 0.05
212
+
213
+ # Query length and structure
214
+ if len(query.split()) > 15:
215
+ base_score += 0.1
216
+ if '?' in query and len(query.split('?')) > 2:
217
+ base_score += 0.15
218
+
219
+ return min(1.0, max(0.0, base_score))
220
+
221
+ def _determine_intent(self, query: str, query_type: QueryType) -> str:
222
+ """Determine the user's intent based on query analysis."""
223
+ intent_map = {
224
+ QueryType.FACTUAL_LOOKUP: "Seeking specific factual information",
225
+ QueryType.SPECIFIC_ENTITY: "Requesting details about a particular entity",
226
+ QueryType.RELATIONSHIP_QUERY: "Exploring connections and relationships",
227
+ QueryType.BROAD_THEMATIC: "Understanding broad themes or patterns",
228
+ QueryType.COMPARATIVE: "Comparing entities or concepts",
229
+ QueryType.COMPLEX_REASONING: "Requiring analytical reasoning and insights"
230
+ }
231
+
232
+ return intent_map.get(query_type, "General information seeking")
233
+
234
+ def _estimate_scope(self, query: str, entities: List[str], concepts: List[str], complexity: float) -> str:
235
+ """Estimate the scope of information needed."""
236
+ if len(entities) == 1 and complexity < 0.4:
237
+ return "narrow"
238
+ elif len(entities) > 3 or len(concepts) > 3 or complexity > 0.7:
239
+ return "broad"
240
+ else:
241
+ return "moderate"
242
+
243
+ def _analyze_context_requirements(self, query: str, query_type: QueryType, entities: List[str]) -> Dict[str, Any]:
244
+ """Analyze what context information is needed."""
245
+ return {
246
+ "requires_entity_details": len(entities) > 0,
247
+ "requires_relationships": query_type in [QueryType.RELATIONSHIP_QUERY, QueryType.COMPARATIVE],
248
+ "requires_historical_context": any(word in query.lower() for word in ['history', 'past', 'previous', 'before']),
249
+ "requires_quantitative_data": any(word in query.lower() for word in ['number', 'amount', 'count', 'revenue', 'profit']),
250
+ "primary_entities": entities[:3] # Focus on top 3 entities
251
+ }
252
+
253
+
254
+ class DriftRouter:
255
+ """Handles Step 4: DRIFT Routing for optimal search strategy selection."""
256
+
257
+ def __init__(self, config: Any, graph_stats: Dict[str, Any]):
258
+ self.config = config
259
+ self.graph_stats = graph_stats
260
+ self.logger = logging.getLogger('graphrag_query')
261
+
262
+ # Routing thresholds
263
+ self.local_search_threshold = 0.4
264
+ self.global_search_threshold = 0.7
265
+ self.entity_count_threshold = 10 # Based on graph size
266
+
267
+ self.logger.info("DriftRouter initialized for Step 4 processing")
268
+
269
+ async def determine_search_strategy(self, query_analysis: QueryAnalysis, original_query: str) -> DriftRoutingResult:
270
+ """
271
+ Determine optimal search strategy using DRIFT methodology (Step 4).
272
+
273
+ Args:
274
+ query_analysis: Results from Step 3 query analysis
275
+ original_query: The original user query
276
+
277
+ Returns:
278
+ DriftRoutingResult with search strategy and parameters
279
+ """
280
+ self.logger.info(f"Starting Step 4: DRIFT Routing for {query_analysis.query_type.value}")
281
+
282
+ try:
283
+ # Apply routing logic
284
+ strategy, reasoning, confidence, params = self._apply_drift_logic(query_analysis)
285
+
286
+ # Fallback strategy
287
+ fallback = self._determine_fallback_strategy(strategy)
288
+
289
+ result = DriftRoutingResult(
290
+ search_strategy=strategy,
291
+ reasoning=reasoning,
292
+ confidence=confidence,
293
+ parameters=params,
294
+ original_query=original_query,
295
+ fallback_strategy=fallback
296
+ )
297
+
298
+ self.logger.info(f"Step 4 completed: Strategy={strategy.value}, "
299
+ f"confidence={confidence:.2f}, reasoning={reasoning[:50]}...")
300
+
301
+ return result
302
+
303
+ except Exception as e:
304
+ self.logger.error(f"Step 4 DRIFT Routing failed: {e}")
305
+ raise
306
+
307
+ def _apply_drift_logic(self, analysis: QueryAnalysis) -> Tuple[SearchStrategy, str, float, Dict[str, Any]]:
308
+ """Apply DRIFT (Distributed Retrieval and Information Filtering Technique) logic."""
309
+
310
+ # Decision factors
311
+ complexity = analysis.complexity_score
312
+ entity_count = len(analysis.entities_mentioned)
313
+ scope = analysis.estimated_scope
314
+ query_type = analysis.query_type
315
+
316
+ # Local search conditions
317
+ if (query_type == QueryType.SPECIFIC_ENTITY and
318
+ entity_count <= 2 and
319
+ complexity < self.local_search_threshold):
320
+
321
+ return (
322
+ SearchStrategy.LOCAL_SEARCH,
323
+ f"Specific entity query with low complexity ({complexity:.2f})",
324
+ 0.9,
325
+ {
326
+ "max_depth": 2,
327
+ "entity_focus": analysis.entities_mentioned,
328
+ "include_neighbors": True,
329
+ "max_results": 20
330
+ }
331
+ )
332
+
333
+ # Global search conditions
334
+ if (complexity > self.global_search_threshold or
335
+ scope == "broad" or
336
+ query_type in [QueryType.BROAD_THEMATIC, QueryType.COMPLEX_REASONING]):
337
+
338
+ return (
339
+ SearchStrategy.GLOBAL_SEARCH,
340
+ f"High complexity ({complexity:.2f}) or broad scope requiring global context",
341
+ 0.85,
342
+ {
343
+ "community_level": "high",
344
+ "max_communities": 10,
345
+ "include_summary": True,
346
+ "max_results": 50
347
+ }
348
+ )
349
+
350
+ # Hybrid search for intermediate cases
351
+ if (query_type == QueryType.RELATIONSHIP_QUERY or
352
+ query_type == QueryType.COMPARATIVE or
353
+ entity_count > 2):
354
+
355
+ return (
356
+ SearchStrategy.HYBRID_SEARCH,
357
+ f"Relationship/comparative query or multiple entities ({entity_count})",
358
+ 0.75,
359
+ {
360
+ "local_depth": 2,
361
+ "global_communities": 5,
362
+ "balance_weight": 0.6, # Favor local over global
363
+ "max_results": 35
364
+ }
365
+ )
366
+
367
+ # Default to local search with moderate confidence
368
+ return (
369
+ SearchStrategy.LOCAL_SEARCH,
370
+ "Default local search for moderate complexity query",
371
+ 0.6,
372
+ {
373
+ "max_depth": 3,
374
+ "entity_focus": analysis.entities_mentioned,
375
+ "include_neighbors": True,
376
+ "max_results": 25
377
+ }
378
+ )
379
+
380
+ def _determine_fallback_strategy(self, primary_strategy: SearchStrategy) -> Optional[SearchStrategy]:
381
+ """Determine fallback strategy if primary fails."""
382
+ fallback_map = {
383
+ SearchStrategy.LOCAL_SEARCH: SearchStrategy.GLOBAL_SEARCH,
384
+ SearchStrategy.GLOBAL_SEARCH: SearchStrategy.LOCAL_SEARCH,
385
+ SearchStrategy.HYBRID_SEARCH: SearchStrategy.LOCAL_SEARCH
386
+ }
387
+
388
+ return fallback_map.get(primary_strategy)
389
+
390
+
391
+ class QueryVectorizer:
392
+ """Handles Step 5: Query Vectorization for semantic similarity matching."""
393
+
394
+ def __init__(self, config: Any):
395
+ self.config = config
396
+ self.logger = logging.getLogger('graphrag_query')
397
+
398
+ # Initialize embedding model using same pattern as other files
399
+ self.embedding_model = HuggingFaceEmbedding(
400
+ model_name=MY_CONFIG.EMBEDDING_MODEL
401
+ )
402
+
403
+ self.model_name = MY_CONFIG.EMBEDDING_MODEL
404
+ self.embedding_dimension = MY_CONFIG.EMBEDDING_LENGTH
405
+
406
+ self.logger.info(f"QueryVectorizer initialized with {self.model_name}")
407
+
408
+ async def vectorize_query(self, query: str, query_analysis: QueryAnalysis) -> VectorizedQuery:
409
+ """
410
+ Generate query embeddings for similarity matching (Step 5).
411
+
412
+ Args:
413
+ query: Original query text
414
+ query_analysis: Results from Step 3
415
+
416
+ Returns:
417
+ VectorizedQuery with embeddings and metadata
418
+ """
419
+ self.logger.info(f"Starting Step 5: Query Vectorization for: {query[:100]}...")
420
+
421
+ try:
422
+ # Normalize query
423
+ normalized_query = self._normalize_query(query, query_analysis)
424
+
425
+ # Generate embedding
426
+ embedding = await self._generate_embedding(normalized_query)
427
+
428
+ # Extract keywords
429
+ semantic_keywords = self._extract_semantic_keywords(query, query_analysis)
430
+
431
+ # Set similarity threshold
432
+ similarity_threshold = self._calculate_similarity_threshold(query_analysis)
433
+
434
+ result = VectorizedQuery(
435
+ embedding=embedding,
436
+ embedding_model=self.model_name,
437
+ normalized_query=normalized_query,
438
+ semantic_keywords=semantic_keywords,
439
+ similarity_threshold=similarity_threshold
440
+ )
441
+
442
+ self.logger.info(f"Step 5 completed: Embedding dimension={len(embedding)}, "
443
+ f"threshold={similarity_threshold:.3f}, keywords={len(semantic_keywords)}")
444
+
445
+ return result
446
+
447
+ except Exception as e:
448
+ self.logger.error(f"Step 5 Query Vectorization failed: {e}")
449
+ raise
450
+
451
+ def _normalize_query(self, query: str, analysis: QueryAnalysis) -> str:
452
+ """Normalize query text for better embedding quality."""
453
+ # Start with original query
454
+ normalized = query.strip()
455
+
456
+ # Add important entities and concepts for context
457
+ if analysis.entities_mentioned:
458
+ entity_context = " ".join(analysis.entities_mentioned[:3])
459
+ normalized = f"{normalized} [Entities: {entity_context}]"
460
+
461
+ if analysis.key_concepts:
462
+ concept_context = " ".join(analysis.key_concepts[:3])
463
+ normalized = f"{normalized} [Concepts: {concept_context}]"
464
+
465
+ return normalized
466
+
467
+ async def _generate_embedding(self, text: str) -> List[float]:
468
+ """Generate embedding for text using configured model."""
469
+ try:
470
+ embedding = await self.embedding_model.aget_text_embedding(text)
471
+ return embedding
472
+ except Exception as e:
473
+ self.logger.error(f"Embedding generation failed: {e}")
474
+ # Fallback to synchronous call if async fails
475
+ return self.embedding_model.get_text_embedding(text)
476
+
477
+ def _extract_semantic_keywords(self, query: str, analysis: QueryAnalysis) -> List[str]:
478
+ """Extract semantic keywords for additional matching."""
479
+ keywords = set()
480
+
481
+ # Add entities and concepts
482
+ keywords.update(analysis.entities_mentioned)
483
+ keywords.update(analysis.key_concepts)
484
+
485
+ # Add query-specific terms based on type
486
+ if analysis.query_type == QueryType.RELATIONSHIP_QUERY:
487
+ keywords.update(['relationship', 'connection', 'related', 'linked'])
488
+ elif analysis.query_type == QueryType.COMPARATIVE:
489
+ keywords.update(['comparison', 'versus', 'difference', 'similar'])
490
+ elif analysis.query_type == QueryType.BROAD_THEMATIC:
491
+ keywords.update(['theme', 'pattern', 'trend', 'overview'])
492
+
493
+ # Filter and return as list
494
+ return [kw for kw in keywords if len(kw) > 2]
495
+
496
+ def _calculate_similarity_threshold(self, analysis: QueryAnalysis) -> float:
497
+ """Calculate appropriate similarity threshold based on query characteristics."""
498
+ base_threshold = 0.7
499
+
500
+ # Adjust based on query complexity
501
+ if analysis.complexity_score > 0.7:
502
+ base_threshold -= 0.1 # Lower threshold for complex queries
503
+ elif analysis.complexity_score < 0.3:
504
+ base_threshold += 0.1 # Higher threshold for simple queries
505
+
506
+ # Adjust based on scope
507
+ if analysis.estimated_scope == "narrow":
508
+ base_threshold += 0.05
509
+ elif analysis.estimated_scope == "broad":
510
+ base_threshold -= 0.05
511
+
512
+ # Ensure reasonable bounds
513
+ return max(0.5, min(0.9, base_threshold))
514
+
515
+
516
+ class QueryPreprocessor:
517
+ """Main class coordinating all query preprocessing steps (Steps 3-5)."""
518
+
519
+ def __init__(self, config: Any, graph_stats: Dict[str, Any]):
520
+ self.config = config
521
+ self.graph_stats = graph_stats
522
+ self.logger = logging.getLogger('graphrag_query')
523
+
524
+ # Initialize component processors
525
+ self.analyzer = QueryAnalyzer(config)
526
+ self.router = DriftRouter(config, graph_stats)
527
+ self.vectorizer = QueryVectorizer(config)
528
+
529
+ self.logger.info("QueryPreprocessor initialized for Steps 3-5")
530
+
531
+ async def preprocess_query(self, query: str) -> Tuple[QueryAnalysis, DriftRoutingResult, VectorizedQuery]:
532
+ """
533
+ Execute complete query preprocessing pipeline (Steps 3-5).
534
+
535
+ Args:
536
+ query: User's natural language query
537
+
538
+ Returns:
539
+ Tuple of (analysis, routing, vectorization) results
540
+ """
541
+ self.logger.info(f"Starting Phase B: Query Preprocessing Pipeline for: {query[:100]}...")
542
+
543
+ try:
544
+ # Query analysis
545
+ analysis = await self.analyzer.analyze_query(query)
546
+
547
+ # Query routing
548
+ routing = await self.router.determine_search_strategy(analysis, query)
549
+
550
+ # Query vectorization
551
+ vectorization = await self.vectorizer.vectorize_query(query, analysis)
552
+
553
+ self.logger.info(f"Phase B completed successfully: "
554
+ f"Type={analysis.query_type.value}, "
555
+ f"Strategy={routing.search_strategy.value}, "
556
+ f"Embedding_dim={len(vectorization.embedding)}")
557
+
558
+ return analysis, routing, vectorization
559
+
560
+ except Exception as e:
561
+ self.logger.error(f"Query preprocessing pipeline failed: {e}")
562
+ raise
563
+
564
+
565
+ # Exports
566
+ async def create_query_preprocessor(config: Any, graph_stats: Dict[str, Any]) -> QueryPreprocessor:
567
+ """Create and initialize QueryPreprocessor."""
568
+ return QueryPreprocessor(config, graph_stats)
569
+
570
+
571
+ async def preprocess_query_pipeline(query: str, config: Any, graph_stats: Dict[str, Any]) -> Tuple[QueryAnalysis, DriftRoutingResult, VectorizedQuery]:
572
+ """
573
+ Convenience function for complete query preprocessing.
574
+
575
+ Args:
576
+ query: User's natural language query
577
+ config: Application configuration
578
+ graph_stats: Graph database statistics
579
+
580
+ Returns:
581
+ Complete preprocessing results
582
+ """
583
+ preprocessor = await create_query_preprocessor(config, graph_stats)
584
+ return await preprocessor.preprocess_query(query)
585
+
586
+
587
+ __all__ = [
588
+ 'QueryAnalyzer', 'DriftRouter', 'QueryVectorizer', 'QueryPreprocessor',
589
+ 'create_query_preprocessor', 'preprocess_query_pipeline',
590
+ 'QueryAnalysis', 'DriftRoutingResult', 'VectorizedQuery',
591
+ 'QueryType', 'SearchStrategy'
592
+ ]
query_graph_functions/response_management.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """Response management module for metadata generation and file I/O operations - Phase G (Steps 17-20)."""
3
+
4
+ import time
5
+ import json
6
+ import logging
7
+ from typing import Dict, List, Any
8
+ from dataclasses import dataclass
9
+ from datetime import datetime
10
+
11
+ from .setup import GraphRAGSetup
12
+ from .query_preprocessing import QueryAnalysis, DriftRoutingResult, VectorizedQuery
13
+ from .answer_synthesis import SynthesisResult
14
+
15
+
16
+ @dataclass
17
+ class ResponseMetadata:
18
+ """Complete response metadata structure."""
19
+ query_type: str
20
+ search_strategy: str
21
+ complexity_score: float
22
+ total_time_seconds: float
23
+ phases_completed: List[str]
24
+ status: str
25
+ phase_details: Dict[str, Any]
26
+ database_stats: Dict[str, Any]
27
+
28
+
29
+ class ResponseManager:
30
+ def __init__(self, setup: GraphRAGSetup):
31
+ self.setup = setup
32
+ self.config = setup.config
33
+ self.logger = logging.getLogger(self.__class__.__name__)
34
+
35
+ def generate_comprehensive_metadata(self,
36
+ analysis: QueryAnalysis,
37
+ routing: DriftRoutingResult,
38
+ vectorization: VectorizedQuery,
39
+ community_results: Dict[str, Any],
40
+ follow_up_results: Dict[str, Any],
41
+ augmentation_results: Any,
42
+ synthesis_results: SynthesisResult,
43
+ total_time: float) -> Dict[str, Any]:
44
+ """
45
+ Generate comprehensive metadata for query response.
46
+
47
+ Consolidates all phase results into structured metadata format.
48
+ """
49
+ try:
50
+ communities = community_results.get('communities', [])
51
+
52
+ metadata = {
53
+ # Execution Summary
54
+ "query_type": analysis.query_type.value,
55
+ "search_strategy": routing.search_strategy.value,
56
+ "complexity_score": analysis.complexity_score,
57
+ "total_time_seconds": round(total_time, 2),
58
+ "phases_completed": ["A-Init", "B-Preprocess", "C-Communities", "D-Followup", "E-Vector", "F-Synthesis"],
59
+ "status": "success",
60
+
61
+ # Phase A: Initialization
62
+ "phase_a": self._generate_phase_a_metadata(),
63
+
64
+ # Phase B: Query Preprocessing
65
+ "phase_b": self._generate_phase_b_metadata(analysis, vectorization, routing),
66
+
67
+ # Phase C: Community Search
68
+ "phase_c": self._generate_phase_c_metadata(communities, community_results),
69
+
70
+ # Phase D: Follow-up Search
71
+ "phase_d": self._generate_phase_d_metadata(follow_up_results),
72
+
73
+ # Phase E: Vector Augmentation
74
+ "phase_e": self._generate_phase_e_metadata(augmentation_results),
75
+
76
+ # Phase F: Answer Synthesis
77
+ "phase_f": self._generate_phase_f_metadata(synthesis_results),
78
+
79
+ # Database Statistics
80
+ "database_stats": self._generate_database_stats(follow_up_results, communities, augmentation_results)
81
+ }
82
+
83
+ self.logger.info("Generated comprehensive metadata with all phase details")
84
+ return metadata
85
+
86
+ except Exception as e:
87
+ self.logger.error(f"Failed to generate metadata: {e}")
88
+ return self._generate_fallback_metadata(str(e))
89
+
90
+ def _generate_phase_a_metadata(self) -> Dict[str, Any]:
91
+ """Generate Phase A initialization metadata."""
92
+ from my_config import MY_CONFIG
93
+
94
+ return {
95
+ "neo4j_connected": bool(self.setup.neo4j_conn),
96
+ "vector_db_ready": bool(self.setup.query_engine),
97
+ "llm_model": getattr(MY_CONFIG, 'LLM_MODEL', 'unknown'),
98
+ "embedding_model": getattr(MY_CONFIG, 'EMBEDDING_MODEL', 'unknown'),
99
+ "drift_config_loaded": bool(self.setup.drift_config)
100
+ }
101
+
102
+ def _generate_phase_b_metadata(self, analysis: QueryAnalysis, vectorization: VectorizedQuery, routing: DriftRoutingResult) -> Dict[str, Any]:
103
+ """Generate Phase B query preprocessing metadata."""
104
+ return {
105
+ "entities_extracted": len(analysis.entities_mentioned),
106
+ "semantic_keywords": len(vectorization.semantic_keywords),
107
+ "embedding_dimensions": len(vectorization.embedding),
108
+ "similarity_threshold": vectorization.similarity_threshold,
109
+ "routing_confidence": round(routing.confidence, 3)
110
+ }
111
+
112
+ def _generate_phase_c_metadata(self, communities: List[Any], community_results: Dict[str, Any]) -> Dict[str, Any]:
113
+ """Generate Phase C community search metadata."""
114
+ return {
115
+ "communities_found": len(communities),
116
+ "community_ids": [c.community_id for c in communities[:5]],
117
+ "similarities": [round(c.similarity_score, 3) for c in communities[:5]],
118
+ "entities_extracted": len(community_results.get('extracted_data', {}).get('entities', [])),
119
+ "relationships_extracted": len(community_results.get('extracted_data', {}).get('relationships', []))
120
+ }
121
+
122
+ def _generate_phase_d_metadata(self, follow_up_results: Dict[str, Any]) -> Dict[str, Any]:
123
+ """Generate Phase D follow-up search metadata."""
124
+ intermediate_answers = follow_up_results.get('intermediate_answers', [])
125
+ avg_confidence = 0.0
126
+ if intermediate_answers:
127
+ avg_confidence = sum(a.confidence for a in intermediate_answers) / len(intermediate_answers)
128
+
129
+ return {
130
+ "questions_generated": len(follow_up_results.get('follow_up_questions', [])),
131
+ "graph_traversals": len(follow_up_results.get('local_search_results', [])),
132
+ "entities_found": len(follow_up_results.get('detailed_entities', [])),
133
+ "intermediate_answers": len(intermediate_answers),
134
+ "avg_confidence": round(avg_confidence, 3)
135
+ }
136
+
137
+ def _generate_phase_e_metadata(self, augmentation_results: Any) -> Dict[str, Any]:
138
+ """Generate Phase E vector augmentation metadata."""
139
+ if not augmentation_results:
140
+ return {"vector_results_count": 0, "augmentation_confidence": 0.0}
141
+
142
+ vector_files = []
143
+ if hasattr(augmentation_results, 'vector_results'):
144
+ for i, result in enumerate(augmentation_results.vector_results):
145
+ file_info = {
146
+ "file_id": i + 1,
147
+ "file_path": getattr(result, 'file_path', 'unknown'),
148
+ "similarity": round(result.similarity_score, 3),
149
+ "content_length": len(result.content),
150
+ "relevance": round(getattr(result, 'relevance_score', 0.0), 3)
151
+ }
152
+ vector_files.append(file_info)
153
+
154
+ return {
155
+ "vector_results_count": len(augmentation_results.vector_results) if hasattr(augmentation_results, 'vector_results') else 0,
156
+ "augmentation_confidence": round(augmentation_results.augmentation_confidence, 3) if hasattr(augmentation_results, 'augmentation_confidence') else 0.0,
157
+ "execution_time": round(augmentation_results.execution_time, 2) if hasattr(augmentation_results, 'execution_time') else 0.0,
158
+ "similarity_threshold": 0.75,
159
+ "vector_files": vector_files
160
+ }
161
+
162
+ def _generate_phase_f_metadata(self, synthesis_results: SynthesisResult) -> Dict[str, Any]:
163
+ """Generate Phase F answer synthesis metadata."""
164
+ return {
165
+ "synthesis_confidence": round(synthesis_results.confidence_score, 3),
166
+ "sources_integrated": len(synthesis_results.source_evidence),
167
+ "final_answer_length": len(synthesis_results.final_answer),
168
+ "synthesis_method": getattr(synthesis_results, 'synthesis_method', 'comprehensive_fusion')
169
+ }
170
+
171
+ def _generate_database_stats(self, follow_up_results: Dict[str, Any], communities: List[Any], augmentation_results: Any) -> Dict[str, Any]:
172
+ """Generate database statistics metadata."""
173
+ vector_docs_used = 0
174
+ if augmentation_results and hasattr(augmentation_results, 'vector_results'):
175
+ vector_docs_used = len(augmentation_results.vector_results)
176
+
177
+ return {
178
+ "total_nodes": self.setup.graph_stats.get('node_count', 0),
179
+ "total_relationships": self.setup.graph_stats.get('relationship_count', 0),
180
+ "total_communities": self.setup.graph_stats.get('community_count', 0),
181
+ "nodes_accessed": len(follow_up_results.get('detailed_entities', [])),
182
+ "communities_searched": len(communities),
183
+ "vector_docs_used": vector_docs_used
184
+ }
185
+
186
+ def _generate_fallback_metadata(self, error: str) -> Dict[str, Any]:
187
+ """Generate minimal metadata when full generation fails."""
188
+ return {
189
+ "status": "metadata_generation_error",
190
+ "error": error,
191
+ "phases_completed": "incomplete",
192
+ "total_time_seconds": 0.0
193
+ }
194
+
195
+ def save_response_to_files(self, user_query: str, result: Dict[str, Any]) -> None:
196
+ """
197
+ Save query response and metadata to separate files.
198
+
199
+ Handles file I/O operations for response persistence.
200
+ """
201
+ try:
202
+ timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
203
+
204
+ # Save response to response file
205
+ self._save_response_file(user_query, result, timestamp)
206
+
207
+ # Save metadata to metadata file
208
+ self._save_metadata_file(user_query, result, timestamp)
209
+
210
+ self.logger.info(f"Saved response and metadata for query: {user_query[:50]}...")
211
+
212
+ except Exception as e:
213
+ self.logger.error(f"Failed to save response files: {e}")
214
+
215
+ def _save_response_file(self, user_query: str, result: Dict[str, Any], timestamp: str) -> None:
216
+ """Save response content to response file."""
217
+ try:
218
+ with open('logs/graphrag_query/graphrag_responses.txt', 'a', encoding='utf-8') as f:
219
+ f.write(f"\n{'='*80}\n")
220
+ f.write(f"QUERY [{timestamp}]: {user_query}\n")
221
+ f.write(f"{'='*80}\n")
222
+ f.write(f"RESPONSE: {result['answer']}\n")
223
+ f.write(f"{'='*80}\n\n")
224
+ except Exception as e:
225
+ self.logger.error(f"Failed to save response file: {e}")
226
+
227
+ def _save_metadata_file(self, user_query: str, result: Dict[str, Any], timestamp: str) -> None:
228
+ """Save metadata to metadata file."""
229
+ try:
230
+ with open('logs/graphrag_query/graphrag_metadata.txt', 'a', encoding='utf-8') as f:
231
+ f.write(f"\n{'='*80}\n")
232
+ f.write(f"METADATA [{timestamp}]: {user_query}\n")
233
+ f.write(f"{'='*80}\n")
234
+ f.write(json.dumps(result['metadata'], indent=2, default=str))
235
+ f.write(f"\n{'='*80}\n\n")
236
+ except Exception as e:
237
+ self.logger.error(f"Failed to save metadata file: {e}")
238
+
239
+ def format_error_response(self, error_message: str) -> Dict[str, Any]:
240
+ """
241
+ Generate standardized error response with metadata.
242
+
243
+ Creates consistent error format for failed queries.
244
+ """
245
+ return {
246
+ "answer": f"Sorry, I encountered an error: {error_message}",
247
+ "metadata": {
248
+ "status": "error",
249
+ "error_message": error_message,
250
+ "phases_completed": "incomplete",
251
+ "neo4j_connected": bool(self.setup.neo4j_conn) if self.setup.neo4j_conn else False,
252
+ "vector_engine_ready": bool(self.setup.query_engine) if self.setup.query_engine else False,
253
+ "timestamp": datetime.now().isoformat()
254
+ }
255
+ }
256
+
257
+
258
+ # Exports
259
+ __all__ = ['ResponseManager', 'ResponseMetadata']
query_graph_functions/setup.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Graph setup module for database and model initialization. Phase A (Steps 1-2)"""
2
+
3
+ import os
4
+ import logging
5
+ from typing import Dict, Optional, Any
6
+ import sys
7
+ sys.path.append('..') # Add parent directory to path for imports
8
+
9
+ from my_config import MY_CONFIG
10
+ from neo4j import GraphDatabase
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from llama_index.core import Settings, VectorStoreIndex, StorageContext
13
+ from llama_index.vector_stores.milvus import MilvusVectorStore
14
+ from llama_index.llms.litellm import LiteLLM
15
+
16
+ # Set up environment
17
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
18
+
19
+ # Configure logging
20
+ logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
21
+ logger = logging.getLogger(__name__)
22
+ logger.setLevel(logging.INFO)
23
+
24
+
25
class Neo4jConnection:
    """
    Neo4j database connection manager.

    Reads connection settings from MY_CONFIG, validates them up front,
    lazily opens a driver on connect(), and exposes a small query API.
    Connection failures are logged (best-effort) rather than raised;
    callers check ``self.driver`` before querying.
    """

    def __init__(self):
        self.uri = MY_CONFIG.NEO4J_URI
        self.username = MY_CONFIG.NEO4J_USER
        self.password = MY_CONFIG.NEO4J_PASSWORD
        self.database = getattr(MY_CONFIG, "NEO4J_DATABASE", None)

        # Fail fast on missing configuration. Messages name the actual
        # MY_CONFIG attributes read above (the old message said
        # NEO4J_USERNAME although the attribute is NEO4J_USER).
        if not self.uri:
            raise ValueError("NEO4J_URI config is required")
        if not self.username:
            raise ValueError("NEO4J_USER config is required")
        if not self.password:
            raise ValueError("NEO4J_PASSWORD config is required")
        if not self.database:
            raise ValueError("NEO4J_DATABASE config is required")

        # Driver instance; None until connect() succeeds. Annotated as Any
        # because neo4j's Driver class is a runtime value here, and
        # GraphDatabase.driver is a factory method, not a type.
        self.driver: Optional[Any] = None

    def connect(self):
        """STEP 1.2: Initialize Neo4j driver with verification.

        Idempotent: does nothing if a driver already exists. On failure
        the error is logged and self.driver is reset to None.
        """
        if self.driver is None:
            try:
                self.driver = GraphDatabase.driver(
                    self.uri,
                    auth=(self.username, self.password)
                )
                # Raises if the server is unreachable or auth is wrong.
                self.driver.verify_connectivity()
                logger.info(f"Connected to Neo4j at {self.uri}")
            except Exception as e:
                logger.error(f"❌ STEP 1.2 FAILED: Neo4j connection error: {e}")
                self.driver = None

    def disconnect(self):
        """Close and release the Neo4j driver if one is open."""
        if self.driver:
            self.driver.close()
            self.driver = None
            logger.info("Neo4j connection closed")

    def execute_query(self, query: str, parameters: Optional[Dict[str, Any]] = None):
        """Run a Cypher query and return all records as plain dicts.

        Args:
            query: Cypher query string.
            parameters: Optional query parameters.

        Raises:
            ConnectionError: if connect() has not been called or failed.
        """
        if not self.driver:
            raise ConnectionError("Not connected to Neo4j database")

        # Materialize results before the session closes so callers get a
        # plain list instead of a consumed cursor.
        with self.driver.session(database=self.database) as session:
            result = session.run(query, parameters or {})
            records = [record.data() for record in result]
            return records
78
+
79
+
80
class GraphRAGSetup:
    """
    Main setup class for graph-based retrieval system.

    Handles core initialization and configuration:
    - Database connections (Neo4j and vector database)
    - Model initialization and configuration
    - Graph statistics and validation
    - Search configuration loading

    Every stage is best-effort: failures are logged and the corresponding
    attribute is left None/empty, so callers should check
    validate_system_readiness() before issuing queries.
    """

    def __init__(self):
        logger.info("Starting graph system initialization")

        # Initialize core components to safe defaults; the step sequence
        # below fills them in (or leaves them unset on failure).
        self.config = MY_CONFIG # Add config attribute for GraphQueryEngine
        self.neo4j_conn = None          # Neo4jConnection once _setup_neo4j succeeds
        self.query_engine = None        # LlamaIndex query engine over the vector store
        self.graph_stats = {}           # node/relationship/community counts
        self.drift_config = {}          # DRIFT metadata loaded from Neo4j nodes
        self.llm = None                 # LiteLLM instance
        self.embedding_model = None     # HuggingFace embedding model

        # Execute Step 1 initialization sequence
        self._execute_step1_sequence()

        logger.info("Graph system initialization complete")

    def _execute_step1_sequence(self):
        """Execute complete Step 1 initialization sequence"""
        # STEP 1.1-1.6: Initialize all components, in dependency order —
        # Neo4j first, since graph statistics and DRIFT configuration are
        # both read from it afterwards.
        self._setup_neo4j()  # STEP 1.2
        self._setup_vector_search()  # STEP 1.3-1.6
        self._load_graph_statistics()  # STEP 2.1-2.4
        self._load_drift_configuration()  # STEP 2.5

    def _setup_neo4j(self):
        """STEP 1.2: Initialize Neo4j driver with verification"""
        try:
            logger.info("Initializing Neo4j connection...")
            self.neo4j_conn = Neo4jConnection()
            self.neo4j_conn.connect()

            # Verify connection with test query (cheap count over all nodes)
            if self.neo4j_conn.driver:
                test_result = self.neo4j_conn.execute_query("MATCH (n) RETURN count(n) as total_nodes LIMIT 1")
                node_count = test_result[0]['total_nodes'] if test_result else 0
                logger.info(f"Neo4j connected - {node_count} nodes found")

        except Exception as e:
            # Best-effort: a missing graph DB degrades features but does not
            # abort initialization.
            logger.error(f"Neo4j connection error: {e}")
            self.neo4j_conn = None

    def _setup_vector_search(self):
        """STEP 1.3-1.6: Initialize vector database and LLM components"""
        try:
            logger.info("Setting up vector search and LLM...")

            # STEP 1.5: Load embedding model (also registered as the
            # global LlamaIndex default via Settings).
            self.embedding_model = HuggingFaceEmbedding(
                model_name=MY_CONFIG.EMBEDDING_MODEL
            )
            Settings.embed_model = self.embedding_model
            logger.info(f"Embedding model loaded: {MY_CONFIG.EMBEDDING_MODEL}")

            # STEP 1.6: Connect to vector database based on configuration
            # (managed cloud cluster vs. local Milvus instance).
            if MY_CONFIG.VECTOR_DB_TYPE == "cloud_zilliz":
                if not MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT or not MY_CONFIG.ZILLIZ_TOKEN:
                    raise ValueError("Cloud database configuration missing. Set ZILLIZ_CLUSTER_ENDPOINT and ZILLIZ_TOKEN in .env")

                # overwrite=False: attach to the existing collection, never drop it.
                vector_store = MilvusVectorStore(
                    uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
                    token=MY_CONFIG.ZILLIZ_TOKEN,
                    dim=MY_CONFIG.EMBEDDING_LENGTH,
                    collection_name=MY_CONFIG.COLLECTION_NAME,
                    overwrite=False
                )
                storage_context = StorageContext.from_defaults(vector_store=vector_store)
                logger.info("Connected to cloud vector database")
            else:
                vector_store = MilvusVectorStore(
                    uri=MY_CONFIG.MILVUS_URI_HYBRID_GRAPH,
                    dim=MY_CONFIG.EMBEDDING_LENGTH,
                    collection_name=MY_CONFIG.COLLECTION_NAME,
                    overwrite=False
                )
                storage_context = StorageContext.from_defaults(vector_store=vector_store)
                logger.info("Connected to local vector database")

            # Wrap the existing collection in an index (no re-ingestion).
            index = VectorStoreIndex.from_vector_store(
                vector_store=vector_store, storage_context=storage_context)
            logger.info("Vector index loaded successfully")

            # STEP 1.4: Initialize LLM provider (also set as global default).
            llm_model = MY_CONFIG.LLM_MODEL
            self.llm = LiteLLM(model=llm_model)
            Settings.llm = self.llm
            logger.info(f"LLM initialized: {llm_model}")

            self.query_engine = index.as_query_engine()

        except Exception as e:
            # Any failure above leaves the engine unusable; callers check
            # query_engine via validate_system_readiness().
            logger.error(f"Vector setup error: {e}")
            self.query_engine = None

    def _load_graph_statistics(self):
        """STEP 2.1-2.4: Load and validate graph data structure"""
        try:
            logger.info("Loading graph statistics and validation...")

            if not self.neo4j_conn or not self.neo4j_conn.driver:
                logger.warning("No Neo4j connection for statistics")
                return

            # STEP 2.1: Get node and relationship counts.
            # community_count counts distinct n.community_id values
            # (nodes without the property are ignored by count()).
            stats_query = """
            MATCH (n)
            OPTIONAL MATCH ()-[r]-()
            RETURN count(DISTINCT n) as node_count,
                   count(DISTINCT r) as relationship_count,
                   count(DISTINCT n.community_id) as community_count
            """

            result = self.neo4j_conn.execute_query(stats_query)
            if result:
                stats = result[0]
                self.graph_stats = {
                    'node_count': stats.get('node_count', 0),
                    'relationship_count': stats.get('relationship_count', 0),
                    'community_count': stats.get('community_count', 0)
                }

                logger.info(f"Graph validated - {self.graph_stats['node_count']} nodes, "
                            f"{self.graph_stats['relationship_count']} relationships, "
                            f"{self.graph_stats['community_count']} communities")

        except Exception as e:
            logger.error(f"Graph statistics error: {e}")
            self.graph_stats = {}

    def _load_drift_configuration(self):
        """STEP 2.5: Load DRIFT search metadata and configuration"""
        logger.info("Loading search configuration...")

        if not self.neo4j_conn or not self.neo4j_conn.driver:
            logger.warning("No Neo4j connection for search configuration")
            self.drift_config = {}
            return

        # Query for all DRIFT-related nodes in one round trip.
        # NOTE(review): with multiple nodes per label this OPTIONAL MATCH
        # chain produces a cross product; only the first row is consumed
        # below — confirm each metadata label is a singleton.
        drift_metadata_query = """
        OPTIONAL MATCH (dm:DriftMetadata)
        OPTIONAL MATCH (dc:DriftConfiguration)
        OPTIONAL MATCH (csi:CommunitySearchIndex)
        OPTIONAL MATCH (gm:GraphMetadata)
        OPTIONAL MATCH (cm:CommunitiesMetadata)
        RETURN dm, dc, csi, gm, cm
        """

        result = self.neo4j_conn.execute_query(drift_metadata_query)
        if result and result[0]:
            record = result[0]
            drift_config = {}

            # Extract DriftMetadata properties (merged at the top level)
            if record.get('dm'):
                dm_props = dict(record['dm'])
                drift_config.update(dm_props)
                logger.info("DriftMetadata node found")

            # Extract DriftConfiguration properties
            if record.get('dc'):
                dc_props = dict(record['dc'])
                drift_config['configuration'] = dc_props
                logger.info("DriftConfiguration node found")

            # Extract CommunitySearchIndex properties
            if record.get('csi'):
                csi_props = dict(record['csi'])
                drift_config['community_search_index'] = csi_props
                logger.info("CommunitySearchIndex node found")

            # Extract GraphMetadata properties
            if record.get('gm'):
                gm_props = dict(record['gm'])
                drift_config['graph_metadata'] = gm_props
                logger.info("GraphMetadata node found")

            # Extract CommunitiesMetadata properties
            if record.get('cm'):
                cm_props = dict(record['cm'])
                drift_config['communities_metadata'] = cm_props
                logger.info("CommunitiesMetadata node found")

            self.drift_config = drift_config
            logger.info("Search configuration loaded from Neo4j nodes")

        else:
            logger.warning("No metadata nodes found in Neo4j")
            self.drift_config = {}

    def validate_system_readiness(self):
        """Validate all required components are initialized.

        Returns True only when both the Neo4j driver and the vector query
        engine are available; missing graph statistics only warn.
        """
        ready = True

        if not self.neo4j_conn or not self.neo4j_conn.driver:
            logger.error("Neo4j connection not available")
            ready = False

        if not self.query_engine:
            logger.error("Vector query engine not available")
            ready = False

        if not self.graph_stats:
            logger.warning("Graph statistics not loaded")

        if ready:
            logger.info("System readiness validated")

        return ready

    def get_system_status(self):
        """Get detailed system status information as a plain dict."""
        return {
            "neo4j_connected": bool(self.neo4j_conn and self.neo4j_conn.driver),
            "vector_engine_ready": bool(self.query_engine),
            "graph_stats_loaded": bool(self.graph_stats),
            "drift_config_loaded": bool(self.drift_config),
            "llm_ready": bool(self.llm),
            "graph_stats": self.graph_stats,
            "drift_config": self.drift_config
        }

    async def cleanup_async_tasks(self, timeout: float = 2.0) -> None:
        """
        Clean up async tasks and pending operations.

        Handles proper cleanup of LiteLLM and other async tasks to prevent
        'Task was destroyed but it is pending!' warnings.

        Args:
            timeout: Seconds to wait for tasks to finish cancelling.
        """
        try:
            import asyncio

            # Prefer the project-provided cleanup helper if installed.
            try:
                from litellm_patch import cleanup_all_async_tasks
                await cleanup_all_async_tasks(timeout=timeout)
                logger.info(f"Cleaned up async tasks with timeout {timeout}s")
            except ImportError:
                # Fallback: Cancel pending tasks manually.
                pending_tasks = [task for task in asyncio.all_tasks() if not task.done()]
                if pending_tasks:
                    logger.info(f"Cancelling {len(pending_tasks)} pending tasks")
                    for task in pending_tasks:
                        task.cancel()

                    # Wait for cancellation with timeout; stragglers are
                    # logged but not re-raised.
                    try:
                        await asyncio.wait_for(
                            asyncio.gather(*pending_tasks, return_exceptions=True),
                            timeout=timeout
                        )
                    except asyncio.TimeoutError:
                        logger.warning("Some tasks did not complete within timeout")

        except Exception as e:
            logger.error(f"Error during async cleanup: {e}")

    def close(self):
        """Clean up all connections"""
        if self.neo4j_conn:
            self.neo4j_conn.disconnect()
        logger.info("Setup cleanup complete")
353
+
354
+
355
def create_graphrag_setup():
    """Build and return a fully initialized GraphRAGSetup instance."""
    setup_instance = GraphRAGSetup()
    return setup_instance
358
+
359
+
360
+ # Exports
361
+ __all__ = ['GraphRAGSetup', 'create_graphrag_setup']
query_graph_functions/vector_augmentation.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector augmentation engine implementing Phase E (Steps 13-14).
3
+
4
+ Handles vector search operations and result fusion:
5
+ - Vector similarity search for additional context (Step 13)
6
+ - Result fusion strategy for enhanced answers (Step 14)
7
+ """
8
+
9
+ import logging
10
+ import numpy as np
11
+ from typing import Dict, List, Any, Tuple
12
+ from dataclasses import dataclass
13
+ from datetime import datetime
14
+
15
+ from .setup import GraphRAGSetup
16
+ from .query_preprocessing import DriftRoutingResult
17
+
18
+
19
@dataclass
class VectorSearchResult:
    """Single document hit from vector similarity search.

    Produced by VectorAugmentationEngine._perform_vector_search; only hits
    whose similarity_score meets the engine's threshold are kept.
    """
    document_id: str  # 'doc_id' from node metadata, or synthetic "doc_<i>" fallback
    content: str  # extracted node text (full content, not truncated)
    similarity_score: float  # score reported by the vector engine (0.8 default if absent)
    metadata: Dict[str, Any]  # node metadata / extra_info passed through as-is
    source_type: str  # 'vector_db', 'semantic_search'
    relevance_score: float  # similarity_score * 0.9 (vector hits weighted slightly down)
+
29
+
30
+ @dataclass
31
+ class AugmentationResult:
32
+ """Phase E augmentation result with enhanced context."""
33
+ vector_results: List[VectorSearchResult]
34
+ enhanced_context: str
35
+ fusion_strategy: str
36
+ augmentation_confidence: float
37
+ execution_time: float
38
+ metadata: Dict[str, Any]
39
+
40
+
41
+ class VectorAugmentationEngine:
42
+ def __init__(self, setup: GraphRAGSetup):
43
+ self.setup = setup
44
+ self.vector_engine = setup.query_engine # Milvus vector engine
45
+ self.embedding_model = setup.embedding_model
46
+ self.config = setup.config
47
+ self.logger = logging.getLogger(self.__class__.__name__)
48
+
49
+ # Vector search parameters
50
+ self.similarity_threshold = 0.75
51
+ self.max_vector_results = 10
52
+
53
+ async def execute_vector_augmentation_phase(self,
54
+ query_embedding: List[float],
55
+ graph_results: Dict[str, Any],
56
+ routing_result: DriftRoutingResult) -> AugmentationResult:
57
+ """
58
+ Execute vector augmentation phase with similarity search.
59
+
60
+ Args:
61
+ query_embedding: Query vector for similarity matching
62
+ graph_results: Results from graph-based search
63
+ routing_result: Routing decision parameters
64
+
65
+ Returns:
66
+ Augmentation results with vector context
67
+ """
68
+ start_time = datetime.now()
69
+
70
+ try:
71
+ # Step 13: Vector Similarity Search
72
+ self.logger.info("Starting Step 13: Vector Similarity Search")
73
+ vector_results = await self._perform_vector_search(
74
+ query_embedding, routing_result
75
+ )
76
+
77
+ # Step 14: Result Fusion and Enhancement
78
+ self.logger.info("Starting Step 14: Result Fusion and Enhancement")
79
+ enhanced_context = await self._fuse_results(
80
+ vector_results, graph_results, routing_result
81
+ )
82
+
83
+ execution_time = (datetime.now() - start_time).total_seconds()
84
+
85
+ augmentation_result = AugmentationResult(
86
+ vector_results=vector_results,
87
+ enhanced_context=enhanced_context,
88
+ fusion_strategy='graph_vector_hybrid',
89
+ augmentation_confidence=self._calculate_augmentation_confidence(vector_results),
90
+ execution_time=execution_time,
91
+ metadata={
92
+ 'vector_results_count': len(vector_results),
93
+ 'avg_similarity': np.mean([r.similarity_score for r in vector_results]) if vector_results else 0,
94
+ 'phase': 'vector_augmentation',
95
+ 'step_range': '13-14'
96
+ }
97
+ )
98
+
99
+ self.logger.info(f"Phase E completed: {len(vector_results)} vector results, augmentation confidence: {augmentation_result.augmentation_confidence:.3f}")
100
+ return augmentation_result
101
+
102
+ except Exception as e:
103
+ self.logger.error(f"Vector augmentation phase failed: {e}")
104
+ # Return empty augmentation on failure
105
+ return AugmentationResult(
106
+ vector_results=[],
107
+ enhanced_context="",
108
+ fusion_strategy='graph_only',
109
+ augmentation_confidence=0.0,
110
+ execution_time=(datetime.now() - start_time).total_seconds(),
111
+ metadata={'error': str(e), 'fallback': True}
112
+ )
113
+
114
+ async def _perform_vector_search(self,
115
+ query_embedding: List[float],
116
+ routing_result: DriftRoutingResult) -> List[VectorSearchResult]:
117
+ """
118
+ Step 13: Perform comprehensive vector similarity search.
119
+
120
+ Uses the Milvus vector database to find semantically similar content.
121
+ """
122
+ try:
123
+ vector_results = []
124
+
125
+ # Use the existing vector query engine for similarity search
126
+ if self.vector_engine:
127
+ # Query the vector database with the embedding
128
+ search_results = self.vector_engine.query(routing_result.original_query)
129
+
130
+ # Extract vector search results from the response
131
+ if hasattr(search_results, 'source_nodes') and search_results.source_nodes:
132
+ for i, node in enumerate(search_results.source_nodes[:self.max_vector_results]):
133
+ # Calculate similarity score (handle different node types)
134
+ similarity_score = 0.8 # Default similarity
135
+ if hasattr(node, 'score'):
136
+ similarity_score = node.score
137
+ elif hasattr(node, 'similarity'):
138
+ similarity_score = node.similarity
139
+ elif hasattr(node, 'metadata') and 'score' in node.metadata:
140
+ similarity_score = node.metadata['score']
141
+
142
+ # Extract content (handle different node types)
143
+ content = ""
144
+ if hasattr(node, 'text'):
145
+ content = node.text
146
+ elif hasattr(node, 'content'):
147
+ content = node.content
148
+ elif hasattr(node, 'get_content'):
149
+ content = node.get_content()
150
+ else:
151
+ content = str(node)
152
+
153
+ # Extract metadata safely
154
+ node_metadata = {}
155
+ if hasattr(node, 'metadata') and node.metadata:
156
+ node_metadata = node.metadata
157
+ elif hasattr(node, 'extra_info') and node.extra_info:
158
+ node_metadata = node.extra_info
159
+
160
+ vector_result = VectorSearchResult(
161
+ document_id=node_metadata.get('doc_id', f"doc_{i}"),
162
+ content=content,
163
+ similarity_score=similarity_score,
164
+ metadata=node_metadata,
165
+ source_type='vector_db',
166
+ relevance_score=similarity_score * 0.9 # Slightly weighted down
167
+ )
168
+
169
+ # Only include results above similarity threshold
170
+ if similarity_score >= self.similarity_threshold:
171
+ vector_results.append(vector_result)
172
+
173
+ self.logger.info(f"Vector search completed: {len(vector_results)} results above threshold {self.similarity_threshold}")
174
+ else:
175
+ self.logger.warning("Vector engine not available, skipping vector search")
176
+
177
+ return vector_results
178
+
179
+ except Exception as e:
180
+ self.logger.error(f"Vector search failed: {e}")
181
+ return []
182
+
183
+ async def _fuse_results(self,
184
+ vector_results: List[VectorSearchResult],
185
+ graph_results: Dict[str, Any],
186
+ routing_result: DriftRoutingResult) -> str:
187
+ """
188
+ Step 14: Fuse vector and graph results for enhanced context.
189
+
190
+ Combines graph-based entity relationships with vector similarity content.
191
+ """
192
+ try:
193
+ fusion_parts = []
194
+
195
+ # Start with graph-based context (Phase C & D results)
196
+ if 'initial_answer' in graph_results:
197
+ initial_answer = graph_results['initial_answer']
198
+ if isinstance(initial_answer, dict) and 'content' in initial_answer:
199
+ fusion_parts.extend([
200
+ "=== GRAPH-BASED KNOWLEDGE ===",
201
+ initial_answer['content'],
202
+ ""
203
+ ])
204
+
205
+ # Add vector-based augmentation
206
+ if vector_results:
207
+ fusion_parts.extend([
208
+ "=== SEMANTIC AUGMENTATION ===",
209
+ "Additional relevant information from vector similarity search:",
210
+ ""
211
+ ])
212
+
213
+ for i, result in enumerate(vector_results[:5], 1): # Top 5 vector results
214
+ fusion_parts.extend([
215
+ f"**{i}. Vector Result (Similarity: {result.similarity_score:.3f})**",
216
+ result.content, # Show full content without truncation
217
+ ""
218
+ ])
219
+
220
+ # Add fusion methodology explanation
221
+ fusion_parts.extend([
222
+ "=== FUSION METHODOLOGY ===",
223
+ "This enhanced answer combines graph-based entity relationships with vector semantic similarity search.",
224
+ "Graph results provide structured knowledge connections, while vector search adds contextual depth.",
225
+ ""
226
+ ])
227
+
228
+ enhanced_context = "\n".join(fusion_parts)
229
+
230
+ self.logger.info(f"Result fusion completed: {len(fusion_parts)} context sections")
231
+ return enhanced_context
232
+
233
+ except Exception as e:
234
+ self.logger.error(f"Result fusion failed: {e}")
235
+ return "Graph-based results only (vector fusion failed)"
236
+
237
+ def _calculate_augmentation_confidence(self, vector_results: List[VectorSearchResult]) -> float:
238
+ """Calculate confidence score for the augmentation results."""
239
+ if not vector_results:
240
+ return 0.0
241
+
242
+ # Base confidence on average similarity and result count
243
+ avg_similarity = np.mean([r.similarity_score for r in vector_results])
244
+ count_factor = min(len(vector_results) / 10, 1.0) # Normalize to max 10 results
245
+
246
+ # Combined confidence
247
+ confidence = (avg_similarity * 0.7) + (count_factor * 0.3)
248
+
249
+ return min(confidence, 1.0)
250
+
251
+ def get_augmentation_stats(self) -> Dict[str, Any]:
252
+ """Get statistics about vector augmentation performance."""
253
+ return {
254
+ 'similarity_threshold': self.similarity_threshold,
255
+ 'max_vector_results': self.max_vector_results,
256
+ 'vector_engine_ready': bool(self.vector_engine),
257
+ 'embedding_model': str(self.embedding_model) if self.embedding_model else None
258
+ }
259
+
260
+
261
+ # Export main class
262
+ __all__ = ['VectorAugmentationEngine', 'VectorSearchResult', 'AugmentationResult']