"""Phase 15 patch script: add graph-augmented answering to the app.

All paths are relative to the current working directory. The script runs
six steps in order:

1. Strip U+FEFF (BOM) characters from every ``app/**/*.py`` file.
2. Write ``app/graph/graph_context_service.py``.
3. Write ``app/schemas/query_schema.py``.
4. Write ``app/generation/prompt_builder.py``.
5. Patch ``app/generation/answer_service.py`` in place.
6. Patch ``app/main.py`` in place.

Steps 5 and 6 replace exact-text anchors. An anchor that is neither found
nor already patched is reported at the end instead of being skipped
silently, because a partially patched ``answer_service.py`` would reference
``use_graph`` / ``graph_entity_limit`` without defining them.
"""

from pathlib import Path
import re

# Human-readable descriptions of patches that could not be applied.
patch_warnings = []


def read_source(path: Path) -> str:
    """Return the text of *path* with every U+FEFF character removed."""
    return path.read_text(encoding="utf-8-sig").replace("\ufeff", "")


def apply_anchor_patch(
    text: str,
    old: str,
    new: str,
    marker: str,
    label: str,
    guarded: bool = True,
) -> str:
    """Replace *old* with *new* in *text* and record a warning on a miss.

    Args:
        text: Current file content.
        old: Exact anchor text to look for.
        new: Replacement text.
        marker: Substring whose presence means the patch is already applied.
        label: Description used in the warning when the anchor is missing.
        guarded: When True, skip the replacement if *marker* is already
            present. When False, replace whenever *old* is found.

    Returns:
        The patched text, or *text* unchanged when nothing was replaced.
    """
    already_patched = marker in text
    if old in text and not (guarded and already_patched):
        return text.replace(old, new)
    if not already_patched:
        # Neither the anchor nor the patched form exists: the target file
        # has drifted from what this script expects.
        patch_warnings.append(label)
    return text


# =====================================================
# 1. Remove BOM from Python files
# =====================================================

for path in Path("app").rglob("*.py"):
    path.write_text(read_source(path), encoding="utf-8")

print("BOM cleanup completed.")


# =====================================================
# 2. Graph context service
# =====================================================

Path("app/graph/graph_context_service.py").write_text(r'''
import re
from typing import Dict, Any, List, Optional

from app.graph.graph_storage import read_document_graph


STOPWORDS = {
    "what", "is", "are", "the", "a", "an", "of", "to", "and", "or",
    "in", "on", "for", "with", "from", "by", "how", "why", "explain",
    "define", "meaning", "does", "do", "it", "this", "that"
}


def tokenize_query(query: str) -> List[str]:
    words = re.findall(r"[a-zA-Z0-9_]+", (query or "").lower())

    return [
        word for word in words
        if word not in STOPWORDS and len(word) > 1
    ]


def entity_relevance_score(entity, query_terms: List[str]) -> float:
    if not query_terms:
        return 0.0

    name_lower = entity.name.lower()
    entity_id_lower = entity.entity_id.lower()

    score = 0.0

    for term in query_terms:
        if term == name_lower or term == entity_id_lower:
            score += 8.0
        elif term in name_lower:
            score += 4.0
        elif term in entity_id_lower:
            score += 3.0

    # The mention-count bonus only ranks entities that matched a term.
    # Added unconditionally it would give every mentioned entity a
    # positive score and let unrelated entities through the score filter.
    if score > 0:
        score += min(entity.mention_count, 10) * 0.15

    return score


def build_graph_context_for_query(
    document_id: Optional[str],
    query: str,
    limit: int = 8
) -> Dict[str, Any]:
    """
    Finds graph entities and relations related to the query.

    This does not replace vector retrieval.
    It adds structured graph context to the final answer pipeline.
    """

    if not document_id:
        return {
            "graph_available": False,
            "reason": "No document_id provided.",
            "matched_entities": [],
            "matched_relations": [],
            "context_text": ""
        }

    graph = read_document_graph(document_id)

    if graph is None:
        return {
            "graph_available": False,
            "reason": "Graph not built for this document.",
            "matched_entities": [],
            "matched_relations": [],
            "context_text": ""
        }

    query_terms = tokenize_query(query)

    scored_entities = []

    for entity in graph.entities:
        score = entity_relevance_score(entity, query_terms)

        if score > 0:
            scored_entities.append((score, entity))

    scored_entities.sort(key=lambda item: item[0], reverse=True)

    matched_entities = [
        entity for score, entity in scored_entities[:limit]
    ]

    matched_entity_ids = {
        entity.entity_id for entity in matched_entities
    }

    matched_relations = []

    for relation in graph.relations:
        if (
            relation.source_entity_id in matched_entity_ids
            or relation.target_entity_id in matched_entity_ids
        ):
            matched_relations.append(relation)

    matched_relations = sorted(
        matched_relations,
        key=lambda relation: relation.weight,
        reverse=True
    )[:limit]

    context_text = build_graph_context_text(
        matched_entities=matched_entities,
        matched_relations=matched_relations
    )

    return {
        "graph_available": True,
        "document_id": document_id,
        "source_file_name": graph.source_file_name,
        "query_terms": query_terms,
        "matched_entities": [
            {
                "entity_id": entity.entity_id,
                "name": entity.name,
                "entity_type": entity.entity_type,
                "mention_count": entity.mention_count,
                "pages": entity.pages[:10],
                "chunk_ids": entity.chunk_ids[:10]
            }
            for entity in matched_entities
        ],
        "matched_relations": [
            {
                "relation_id": relation.relation_id,
                "source": relation.source_name,
                "relation_type": relation.relation_type,
                "target": relation.target_name,
                "weight": relation.weight,
                "pages": relation.pages[:10],
                "chunk_ids": relation.chunk_ids[:10]
            }
            for relation in matched_relations
        ],
        "context_text": context_text
    }


def build_graph_context_text(
    matched_entities,
    matched_relations
) -> str:
    lines = []

    if matched_entities:
        lines.append("Relevant graph entities:")

        for entity in matched_entities:
            pages = ", ".join(str(page) for page in entity.pages[:5])
            lines.append(
                f"- {entity.name} ({entity.entity_type}), mentions={entity.mention_count}, pages={pages}"
            )

    if matched_relations:
        lines.append("")
        lines.append("Relevant graph relations:")

        for relation in matched_relations:
            lines.append(
                f"- {relation.source_name} --{relation.relation_type}--> {relation.target_name} "
                f"(weight={relation.weight})"
            )

    return "\n".join(lines).strip()
''', encoding="utf-8")


# =====================================================
# 3. Patch query_schema.py
# =====================================================

Path("app/schemas/query_schema.py").write_text(r'''
from pydantic import BaseModel, Field
from typing import Optional, Literal


class AskRequest(BaseModel):
    query: str = Field(..., min_length=1)
    document_id: Optional[str] = None
    top_k: int = Field(default=5, ge=1, le=20)
    retrieval_mode: Literal["vector", "keyword", "hybrid"] = "hybrid"
    use_reranker: bool = True
    use_llm: bool = True

    # Phase 15:
    # Adds graph context from entities and relations when document graph exists.
    use_graph: bool = True
    graph_entity_limit: int = Field(default=8, ge=1, le=30)
''', encoding="utf-8")


# =====================================================
# 4. Patch prompt_builder.py
# =====================================================

Path("app/generation/prompt_builder.py").write_text(r'''
from app.generation.question_classifier import get_answer_instruction


def build_grounded_prompt(
    query: str,
    evidence_context: str,
    question_type: str
) -> str:
    """
    Builds a compact prompt.

    In Phase 15, evidence_context may contain:
    - retrieved source evidence
    - graph entity context
    - graph relation context

    The LLM still must answer only from supplied context.
    """

    instruction = get_answer_instruction(question_type)

    return f"""
Answer the question using only the supplied context.

Question type:
{question_type}

Instruction:
{instruction}

Rules:
- Do not use outside knowledge.
- Preserve citations like [S1] and [S2] when making factual claims from retrieved sources.
- Graph context can help explain entity relationships, but do not invent facts from it.
- If retrieved source evidence and graph context disagree, trust retrieved source evidence.
- Give a clear final answer, not notes.

Question:
{query}

Context:
{evidence_context}

Final answer:
""".strip()
''', encoding="utf-8")


# =====================================================
# 5. Patch answer_service.py safely
# =====================================================

# Import line prepended to any patched file that does not already have it.
GRAPH_IMPORT = (
    "from app.graph.graph_context_service import build_graph_context_for_query"
)

answer_path = Path("app/generation/answer_service.py")
text = read_source(answer_path)

if GRAPH_IMPORT not in text:
    text = GRAPH_IMPORT + "\n" + text

# NOTE(review): every anchor below must match the target file character for
# character, including leading indentation. Confirm the indentation against
# the current answer_service.py; a mismatch is reported at the end of the run.

# Add graph params to function signature
old_signature = '''    use_reranker: bool = True,
    use_llm: bool = True
) -> Dict[str, Any]:
'''

new_signature = '''    use_reranker: bool = True,
    use_llm: bool = True,
    use_graph: bool = True,
    graph_entity_limit: int = 8
) -> Dict[str, Any]:
'''

# Unguarded: the old anchor cannot survive the replacement, so re-running
# is already a no-op. The marker is used only to tell "already patched"
# apart from "anchor missing" when reporting.
text = apply_anchor_patch(
    text,
    old_signature,
    new_signature,
    marker="graph_entity_limit: int = 8",
    label="answer_service.py: function signature (use_reranker/use_llm parameters)",
    guarded=False,
)

# Add graph context after evidence_context construction
old_context_line = '''    evidence_context = build_evidence_context(evidence_items)
'''

# Not a raw string: "\\n" is written to the target file as the escape "\n".
new_context_block = '''    evidence_context = build_evidence_context(evidence_items)

    graph_context = build_graph_context_for_query(
        document_id=document_id,
        query=query,
        limit=graph_entity_limit
    ) if use_graph else {
        "graph_available": False,
        "reason": "Graph usage disabled.",
        "matched_entities": [],
        "matched_relations": [],
        "context_text": ""
    }

    graph_context_text = graph_context.get("context_text", "")

    if graph_context_text:
        evidence_context = (
            evidence_context
            + "\\n\\nStructured graph context:\\n"
            + graph_context_text
        )
'''

# Guarded: the replacement still contains the anchor line, so the marker is
# what prevents the block from being inserted again on a second run.
text = apply_anchor_patch(
    text,
    old_context_line,
    new_context_block,
    marker="Structured graph context",
    label="answer_service.py: evidence_context construction line",
)

# Add graph info to final return dictionary before citations
old_return_part = '''        "citations": citations,
        "evidence": evidence_items,
        "sources": sourced_results
'''

new_return_part = '''        "graph_used": bool(graph_context.get("matched_entities") or graph_context.get("matched_relations")),
        "graph_context": graph_context,
        "citations": citations,
        "evidence": evidence_items,
        "sources": sourced_results
'''

text = apply_anchor_patch(
    text,
    old_return_part,
    new_return_part,
    marker='"graph_context": graph_context',
    label="answer_service.py: return dictionary (citations/evidence/sources)",
)

answer_path.write_text(text, encoding="utf-8")


# =====================================================
# 6. Patch main.py
# =====================================================

main_path = Path("app/main.py")
text = read_source(main_path)

# NOTE(review): confirm the indentation of this anchor against main.py.
old_call = '''        use_reranker=request.use_reranker,
        use_llm=request.use_llm
'''

new_call = '''        use_reranker=request.use_reranker,
        use_llm=request.use_llm,
        use_graph=request.use_graph,
        graph_entity_limit=request.graph_entity_limit
'''

text = apply_anchor_patch(
    text,
    old_call,
    new_call,
    marker="graph_entity_limit=request.graph_entity_limit",
    label="main.py: answer call arguments (use_reranker/use_llm)",
)

if GRAPH_IMPORT not in text:
    text = GRAPH_IMPORT + "\n" + text

# Every earlier phase label is rewritten to the current phase name.
old_phases = [
    "Phase 14.1 - Graph Visualization UI",
    "Phase 14 - Graph Foundation Entity Relation Extraction",
    "Phase 13 - Deployment Demo Stabilization",
    "Phase 12 - Hugging Face Hosted LLM Provider Hardening",
]

for old in old_phases:
    text = text.replace(old, "Phase 15 - Graph-Augmented Answering")

if "# Graph context debug endpoint" not in text:
    # The appended endpoint calls Query(...). If main.py never mentions that
    # name, the endpoint would raise NameError when the module is imported.
    if re.search(r"\bQuery\b", text) is None:
        patch_warnings.append(
            "main.py: no reference to `Query` found; the appended endpoint "
            "needs it in scope (e.g. `from fastapi import Query`)"
        )

    text += '''

# Graph context debug endpoint
@app.get("/documents/{document_id}/graph/context")
def get_graph_context_for_question(
    document_id: str,
    query: str = Query(..., min_length=1),
    limit: int = Query(8, ge=1, le=30)
):
    return build_graph_context_for_query(
        document_id=document_id,
        query=query,
        limit=limit
    )
'''

main_path.write_text(text, encoding="utf-8")

if patch_warnings:
    # Report instead of claiming success: the files were written, but at
    # least one expected edit did not happen.
    print("WARNING: Phase 15 patch is incomplete. Review these items:")
    for warning in patch_warnings:
        print(f"  - {warning}")
else:
    print("Phase 15 graph-augmented answering patch applied successfully.")