Spaces:

yugbirla
/

GraphResearcher

Sleeping

App Files Files Community

GraphResearcher / scripts /phase37_backend_compare_endpoint.py

yugbirla

Add backend document comparison endpoint

9b441aa 14 days ago

Raw

History Blame Contribute Delete

16.6 kB

	from pathlib import Path

	# Clean BOM
	# Strip UTF-8 byte-order marks from the Python files under app/.
	# Only files whose bytes actually contain a BOM are rewritten, so files
	# that need no cleaning keep their timestamps and line endings.
	bom_bytes = "\ufeff".encode("utf-8")

	for path in Path("app").rglob("*.py"):
	    if bom_bytes not in path.read_bytes():
	        continue

	    # "utf-8-sig" drops a leading BOM; replace() removes any others.
	    text = path.read_text(encoding="utf-8-sig")
	    text = text.replace("\ufeff", "")
	    path.write_text(text, encoding="utf-8")

	# Directory that receives the generated compare service module.
	Path("app/product").mkdir(parents=True, exist_ok=True)

	# =====================================================
	# 1. Backend compare service
	# =====================================================

	Path("app/product/document_compare_service.py").write_text(r'''
	import inspect
	import json
	from typing import Any, Dict, List, Optional

	from fastapi import HTTPException
	from pydantic import BaseModel, Field


	class CompareDocumentsRequest(BaseModel):
	    # Request body accepted by the document comparison endpoint.

	    # Required: the two documents to compare and the question asked of both.
	    primary_document_id: str = Field(..., description="First document ID")
	    compare_document_id: str = Field(..., description="Second document ID")
	    query: str = Field(..., description="User comparison question")

	    # Optional retrieval / generation settings with their defaults.
	    retrieval_mode: str = Field(default="hybrid")
	    top_k: int = Field(default=8)
	    use_reranker: bool = Field(default=True)
	    use_llm: bool = Field(default=True)
	    use_graph: bool = Field(default=True)
	    graph_entity_limit: int = Field(default=12)
	    use_graph_retrieval: bool = Field(default=True)
	    graph_retrieval_top_k: int = Field(default=6)

	    # Accepted from the client; build_ask_payload does not forward it.
	    answer_style: str = Field(default="comparison")


	def response_to_dict(value: Any) -> Dict[str, Any]:
	    """
	    Convert whatever the /ask handler returned into a plain dictionary.

	    Strategies are tried in this order:
	      1. ``None`` becomes an empty dictionary.
	      2. A dictionary is returned unchanged.
	      3. An object with a ``body`` attribute has that body parsed as JSON.
	      4. An object with ``model_dump()`` or ``dict()`` is dumped.
	      5. Anything else is stringified under the "raw_response" key.

	    The result is always a dictionary, so callers can use ``.get`` safely.
	    A JSON body that decodes to something other than an object (a list,
	    a number, null, ...) is wrapped as ``{"raw_response": <decoded value>}``.
	    """
	    if value is None:
	        return {}

	    if isinstance(value, dict):
	        return value

	    if hasattr(value, "body"):
	        body = value.body
	        if isinstance(body, (bytes, bytearray)):
	            body = bytes(body)
	        try:
	            if isinstance(body, bytes):
	                body = body.decode("utf-8")
	            parsed = json.loads(body)
	        except (TypeError, ValueError):
	            # Not JSON text (or not decodable); try the other strategies.
	            pass
	        else:
	            if isinstance(parsed, dict):
	                return parsed
	            return {"raw_response": parsed}

	    for method_name in ("model_dump", "dict"):
	        dump = getattr(value, method_name, None)
	        if not callable(dump):
	            continue
	        try:
	            dumped = dump()
	        except Exception:
	            # Best effort: the dump method is arbitrary third-party code,
	            # so any failure simply moves on to the next strategy.
	            continue
	        if isinstance(dumped, dict):
	            return dumped

	    return {
	        "raw_response": str(value)
	    }


	def get_model_fields(model_cls) -> set:
	    """
	    Return the set of field names declared on a model class.

	    Reads ``model_fields`` when that attribute exists and is not None,
	    otherwise falls back to ``__fields__`` (empty when that is absent too).
	    """
	    field_map = getattr(model_cls, "model_fields", None)

	    if field_map is not None:
	        return set(field_map.keys())

	    return set(getattr(model_cls, "__fields__", {}).keys())


	def build_ask_payload(
	    document_id: str,
	    query: str,
	    request: CompareDocumentsRequest
	) -> Dict[str, Any]:
	    """
	    Build the /ask payload for one document of a comparison.

	    The query and document ID come from the arguments; the retrieval
	    settings listed below are copied from ``request`` under the same names.
	    """
	    forwarded_settings = (
	        "top_k",
	        "retrieval_mode",
	        "use_reranker",
	        "use_llm",
	        "use_graph",
	        "graph_entity_limit",
	        "use_graph_retrieval",
	        "graph_retrieval_top_k",
	    )

	    payload: Dict[str, Any] = {"query": query, "document_id": document_id}
	    payload.update(
	        (setting, getattr(request, setting)) for setting in forwarded_settings
	    )
	    return payload


	def _first_present(src: Dict[str, Any], keys: List[str], default: Any) -> Any:
	    """Return the first value under ``keys`` that is neither None nor ""."""
	    for key in keys:
	        value = src.get(key)
	        if value is not None and value != "":
	            return value
	    return default


	def extract_sources(
	    response: Dict[str, Any],
	    limit: Optional[int] = 8
	) -> List[Dict[str, Any]]:
	    """
	    Collect, normalise and de-duplicate the source entries of an /ask response.

	    Entries are gathered, in this order, from "citations", from
	    "retrieval_fusion" -> "fused_results", and then from "sources",
	    "source_chunks" and "retrieved_sources". Containers that are not
	    lists/tuples and entries that are not dictionaries are ignored.

	    Args:
	        response: The /ask response as a dictionary.
	        limit: Maximum number of entries to return (default 8); None
	            returns every entry.

	    Returns:
	        Dictionaries with the keys "source_id", "chunk_id", "page",
	        "document_name", "preview" and "raw" (the untouched entry).
	    """
	    def _dict_entries(container: Any) -> List[Dict[str, Any]]:
	        if not isinstance(container, (list, tuple)):
	            return []
	        return [entry for entry in container if isinstance(entry, dict)]

	    sources = _dict_entries(response.get("citations"))

	    fusion = response.get("retrieval_fusion")
	    if isinstance(fusion, dict):
	        sources.extend(_dict_entries(fusion.get("fused_results")))

	    for key in ["sources", "source_chunks", "retrieved_sources"]:
	        sources.extend(_dict_entries(response.get(key)))

	    cleaned = []
	    seen = set()

	    for index, src in enumerate(sources):
	        # Identifier and page lookups keep a legitimate 0 (for example a
	        # zero-based page or chunk index) instead of treating it as missing.
	        source_id = _first_present(
	            src, ["source_id", "citation_id", "id"], f"S{index + 1}"
	        )
	        chunk_id = _first_present(
	            src, ["chunk_id", "source_chunk_id", "chunk"], source_id
	        )
	        page = _first_present(
	            src, ["page", "page_number", "page_no"], "Not available"
	        )

	        # A string key stays hashable even when an ID is a list or dict.
	        key = f"{source_id}|{chunk_id}|{page}"

	        if key in seen:
	            continue

	        seen.add(key)

	        cleaned.append({
	            "source_id": source_id,
	            "chunk_id": chunk_id,
	            "page": page,
	            "document_name": (
	                src.get("document_name")
	                or src.get("source_file_name")
	                or src.get("file_name")
	                or src.get("filename")
	                or "Selected document"
	            ),
	            "preview": (
	                src.get("text_preview")
	                or src.get("preview")
	                or src.get("chunk_preview")
	                or src.get("text")
	                or src.get("content")
	                or ""
	            ),
	            "raw": src
	        })

	    return cleaned[:limit]


	def make_compare_question(user_query: str) -> str:
	    """
	    Return the retrieval query for a comparison: the user's text with
	    surrounding whitespace removed.

	    No formatting instructions are added on purpose, because long
	    prompts hurt semantic retrieval.
	    """
	    trimmed_query = user_query.strip()
	    return trimmed_query


	async def call_existing_ask_endpoint(app, payload: Dict[str, Any]) -> Dict[str, Any]:
	    """
	    Call the application's existing POST /ask handler in-process.

	    The handler function is looked up on ``app.routes`` and called
	    directly rather than over HTTP. Only the parameter(s) recognised as
	    the request body receive a value; anything else the handler declares
	    is left to its own default.

	    Args:
	        app: The application object whose ``routes`` contain POST /ask.
	        payload: Candidate fields for AskRequest; keys the schema does not
	            declare are dropped before the request object is built.

	    Returns:
	        The handler's result converted with ``response_to_dict``.

	    Raises:
	        HTTPException: 500 when the route or the AskRequest schema cannot
	            be found, or the handler fails; 400 when AskRequest cannot be
	            built from the payload. An HTTPException raised by the
	            handler itself is passed through unchanged.
	    """
	    ask_route = next(
	        (
	            route
	            for route in app.routes
	            if getattr(route, "path", "") == "/ask"
	            and "POST" in (getattr(route, "methods", None) or set())
	        ),
	        None,
	    )

	    if ask_route is None:
	        raise HTTPException(
	            status_code=500,
	            detail="Could not find existing POST /ask endpoint."
	        )

	    # Imported lazily so a missing schema becomes an HTTP error for this
	    # request instead of an import failure of the whole module.
	    try:
	        from app.schemas.query_schema import AskRequest
	    except Exception as exc:
	        raise HTTPException(
	            status_code=500,
	            detail=f"Could not import AskRequest schema: {exc}"
	        ) from exc

	    allowed_fields = get_model_fields(AskRequest)
	    filtered_payload = {
	        key: value
	        for key, value in payload.items()
	        if key in allowed_fields
	    }

	    try:
	        ask_request = AskRequest(**filtered_payload)
	    except Exception as exc:
	        raise HTTPException(
	            status_code=400,
	            detail=f"Could not build AskRequest for compare endpoint: {exc}"
	        ) from exc

	    endpoint = ask_route.endpoint
	    params = list(inspect.signature(endpoint).parameters.values())

	    try:
	        if len(params) == 0:
	            result = endpoint()
	        elif len(params) == 1:
	            result = endpoint(ask_request)
	        else:
	            # Several parameters: pass the request object to every one
	            # that looks like the body, judged by annotation or by name.
	            body_param_names = {"request", "ask_request", "payload", "body"}
	            kwargs = {
	                param.name: ask_request
	                for param in params
	                if "AskRequest" in str(param.annotation)
	                or param.name in body_param_names
	            }
	            result = endpoint(**kwargs)

	        # The handler may be sync or async; await only when needed.
	        if inspect.isawaitable(result):
	            result = await result

	        return response_to_dict(result)

	    except HTTPException:
	        raise
	    except Exception as exc:
	        raise HTTPException(
	            status_code=500,
	            detail=f"Compare endpoint failed while calling /ask: {exc}"
	        ) from exc


	def build_rule_based_comparison(
	    query: str,
	    answer_a: str,
	    answer_b: str
	) -> str:
	    """
	    Return the fixed guidance text that accompanies the two answers.

	    The text is the same for every call; the arguments are accepted but
	    not read.
	    """
	    summary_section = [
	        "Comparison summary",
	        "The system answered the same question separately against both documents. "
	        "Use the two document-specific answers and source panels to verify the differences.",
	    ]
	    reading_section = [
	        "How to read this comparison",
	        "1. Check Document A answer for claims supported by Document A sources.",
	        "2. Check Document B answer for claims supported by Document B sources.",
	        "3. If one answer is weaker or says evidence is missing, that document likely does not contain enough relevant indexed context for the question.",
	    ]
	    limitation_section = [
	        "Important limitation",
	        "This comparison is evidence-grounded per document. It does not merge unsupported information across documents.",
	    ]

	    # Lines inside a section are newline-separated; sections are separated
	    # by one blank line.
	    sections = (summary_section, reading_section, limitation_section)
	    return "\n\n".join("\n".join(lines) for lines in sections)


	async def compare_documents_with_existing_ask(
	    app,
	    request: CompareDocumentsRequest
	) -> Dict[str, Any]:
	    """
	    Ask the same question of both documents and assemble the comparison.

	    The query is cleaned with ``make_compare_question``, one /ask payload
	    is built per document, and the two calls are awaited one after the
	    other. The result carries the fixed comparison summary plus, for each
	    document, its answer, its extracted sources and the full /ask response.
	    """
	    retrieval_query = make_compare_question(request.query)
	    first_id = request.primary_document_id
	    second_id = request.compare_document_id

	    first_payload = build_ask_payload(
	        document_id=first_id, query=retrieval_query, request=request
	    )
	    second_payload = build_ask_payload(
	        document_id=second_id, query=retrieval_query, request=request
	    )

	    first_response = await call_existing_ask_endpoint(app, first_payload)
	    second_response = await call_existing_ask_endpoint(app, second_payload)

	    first_answer = first_response.get("answer", "")
	    second_answer = second_response.get("answer", "")

	    result: Dict[str, Any] = {
	        "status": "success",
	        "mode": "backend_document_compare",
	        "query": request.query,
	        "primary_document_id": first_id,
	        "compare_document_id": second_id,
	    }
	    result["comparison_summary"] = build_rule_based_comparison(
	        query=request.query,
	        answer_a=first_answer,
	        answer_b=second_answer
	    )

	    # One section per document, built in the same order as the calls.
	    per_document = (
	        ("document_a", first_id, first_answer, first_response),
	        ("document_b", second_id, second_answer, second_response),
	    )
	    for section_key, document_id, answer, ask_response in per_document:
	        result[section_key] = {
	            "document_id": document_id,
	            "answer": answer,
	            "sources": extract_sources(ask_response),
	            "ask_response": ask_response
	        }

	    result["notes"] = [
	        "Retrieval query is kept clean to preserve semantic search quality.",
	        "Each document is queried independently.",
	        "Sources are separated per document for verification."
	    ]
	    return result
	''', encoding="utf-8")


	# =====================================================
	# 2. Patch main.py with backend compare endpoint
	# =====================================================

	import ast

	main_path = Path("app/main.py")
	text = main_path.read_text(encoding="utf-8-sig")
	text = text.replace("\ufeff", "")

	if "from app.product.document_compare_service import" not in text:
	    compare_import = (
	        "from app.product.document_compare_service import "
	        "CompareDocumentsRequest, compare_documents_with_existing_ask"
	    )

	    # "from __future__" imports must come before every other statement,
	    # so the new import is placed after the last one. When there is none
	    # (or main.py does not parse) it goes at the very top.
	    try:
	        module_body = ast.parse(text).body
	    except (SyntaxError, ValueError):
	        module_body = []

	    insert_at = max(
	        (
	            node.end_lineno
	            for node in module_body
	            if isinstance(node, ast.ImportFrom) and node.module == "__future__"
	        ),
	        default=0,
	    )

	    main_lines = text.split("\n")
	    main_lines.insert(insert_at, compare_import)
	    text = "\n".join(main_lines)

	# The marker comment keeps the endpoint from being appended twice.
	if "# Backend document comparison endpoint" not in text:
	    text += '''

	# Backend document comparison endpoint

	@app.post("/documents/compare")
	async def compare_two_documents(request: CompareDocumentsRequest):
	    return await compare_documents_with_existing_ask(
	        app=app,
	        request=request
	    )
	'''

	main_path.write_text(text, encoding="utf-8")


	# =====================================================
	# 3. Patch UI to use backend compare endpoint
	# =====================================================

	# Load the UI module source: "utf-8-sig" drops a leading BOM and the
	# replace() removes any BOM characters left elsewhere in the text.
	hf_path = Path("app/deployment/hf_status.py")
	ui = hf_path.read_text(encoding="utf-8-sig").replace("\ufeff", "")

	append_code = r'''
	# =====================================================
	# Phase 37 override: use backend /documents/compare
	# =====================================================

	# Remember any get_product_app_html already defined earlier in this
	# module (None when there is none) so the override below can wrap it.
	_phase37_previous_get_product_app_html = globals().get("get_product_app_html")


	def get_product_app_html() -> str:
	    """
	    Return the product app HTML with the Phase 37 compare script injected.

	    The page comes from the previously defined get_product_app_html. When
	    there is none, a short fallback page is returned. The script is
	    inserted before "</body>" unless the page already contains the
	    "Phase 37:" marker, so wrapping twice does not inject it twice.
	    """
	    if _phase37_previous_get_product_app_html is None:
	        return "<h1>GraphResearcher App</h1><p>App UI is unavailable.</p>"

	    html = _phase37_previous_get_product_app_html()

	    js = """
	<script>
	/*
	Phase 37:
	Compare mode now uses backend /documents/compare instead of doing only browser-side comparison.
	Single-document chat still uses the previous sendMessage logic.
	*/

	if (!window.phase37OriginalSendMessage) {
	  window.phase37OriginalSendMessage = window.sendMessage;
	}

	// Render one document's answer, preferring the Phase 34 formatter when the page provides it.
	function phase37AnswerHtml(question, askResponse, doc) {
	  if (typeof formatProfessionalAnswerPhase34 === 'function') {
	    return formatProfessionalAnswerPhase34(question, askResponse, doc);
	  }

	  const answer = String(askResponse.answer || 'No answer generated.');
	  return '<div class="answer-card"><h2>Answer</h2><p>' + htmlEscapePhase37(answer) + '</p></div>';
	}

	// Escape text for insertion into HTML. '&' is replaced first so the
	// entities produced by the later replacements are not escaped again.
	function htmlEscapePhase37(value) {
	  return String(value || '')
	    .replaceAll('&', '&amp;')
	    .replaceAll('<', '&lt;')
	    .replaceAll('>', '&gt;')
	    .replaceAll('\"', '&quot;')
	    .replaceAll("'", '&#39;');
	}

	window.sendMessage = async function() {
	  const doc = getSelectedDocument();
	  // typeof guard: referencing an undefined global directly would throw.
	  const compareDoc = typeof getCompareDocument === 'function' ? getCompareDocument() : null;
	  const input = document.getElementById('messageInput');
	  const userText = input.value.trim();

	  if (!doc) {
	    alert('Upload or select a document first.');
	    return;
	  }

	  if (!userText) return;

	  // No comparison document selected: fall back to the single-document flow.
	  if (!compareDoc) {
	    if (typeof window.phase37OriginalSendMessage === 'function') {
	      return window.phase37OriginalSendMessage();
	    }
	    return;
	  }

	  const convo = getConversation();

	  convo.push({
	    role: 'user',
	    content: userText,
	    createdAt: new Date().toISOString()
	  });

	  input.value = '';
	  saveConversations();
	  renderMessages();

	  setStatus('Comparing documents...');
	  document.getElementById('metricsBox').innerHTML = '';

	  try {
	    const payload = {
	      primary_document_id: doc.id,
	      compare_document_id: compareDoc.id,
	      query: userText,
	      retrieval_mode: 'hybrid',
	      top_k: 8,
	      use_reranker: document.getElementById('useReranker').checked,
	      use_llm: document.getElementById('useLLM').checked,
	      use_graph: document.getElementById('useGraph').checked,
	      graph_entity_limit: 12,
	      use_graph_retrieval: document.getElementById('useGraphRetrieval').checked,
	      graph_retrieval_top_k: 6,
	      answer_style: document.getElementById('answerStyle')?.value || 'comparison'
	    };

	    const response = await fetch('/documents/compare', {
	      method: 'POST',
	      headers: {
	        'Content-Type': 'application/json'
	      },
	      body: JSON.stringify(payload)
	    });

	    const data = await response.json();

	    if (!response.ok) {
	      throw new Error(JSON.stringify(data));
	    }

	    // Prefer the full /ask response; otherwise rebuild a minimal one.
	    const dataA = data.document_a?.ask_response || {
	      answer: data.document_a?.answer || 'No answer from first document.',
	      citations: data.document_a?.sources || []
	    };

	    const dataB = data.document_b?.ask_response || {
	      answer: data.document_b?.answer || 'No answer from second document.',
	      citations: data.document_b?.sources || []
	    };

	    const answerAHtml = phase37AnswerHtml(userText, dataA, doc);
	    const answerBHtml = phase37AnswerHtml(userText, dataB, compareDoc);

	    convo.push({
	      role: 'assistant',
	      type: 'compare',
	      question: userText,
	      docAName: doc.name || 'Document A',
	      docBName: compareDoc.name || 'Document B',
	      answerA: data.document_a?.answer || 'No answer from first document.',
	      answerB: data.document_b?.answer || 'No answer from second document.',
	      answerAHtml,
	      answerBHtml,
	      comparisonSummary: data.comparison_summary || '',
	      rawCompare: data,
	      createdAt: new Date().toISOString()
	    });

	    saveConversations();
	    renderMessages();

	    if (typeof updateMetrics === 'function') {
	      updateMetrics(dataA, doc.name || 'Document A');
	      updateMetrics(dataB, compareDoc.name || 'Document B');
	    }

	    if (typeof updateCitations === 'function' && typeof buildSources === 'function') {
	      updateCitations([
	        { label: doc.name || 'Document A', sources: buildSources(dataA, doc) },
	        { label: compareDoc.name || 'Document B', sources: buildSources(dataB, compareDoc) }
	      ]);
	    }

	    setStatus('Backend comparison ready');

	  } catch (error) {
	    convo.push({
	      role: 'assistant',
	      content: 'Comparison error: ' + error.message,
	      createdAt: new Date().toISOString()
	    });

	    saveConversations();
	    renderMessages();
	    setStatus('Comparison error');
	  }
	}
	</script>
	"""

	    if "Phase 37:" not in html:
	        html = html.replace("</body>", js + "\n</body>")

	    return html
	'''

	# Append the override only once; the banner comment at the top of
	# append_code doubles as the "already applied" marker.
	override_marker = "Phase 37 override: use backend /documents/compare"

	if override_marker in ui:
	    print("Phase 37 UI override already exists.")
	else:
	    ui += "\n\n" + append_code
	    print("Phase 37 UI backend compare override added.")

	# Written in both cases, so the BOM-free text is what ends up on disk.
	hf_path.write_text(ui, encoding="utf-8")

	print("Phase 37 backend compare endpoint added.")