Spaces:

TheQuantEd
/

CTA

Running

App Files Files Community

CTA / backend /graphrag.py

TheQuantEd

Fix: Neo4j 5.26.0 (APOC available) + correct graphrag schema from seeder

b40cc1f 4 days ago

raw

history blame contribute delete

5.95 kB

	from langchain_community.graphs import Neo4jGraph
	from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain
	from langchain_openai import ChatOpenAI
	from langchain_core.prompts import PromptTemplate
	from langchain_core.messages import BaseMessage, AIMessage
	from langchain_core.outputs import ChatResult, ChatGeneration
	import re
	import os
	from dotenv import load_dotenv

	# Pull settings from a local .env file (if any) into the process environment
	# before the os.getenv() calls further down read them.
	load_dotenv()

	# Lazily initialised — Neo4j may not be ready at import time.
	# Both start as None and are filled in on first use by _get_graph() / _get_chain().
	_graph = None
	_graph_chain = None


	# Matches one complete <think>...</think> or <thinking>...</thinking> block.
	# DOTALL lets the body span newlines; IGNORECASE accepts any tag casing.
	_THINK_BLOCK_RE = re.compile(
	    r"<think(?:ing)?>.*?</think(?:ing)?>",
	    re.DOTALL | re.IGNORECASE,
	)


	def _strip_thinking(text: str) -> str:
	    """Remove model "thinking" blocks from *text* and trim surrounding whitespace.

	    Every complete ``<think>...</think>`` / ``<thinking>...</thinking>`` span is
	    deleted (non-greedy, so separate blocks are removed individually and the
	    text between them is kept). An opening tag with no closing tag is left
	    untouched.

	    Args:
	        text: Raw model output.

	    Returns:
	        The text without thinking blocks, stripped of leading/trailing whitespace.
	    """
	    return _THINK_BLOCK_RE.sub("", text).strip()


	class _ThinkStrippedLLM(ChatOpenAI):
	    """ChatOpenAI subclass that removes <think>/<thinking> blocks from results.

	    Only ``_create_chat_result`` is overridden; everything else is inherited.
	    """

	    def _create_chat_result(self, response, generation_info=None) -> ChatResult:
	        """Build the parent's ChatResult, then strip thinking blocks from each message.

	        Args:
	            response: Raw provider response, passed straight to the parent.
	            generation_info: Optional generation metadata, passed straight to the parent.

	        Returns:
	            A ChatResult with the same ``llm_output`` and one generation per
	            parent generation, each with cleaned string content.
	        """
	        result: ChatResult = super()._create_chat_result(response, generation_info)
	        cleaned = []
	        for gen in result.generations:
	            message = gen.message
	            # Reuse the parent's message instead of building a bare AIMessage, so
	            # every field other than the text content is carried over unchanged.
	            # Non-string content is left alone: the regex only works on str.
	            if isinstance(message.content, str):
	                message.content = _strip_thinking(message.content)
	            # A fresh ChatGeneration is built (as the original did) so anything it
	            # derives from the message at construction reflects the cleaned content.
	            cleaned.append(
	                ChatGeneration(message=message, generation_info=gen.generation_info)
	            )
	        return ChatResult(generations=cleaned, llm_output=result.llm_output)


	def _get_llm():
	    """Create a new think-stripping chat model configured from the environment.

	    Reads OPENAI_MODEL (falling back to "qwen/qwen3-32b"), OPENAI_API_KEY and
	    OPENAI_BASE_URL; temperature is fixed at 0. A new instance is built on
	    every call.
	    """
	    settings = {
	        "model": os.getenv("OPENAI_MODEL", "qwen/qwen3-32b"),
	        "openai_api_key": os.getenv("OPENAI_API_KEY"),
	        "openai_api_base": os.getenv("OPENAI_BASE_URL"),
	        "temperature": 0,
	    }
	    return _ThinkStrippedLLM(**settings)


	def _get_graph():
	    """Return the shared Neo4jGraph, creating it on the first call.

	    Connection settings come from NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD and
	    NEO4J_DATABASE. If construction raises, the cached value stays None and the
	    next call tries again.
	    """
	    global _graph
	    if _graph is not None:
	        return _graph

	    # `or` rather than a getenv default: an empty-string variable also falls back.
	    # NOTE(review): the fallback password is hard-coded in source — confirm this
	    # is only ever used for local development.
	    connection = {
	        "url": os.getenv("NEO4J_URI") or "bolt://127.0.0.1:7687",
	        "username": os.getenv("NEO4J_USERNAME") or "neo4j",
	        "password": os.getenv("NEO4J_PASSWORD") or "clinicalmatch2024",
	        "database": os.getenv("NEO4J_DATABASE") or "neo4j",
	    }
	    _graph = Neo4jGraph(**connection)
	    return _graph


	# Prompt text used to turn a natural-language question into a Cypher query.
	# `{schema}` and `{question}` are the two placeholders filled in at run time;
	# braces that must reach the model literally (Cypher property maps) are doubled
	# as `{{ ... }}` so the template formatter does not treat them as placeholders.
	# The hand-written label/relationship/rules sections sit alongside the
	# `{schema}` placeholder and spell out exact property names for the model.
	_CYPHER_GENERATION_TEMPLATE = """You are an expert Neo4j Cypher query writer for a clinical trial matching system.

	Schema:
	{schema}

	Node labels and their exact property names:
	- Patient: id (e.g. "P_C50_000001"), name, age (integer), sex ("MALE"/"FEMALE"), ecog (integer 0-3),
	condition (lowercase, e.g. "breast cancer"), stage ("I"/"II"/"III"/"IV"),
	city, state, ethnicity, insurance, icd10_prefix,
	biomarkers (list of biomarker ids), medications (list of drug names),
	comorbidities (list), prior_chemo (boolean), prior_radiation (boolean),
	prior_surgery (boolean), prior_lines_of_therapy (integer), source
	- Trial: id (NCT id, e.g. "NCT04567890"), title, condition (lowercase), phase, status,
	brief_summary, eligibility_criteria, min_age, max_age, sex, enrollment,
	start_date, completion_date, sponsor, location_count, source
	- Diagnosis: code (ICD-10, e.g. "C50.919"), name (e.g. "Malignant neoplasm of breast"), source
	- Biomarker: id (e.g. "HER2_POS"), name (e.g. "HER2 Positive"), gene (e.g. "ERBB2"), loinc, source
	- Medication: rxcui, name, tty, generic_name, source
	- StudySite: facility, city, state, country, lat, lon, source
	- ConditionNode: name (e.g. "breast cancer")
	- Publication: pmid, title, journal, pub_date, authors, source

	Relationships:
	- (Patient)-[:ELIGIBLE_FOR {{score: float, matched_at: datetime}}]->(Trial)
	- (Patient)-[:HAS_DIAGNOSIS]->(Diagnosis)
	- (Patient)-[:HAS_BIOMARKER]->(Biomarker)
	- (Trial)-[:CONDUCTED_AT]->(StudySite)
	- (ConditionNode)-[:HAS_TRIAL]->(Trial)
	- (Diagnosis)-[:MAPS_TO_CONDITION]->(ConditionNode)
	- (Biomarker)-[:RELEVANT_TO]->(ConditionNode)
	- (Biomarker)-[:MAY_QUALIFY_FOR]->(Trial)
	- (Publication)-[:SUPPORTS_RESEARCH_ON]->(ConditionNode)

	Rules:
	- Biomarker lookups use the `id` property: `{{id: 'HER2_POS'}}`
	- Diagnosis lookups use `code` (not `id`): `{{code: 'C50.919'}}`
	- Medication lookups use `rxcui` or `name` (not `id`)
	- Condition lookups on Trial nodes use lowercase: `t.condition = 'breast cancer'`
	- Patient-to-trial eligibility: `(p:Patient)-[:ELIGIBLE_FOR]->(t:Trial)`
	- ecog property on Patient is `ecog` (integer), NOT `ecog_score`
	- Limit results to 25 unless asked for more

	Question: {question}
	Cypher query:"""

	# Prompt object handed to the Cypher QA chain; it declares the two variables
	# the template text expects.
	_CYPHER_PROMPT = PromptTemplate(
	    template=_CYPHER_GENERATION_TEMPLATE,
	    input_variables=["schema", "question"],
	)


	def _get_chain():
	    """Return the shared GraphCypherQAChain, building it on the first call.

	    The chain is assembled from a fresh LLM (``_get_llm``), the shared graph
	    (``_get_graph``) and the module's custom Cypher prompt. If building fails,
	    nothing is cached and the next call retries.
	    """
	    global _graph_chain
	    if _graph_chain is not None:
	        return _graph_chain

	    # Resolve the collaborators in the same order as before: LLM first, then graph.
	    llm = _get_llm()
	    graph = _get_graph()
	    _graph_chain = GraphCypherQAChain.from_llm(
	        llm=llm,
	        graph=graph,
	        cypher_prompt=_CYPHER_PROMPT,
	        verbose=True,
	        allow_dangerous_requests=True,
	    )
	    return _graph_chain


	def retrieve_patient_trial_matches(patient_id: str) -> list:
	    """Return the trials a patient is eligible for, joined with their diagnoses.

	    One row is produced per (diagnosis, trial) pair for the patient, with keys
	    ``patient``, ``diagnosis``, ``trial``, ``phase`` and ``condition``.

	    Args:
	        patient_id: Value of the Patient node's ``id`` property.

	    Returns:
	        The rows returned by the graph query, or ``[]`` if anything raises
	        (the error is printed, not propagated).
	    """
	    # ELIGIBLE_FOR runs Patient -> Trial and HAS_DIAGNOSIS runs Patient -> Diagnosis
	    # in this module's documented schema, so both edges hang off the patient.
	    # The previous pattern routed ELIGIBLE_FOR through Diagnosis, which that
	    # schema does not contain.
	    cypher = (
	        "MATCH (d:Diagnosis)<-[:HAS_DIAGNOSIS]-(p:Patient {id: $patient_id})"
	        "-[:ELIGIBLE_FOR]->(t:Trial) "
	        "RETURN p.id AS patient, d.name AS diagnosis, t.id AS trial, "
	        "t.phase AS phase, t.condition AS condition"
	    )
	    try:
	        # Bind patient_id as a query parameter instead of formatting it into the
	        # statement: quotes or Cypher fragments in the value cannot change the query.
	        return _get_graph().query(cypher, params={"patient_id": patient_id})
	    except Exception as e:
	        print(f"[graphrag] query error: {e}")
	        return []


	def rag_query(question: str) -> str:
	    """Answer a natural-language question through the graph QA chain.

	    Args:
	        question: The user's question.

	    Returns:
	        The chain's answer with thinking blocks removed; "No results found."
	        when the answer is missing or empty after cleaning; or a short error
	        message if the chain raises (exceptions are never propagated).
	    """
	    try:
	        # invoke() replaces the deprecated Chain.run(); the chain reads its input
	        # from the "query" key and puts the answer under "result".
	        output = _get_chain().invoke({"query": question})
	        raw = output.get("result") if isinstance(output, dict) else output
	        answer = _strip_thinking(str(raw)) if raw else ""
	        # Fall back when the answer is empty or was nothing but a thinking block.
	        return answer or "No results found."
	    except Exception as e:
	        err = str(e)
	        # Match <think> and <thinking> in any casing, the same tags
	        # _strip_thinking removes; these mean model chatter leaked into the Cypher.
	        if "<think" in err.lower() or "SyntaxError" in err:
	            return "The query model returned unexpected output. Please rephrase your question."
	        return f"Graph query error: {err}"


	def get_graph_stats() -> dict:
	    """Return node counts for Patient, Trial and Diagnosis plus a status string.

	    Returns:
	        A dict that always has ``patients``, ``trials``, ``diagnoses`` and
	        ``status``. On success ``status`` is "connected"; if the query raises,
	        the counts are 0 and ``status`` holds the error text.
	    """
	    from neo4j_setup import neo4j_conn

	    counts = {"patients": 0, "trials": 0, "diagnoses": 0}
	    try:
	        # OPTIONAL MATCH on the later labels: with a plain MATCH, a label that has
	        # no nodes yields zero rows once an earlier count is carried as a grouping
	        # key, so the whole result came back empty instead of reporting 0.
	        rows = neo4j_conn.run_query("""
	            MATCH (p:Patient) WITH count(p) AS patients
	            OPTIONAL MATCH (t:Trial) WITH patients, count(t) AS trials
	            OPTIONAL MATCH (d:Diagnosis) WITH patients, trials, count(d) AS diagnoses
	            RETURN patients, trials, diagnoses
	        """)
	        first_row = rows[0] if rows else {}
	        # The zero defaults keep the key set stable even if no row comes back.
	        return {**counts, **first_row, "status": "connected"}
	    except Exception as e:
	        return {**counts, "status": str(e)}