| import os |
| import json |
| import tqdm |
| import argparse |
| import re |
| from openai import OpenAI |
|
|
| |
| |
| |
# Endpoint of a locally hosted, OpenAI-compatible inference server (vLLM-style).
API_URL = "http://172.16.34.29:8004/v1"
# Local servers usually ignore the key, but the client requires a non-empty string.
API_KEY = "EMPTY"
# Judge model served at API_URL; used for every chat-completion request below.
MODEL_NAME = "Qwen/Qwen3-30B-A3B-Instruct-2507"

# Single shared client; all requests in this script go through it.
client = OpenAI(base_url=API_URL, api_key=API_KEY)
|
|
| |
| |
| |
def get_audit_prompt(task_type: str, reference_text: str, subclaim: str, literacy_level: str) -> str:
    """Build the LLM-judge prompt for auditing one subclaim.

    Args:
        task_type: One of "attribution", "completeness", "conciseness",
            or "source_coverage"; selects which audit rubric is used.
        reference_text: Text the subclaim is judged against (source text,
            generated summary, or gold reference, depending on the task).
        subclaim: The atomic claim/fact to verify.
        literacy_level: Key into the literacy guidelines (e.g.
            "low_health_literacy"); unknown keys fall back to a generic
            guideline rather than failing.

    Returns:
        The fully formatted prompt string.

    Raises:
        ValueError: If ``task_type`` is not one of the four supported tasks.
            (The original implementation silently returned ``None`` here,
            which surfaced later as a confusing failure in the API call.)
    """
    level_guidelines = {
        "low_health_literacy": """
Level: Low Health Literacy (High Readability)
Target: Individuals needing simple terms.
Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
Density: Strictly 'need-to-know' info from Gold Summary.
Strategy: High paraphrasing, analogies, one idea per sentence.
Faithfulness: Must align with Gold Summary.""",

        "intermediate_health_literacy": """
Level: Intermediate Health Literacy (Medium Readability)
Target: General public.
Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
Strategy: Moderate paraphrasing. Remove minor technical details.
Faithfulness: Maintain main narrative of Gold Summary.""",

        "proficient_health_literacy": """
Level: Proficient Health Literacy (Low Readability)
Target: Researchers/Clinicians.
Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
Density: High. Include data, physiological mechanisms, and statistics from Source.
Strategy: Minimal paraphrasing. Retain original technical terminology.
Faithfulness: Adhere to Source Text; add deeper scientific context.""",
    }

    # Unknown levels degrade to a generic instruction instead of raising,
    # so a typo in the eval file cannot abort a long refinement run.
    guidelines = level_guidelines.get(literacy_level, "Follow standard medical audit practices.")
    level_desc = literacy_level.replace("_", " ")

    base_instructions = f"""
### Literacy Level Context:
{guidelines}

### Task Instructions:"""

    if task_type == "attribution":
        # Is the summary's subclaim grounded in the original source text?
        return f"""{base_instructions}
1. Compare the Subclaim against the Source Text.
2. Mark 'supported' ONLY IF:
- The Source Text explicitly states the claim, OR
- The claim is clearly conveyed through a faithful paraphrase that preserves its meaning.
3. Do NOT infer support from silence, omission, or related but non-equivalent statements.
4. For negative or exclusionary claims (e.g., "no complications," "no family history," "absence of signs"),
the Source Text must explicitly indicate absence.
5. Mark 'not_supported' if:
- The claim is missing, OR
- The Source discusses a related concept but does not confirm the specific claim.

SOURCE: {reference_text}
SUBCLAIM: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    elif task_type == "completeness":
        # Is a gold-standard fact covered by the generated summary?
        return f"""{base_instructions}
1. Determine whether this Fact from the Gold Standard is covered in the {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The fact is explicitly stated in the summary, OR
- The fact is clearly paraphrased or simplified in a way that preserves its meaning.
3. Do NOT mark 'supported' based solely on omission.
- Absence of mention does NOT imply intentional exclusion.
- Negative or exclusionary facts (e.g., "no complications," "no family history," "no systemic signs") must be explicitly conveyed.
4. Mark 'not_supported' if:
- The fact is completely omitted, OR
- The summary discusses related information but does not confirm the specific fact.
5. Literacy-based simplification is allowed, but factual meaning must be preserved.

SUMMARY: {reference_text}
FACT: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    elif task_type == "conciseness":
        # Is an addition (present in summary, absent from gold) acceptable?
        return f"""{base_instructions}
1. The Subclaim appears in the summary but NOT in the Gold Reference.
2. Determine whether this addition is acceptable.
3. Mark 'supported' ONLY IF:
- The information is a definition, clarification, or explanatory restatement
of concepts already present in the Gold Reference, AND
- It does NOT introduce new clinical findings, test results, diagnoses,
causes, outcomes, or exclusions.
4. Do NOT mark 'supported' if the Subclaim:
- Adds a new medical fact not found in the Gold Reference, OR
- Draws clinical conclusions or inferences beyond what the source states.
5. Literacy-based explanation is allowed, but factual content must remain unchanged.

REFERENCE: {reference_text}
SUBCLAIM: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    elif task_type == "source_coverage":
        # Is a fact from the original source reflected in the generated summary?
        return f"""{base_instructions}
1. Check whether the following Fact from the ORIGINAL Source Text is explicitly covered in the generated {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The summary clearly states the fact, OR
- The fact is conveyed through an explicit paraphrase or simplification that preserves its meaning.
3. Do NOT infer support from silence or omission.
- Absence of mention does NOT count as support.
- Especially for negative or exclusionary facts (e.g., "no family history," "no extra-renal signs," "no complications"), the summary must explicitly indicate absence.
4. Mark 'not_supported' if:
- The summary omits the fact entirely, OR
- The summary discusses related topics but does not clearly confirm the specific fact.
5. Simplification for literacy level is allowed, but factual meaning must be preserved.

GENERATED SUMMARY: {reference_text}
SOURCE FACT: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    # Fail loudly instead of falling through and returning None (the old
    # behavior), which produced an opaque error deep inside the API call.
    raise ValueError(
        f"Unknown task_type {task_type!r}; expected one of: "
        "'attribution', 'completeness', 'conciseness', 'source_coverage'"
    )
|
|
| |
| |
| |
def get_reasoned_verdict(reference: str, statement: str, task_type: str, literacy_level: str):
    """Ask the judge LLM to re-audit one subclaim and parse its verdict.

    Args:
        reference: Text the statement is judged against.
        statement: The subclaim/fact under audit.
        task_type: Audit rubric name, forwarded to ``get_audit_prompt``.
        literacy_level: Literacy-level key, forwarded to ``get_audit_prompt``.

    Returns:
        ``(reasoning, label)`` where label is "supported" or "not_supported".
        Any failure — API error, malformed response, bad task type — degrades
        to ``("API Error", "not_supported")`` so one bad item cannot abort a
        long refinement run.
    """
    try:
        # Built inside the try so prompt errors hit the same fallback as
        # API errors (originally it sat outside and could crash the run).
        prompt = get_audit_prompt(task_type, reference, statement, literacy_level)
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,  # near-deterministic judging
        )
        content = response.choices[0].message.content

        # Null-safe reasoning extraction.  The original chained .group(1)
        # straight onto re.search(), which raised AttributeError (and threw
        # away a valid verdict) whenever the model opened <reasoning> but
        # never closed it.
        match = re.search(r"<reasoning>(.*?)</reasoning>", content, re.DOTALL)
        reasoning = match.group(1).strip() if match else "N/A"

        # The verdict is whatever follows the reasoning block.
        final_text = content.split("</reasoning>")[-1].lower()

        # "not_supported" contains "supported" as a substring, so its
        # presence must veto a "supported" label.
        if "supported" in final_text and "not_supported" not in final_text:
            label = "supported"
        else:
            label = "not_supported"
        return reasoning, label
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate and the run can be cancelled cleanly.
        return "API Error", "not_supported"
|
|
| |
| |
| |
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_file", type=str, default="/home/mshahidul/readctrl/data/factual_testing/full_details_evaluation_0_20_qwen3-32B_v2.json")
    parser.add_argument("--source_file", type=str, default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json")
    parser.add_argument("--save_path", type=str, default="/home/mshahidul/readctrl/data/reasoning/")
    args = parser.parse_args()

    os.makedirs(args.save_path, exist_ok=True)

    with open(args.eval_file, "r") as f:
        eval_data = json.load(f)
    # Index source records by 'index' for O(1) joins with the eval file.
    with open(args.source_file, "r") as f:
        source_data = {item['index']: item for item in json.load(f)}

    for doc in tqdm.tqdm(eval_data):
        idx = doc['index']
        original = source_data.get(idx, {})

        for level, content in doc['literacy_levels'].items():
            details = content['details']
            # Generated summary for this literacy level (empty if missing).
            gen_text = original.get('diff_label_texts', {}).get(level, '')

            # Each audit type pairs a different reference text with a
            # different claim field; only items the first pass marked
            # "not_supported" are re-judged.
            audits = [
                # (details key, reference text, claim field)
                ('attribution', original.get('fulltext'), 'subclaim'),
                ('conciseness', original.get('summary'), 'subclaim'),
                ('completeness', gen_text, 'source_fact'),
                ('source_coverage', gen_text, 'source_subclaim'),
            ]
            for task, reference, claim_field in audits:
                for item in details.get(task, []):
                    if item['status'] == "not_supported":
                        res, lbl = get_reasoned_verdict(reference, item[claim_field], task, level)
                        item.update({"reasoning": res, "status": lbl, "refined": True})

            # Recompute per-metric scores from the refined statuses.
            # FIX: the attribution score key is 'factual_attribution' while
            # the details key is 'attribution'; the original looked up
            # 'factual_attribution' in `details`, so attribution scores were
            # never refreshed after refinement.  (Score-key names assumed
            # from the original metrics list — verify against eval schema.)
            score_to_details = {
                'factual_attribution': 'attribution',
                'conciseness': 'conciseness',
                'completeness': 'completeness',
                'source_coverage': 'source_coverage',
            }
            for score_key, details_key in score_to_details.items():
                if details_key in details:
                    items = details[details_key]
                    content['scores'][score_key] = (
                        sum(1 for x in items if x['status'] == 'supported') / len(items)
                        if items else 0
                    )

    save_path = os.path.join(args.save_path, f"REFINED_{os.path.basename(args.eval_file)}")
    with open(save_path, "w") as f:
        json.dump(eval_data, f, indent=2)
    print(f"Refinement complete. Saved to {save_path}")