import os
import json
import tqdm
import argparse
import re
from openai import OpenAI
# -----------------------------
# CONFIGURATION
# -----------------------------
# OpenAI-compatible endpoint of the judge model (NOTE(review): looks like a
# locally hosted server, e.g. vLLM — confirm the address before running).
API_URL = "http://172.16.34.29:8004/v1"
# Placeholder key; local OpenAI-compatible servers commonly ignore it.
API_KEY = "EMPTY"
# Instruction-tuned model used to re-audit previously 'not_supported' items.
MODEL_NAME = "Qwen/Qwen3-30B-A3B-Instruct-2507"
client = OpenAI(base_url=API_URL, api_key=API_KEY)
# -----------------------------
# REASONING PROMPTS
# -----------------------------
def get_audit_prompt(task_type, reference_text, subclaim, literacy_level):
    """Build the audit prompt for one (task, literacy level) combination.

    Args:
        task_type: One of "attribution", "completeness", or "conciseness".
        reference_text: Text the claim is judged against (full source,
            gold summary, or generated text depending on the task).
        subclaim: The single claim/fact to be verified.
        literacy_level: Key such as "low_health_literacy"; unknown keys
            fall back to generic audit guidance.

    Returns:
        The fully formatted prompt string.

    Raises:
        ValueError: If ``task_type`` is not one of the three supported
            tasks (previously this silently returned None, which crashed
            later inside the API call).
    """
    # Mapping the specific literacy guidelines to the prompt context
    level_guidelines = {
        "low_health_literacy": """
Level: Low Health Literacy (High Readability)
Target: Individuals needing simple terms.
Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
Density: Strictly 'need-to-know' info from Gold Summary.
Strategy: High paraphrasing, analogies, one idea per sentence.
Faithfulness: Must align with Gold Summary.""",
        "intermediate_health_literacy": """
Level: Intermediate Health Literacy (Medium Readability)
Target: General public.
Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
Strategy: Moderate paraphrasing. Remove minor technical details.
Faithfulness: Maintain main narrative of Gold Summary.""",
        "proficient_health_literacy": """
Level: Proficient Health Literacy (Low Readability)
Target: Researchers/Clinicians.
Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
Density: High. Include data, physiological mechanisms, and statistics from Source.
Strategy: Minimal paraphrasing. Retain original technical terminology.
Faithfulness: Adhere to Source Text; add deeper scientific context.""",
    }
    guidelines = level_guidelines.get(literacy_level, "Follow standard medical audit practices.")
    # Human-readable form used inside the prompts, e.g. "low health literacy".
    level_desc = literacy_level.replace("_", " ")
    # Base instructions for the reasoning model
    base_instructions = f"""
### Literacy Level Context:
{guidelines}
### Task Instructions:"""
    if task_type == "attribution":
        return f"""{base_instructions}
1. Compare the Subclaim against the Source Text.
2. Flag as 'supported' if the Source contains this claim, even if highly paraphrased for {level_desc}.
3. Note: Proficient level summaries should be strictly accurate, while Low level summaries use analogies.
SOURCE: {reference_text}
SUBCLAIM: {subclaim}
Provide reasoning in <think> tags, then output: 'supported' or 'not_supported'."""
    elif task_type == "completeness":
        return f"""{base_instructions}
1. Is this Fact from the Gold Standard missing from the {level_desc} summary?
2. Mark 'supported' if: The info is present (paraphrased) OR if the info was omitted because it is too complex/technical for the {level_desc} guidelines.
3. Mark 'not_supported' ONLY if a critical safety fact or 'need-to-know' item is truly missing.
SUMMARY: {reference_text}
FACT: {subclaim}
Provide reasoning in <think> tags, then output: 'supported' or 'not_supported'."""
    elif task_type == "conciseness":
        return f"""{base_instructions}
1. The Subclaim exists in the summary but NOT in the Gold Reference. Is this okay?
2. Mark 'supported' if: The info adds necessary definitions for Low/Intermediate readers, or adds scientific depth for Proficient readers.
3. Mark 'not_supported' if: The info is a hallucination or irrelevant 'fluff' that violates the Information Density rules.
REFERENCE: {reference_text}
SUBCLAIM: {subclaim}
Provide reasoning in <think> tags, then output: 'supported' or 'not_supported'."""
    raise ValueError(f"Unknown task_type: {task_type!r}")
# -----------------------------
# LOGIC
# -----------------------------
def get_reasoned_verdict(reference, statement, task_type, literacy_level):
    """Query the judge model and parse its reasoning and verdict.

    Args:
        reference: Reference text the statement is checked against.
        statement: Subclaim or fact under audit.
        task_type: "attribution", "completeness", or "conciseness".
        literacy_level: Literacy-level key forwarded to the prompt builder.

    Returns:
        Tuple ``(reasoning, label)`` where label is "supported" or
        "not_supported". Any API/parsing failure yields
        ("API Error", "not_supported") — failures are conservatively
        treated as unsupported.
    """
    prompt = get_audit_prompt(task_type, reference, statement, literacy_level)
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,  # near-deterministic verdicts
        )
        content = response.choices[0].message.content
        # The prompt asks for reasoning inside <think> tags; the old code
        # searched for <reasoning>, which the model was never instructed to
        # emit, so reasoning was always "N/A" and the verdict was parsed
        # from the whole response instead of the post-reasoning tail.
        match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
        reasoning = match.group(1).strip() if match else "N/A"
        # Only the text after the closing tag carries the final verdict;
        # if the tag is absent this falls back to the full response.
        final_text = content.split("</think>")[-1].lower()
        # Note: "not_supported" contains "supported" as a substring, so a
        # positive verdict requires the absence of "not_supported".
        label = "supported" if "supported" in final_text and "not_supported" not in final_text else "not_supported"
        return reasoning, label
    except Exception:
        # Narrowed from a bare `except:` — still best-effort, but no longer
        # swallows KeyboardInterrupt/SystemExit.
        return "API Error", "not_supported"
# -----------------------------
# MAIN PROCESSING
# -----------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Path to the output of your previous generation script
    parser.add_argument("--eval_file", type=str, default="/home/mshahidul/readctrl/data/factual_testing/full_details_evaluation_0_20_qwen3-32B.json")
    # Path to the original data file containing 'fulltext' and 'summary'
    parser.add_argument("--source_file", type=str, default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json")
    parser.add_argument("--save_path", type=str, default="/home/mshahidul/readctrl/data/reasoning/")
    args = parser.parse_args()
    os.makedirs(args.save_path, exist_ok=True)

    with open(args.eval_file, "r") as f:
        eval_data = json.load(f)
    # Index source documents by 'index' for O(1) joins with eval records.
    with open(args.source_file, "r") as f:
        source_data = {item['index']: item for item in json.load(f)}

    for doc in tqdm.tqdm(eval_data):
        original = source_data.get(doc['index'], {})
        for level, content in doc['literacy_levels'].items():
            details = content['details']
            # Each audit dimension re-checks previously 'not_supported'
            # items against a different reference text, and reads a
            # different key from each item:
            #   1. Attribution: generated subclaims vs. the full source text.
            #   2. Conciseness: extra subclaims vs. the gold reference summary.
            #   3. Completeness: gold facts vs. the generated text for this level.
            audit_plan = [
                ("attribution", original.get('fulltext'), 'subclaim'),
                ("conciseness", original.get('summary'), 'subclaim'),
                ("completeness", original.get('diff_label_texts', {}).get(level, ''), 'source_fact'),
            ]
            for task, reference, claim_key in audit_plan:
                items = details[task]
                for item in items:
                    if item['status'] == "not_supported":
                        res, lbl = get_reasoned_verdict(reference, item[claim_key], task, level)
                        item.update({"reasoning": res, "status": lbl, "refined": True})
                # Recalculate this dimension's score after refinement
                # (fraction of supported items; 0 when the list is empty).
                content['scores'][task] = (
                    sum(1 for x in items if x['status'] == 'supported') / len(items)
                    if items else 0
                )

    save_path = os.path.join(args.save_path, f"REFINED_{os.path.basename(args.eval_file)}")
    with open(save_path, "w") as f:
        json.dump(eval_data, f, indent=2)
    print(f"Refinement complete. Saved to {save_path}")