# File size: 7,933 Bytes
# 030876e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import os
import json
import tqdm
import argparse
import re
from openai import OpenAI

# -----------------------------
#  CONFIGURATION
# -----------------------------
# Base URL handed to the OpenAI client below (a private-network address).
# NOTE(review): presumably a self-hosted OpenAI-compatible server — confirm.
API_URL = "http://172.16.34.29:8004/v1" 
# Placeholder credential passed to the client.
# NOTE(review): presumably the endpoint does not validate API keys — confirm.
API_KEY = "EMPTY"
# Model identifier sent as `model=` on each chat completion request.
MODEL_NAME = "Qwen/Qwen3-30B-A3B-Instruct-2507" 

# Module-level client shared by every call to get_reasoned_verdict.
client = OpenAI(base_url=API_URL, api_key=API_KEY)

# -----------------------------
#  REASONING PROMPTS
# -----------------------------
def get_audit_prompt(task_type, reference_text, subclaim, literacy_level):
    """Assemble the audit prompt for one statement.

    The prompt consists of a header describing the target literacy level,
    three numbered task instructions, the reference text, the statement
    under review, and a closing line requesting the answer format.

    Args:
        task_type: 'attribution', 'completeness' or 'conciseness'.
        reference_text: Text the statement is judged against; interpolated
            into the prompt as-is.
        subclaim: The statement (subclaim or fact) being audited.
        literacy_level: Key selecting the level guidelines; unknown keys
            fall back to a generic one-line guideline. Underscores are
            replaced by spaces when the level is mentioned in the steps.

    Returns:
        The prompt string, or None when ``task_type`` is not recognised.
    """
    # Writing guidelines per literacy level, embedded verbatim in the header.
    profiles = {
        "low_health_literacy": """
        Level: Low Health Literacy (High Readability)
        Target: Individuals needing simple terms.
        Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
        Density: Strictly 'need-to-know' info from Gold Summary.
        Strategy: High paraphrasing, analogies, one idea per sentence.
        Faithfulness: Must align with Gold Summary.""",

        "intermediate_health_literacy": """
        Level: Intermediate Health Literacy (Medium Readability)
        Target: General public.
        Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
        Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
        Strategy: Moderate paraphrasing. Remove minor technical details.
        Faithfulness: Maintain main narrative of Gold Summary.""",

        "proficient_health_literacy": """
        Level: Proficient Health Literacy (Low Readability)
        Target: Researchers/Clinicians.
        Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
        Density: High. Include data, physiological mechanisms, and statistics from Source.
        Strategy: Minimal paraphrasing. Retain original technical terminology.
        Faithfulness: Adhere to Source Text; add deeper scientific context."""
    }

    profile = profiles.get(literacy_level, "Follow standard medical audit practices.")
    readable_level = literacy_level.replace("_", " ")

    # Header shared by every task; it starts with a blank line.
    header = "\n".join((
        "",
        "### Literacy Level Context:",
        profile,
        "",
        "### Task Instructions:",
    ))
    closing = "Provide reasoning in <think> tags, then output: 'supported' or 'not_supported'."

    # Pick the numbered steps and the labels used for the two text fields.
    if task_type == "attribution":
        steps = (
            "1. Compare the Subclaim against the Source Text.",
            f"2. Flag as 'supported' if the Source contains this claim, even if highly paraphrased for {readable_level}.",
            "3. Note: Proficient level summaries should be strictly accurate, while Low level summaries use analogies.",
        )
        reference_tag, statement_tag = "SOURCE", "SUBCLAIM"
    elif task_type == "completeness":
        steps = (
            f"1. Is this Fact from the Gold Standard missing from the {readable_level} summary?",
            f"2. Mark 'supported' if: The info is present (paraphrased) OR if the info was omitted because it is too complex/technical for the {readable_level} guidelines.",
            "3. Mark 'not_supported' ONLY if a critical safety fact or 'need-to-know' item is truly missing.",
        )
        reference_tag, statement_tag = "SUMMARY", "FACT"
    elif task_type == "conciseness":
        steps = (
            "1. The Subclaim exists in the summary but NOT in the Gold Reference. Is this okay?",
            "2. Mark 'supported' if: The info adds necessary definitions for Low/Intermediate readers, or adds scientific depth for Proficient readers.",
            "3. Mark 'not_supported' if: The info is a hallucination or irrelevant 'fluff' that violates the Information Density rules.",
        )
        reference_tag, statement_tag = "REFERENCE", "SUBCLAIM"
    else:
        # Unrecognised task: no prompt is produced.
        return None

    return "\n".join((
        header,
        *steps,
        f"{reference_tag}: {reference_text}",
        f"{statement_tag}: {subclaim}",
        closing,
    ))

# -----------------------------
#  LOGIC
# -----------------------------
def get_reasoned_verdict(reference, statement, task_type, literacy_level):
    """Ask the model to re-audit one statement and parse its verdict.

    The prompt is built with ``get_audit_prompt`` and sent as a single user
    message. The reply is expected to contain a reasoning block followed by
    the final label.

    Args:
        reference: Text the statement is judged against.
        statement: Subclaim or fact under review.
        task_type: Audit task name, forwarded to ``get_audit_prompt``.
        literacy_level: Literacy level key, forwarded to ``get_audit_prompt``.

    Returns:
        A ``(reasoning, label)`` tuple. ``reasoning`` is the text of the
        first reasoning block, or "N/A" when the reply has none. ``label`` is
        'supported' or 'not_supported'. If the request fails, the result is
        ``("API Error", "not_supported")``.
    """
    prompt = get_audit_prompt(task_type, reference, statement, literacy_level)
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
        )
        # The content may be None (e.g. an empty completion); treat as empty.
        content = response.choices[0].message.content or ""
    except Exception as exc:
        # Best-effort: a failed request must not abort the whole run, but it
        # should be visible. `Exception` (not a bare except) lets Ctrl-C and
        # SystemExit propagate.
        import logging

        logging.getLogger(__name__).warning(
            "Audit request failed for task %r: %s", task_type, exc
        )
        return "API Error", "not_supported"

    flags = re.DOTALL | re.IGNORECASE

    # The prompts request <think> tags; <reasoning> is accepted as well for
    # backward compatibility. A block cut off before its closing tag (e.g. a
    # truncated reply) still counts as reasoning thanks to the \Z fallback.
    block = re.search(r"<(think|reasoning)>(.*?)(?:</\1>|\Z)", content, flags)
    reasoning = block.group(2).strip() if block else "N/A"

    # The verdict is whatever follows the last closing tag ...
    final_text = re.split(r"</(?:think|reasoning)>", content, flags=re.IGNORECASE)[-1]
    # ... minus any unterminated reasoning block, so that words inside the
    # reasoning are never mistaken for the final answer.
    final_text = re.sub(r"<(?:think|reasoning)>.*", "", final_text, flags=flags).lower()

    # "not supported", "not-supported" and "unsupported" are negative answers
    # too, not only the exact token "not_supported".
    negative = re.search(r"not[\s_-]*supported|unsupported", final_text)
    label = "supported" if "supported" in final_text and not negative else "not_supported"
    return reasoning, label

# -----------------------------
#  MAIN PROCESSING
# -----------------------------
# Script entry point: re-audit every 'not_supported' item of an evaluation
# file with the reasoning model, recompute the per-level scores, and save the
# refined evaluation under --save_path as REFINED_<eval file name>.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Path to the output of the previous evaluation script
    parser.add_argument("--eval_file", type=str, default="/home/mshahidul/readctrl/data/factual_testing/full_details_evaluation_0_20_qwen3-32B.json")
    # Path to the original data file containing 'fulltext' and 'summary'
    parser.add_argument("--source_file", type=str, default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json")
    parser.add_argument("--save_path", type=str, default="/home/mshahidul/readctrl/data/reasoning/")
    args = parser.parse_args()

    os.makedirs(args.save_path, exist_ok=True)

    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(args.eval_file, "r", encoding="utf-8") as f:
        eval_data = json.load(f)
    with open(args.source_file, "r", encoding="utf-8") as f:
        source_data = {item['index']: item for item in json.load(f)}

    for doc in tqdm.tqdm(eval_data):
        # An empty dict stands in for documents missing from the source file.
        original = source_data.get(doc['index'], {})

        for level, content in doc['literacy_levels'].items():
            details = content['details']

            # One row per audit: (task name / details key, reference text the
            # statement is checked against, item key holding the statement).
            audits = (
                # Attribution: subclaim vs. full source text.
                ("attribution", original.get('fulltext'), 'subclaim'),
                # Conciseness: subclaim vs. reference summary.
                ("conciseness", original.get('summary'), 'subclaim'),
                # Completeness: reference fact vs. generated text for this level.
                ("completeness", original.get('diff_label_texts', {}).get(level, ''), 'source_fact'),
            )

            for task, reference, statement_key in audits:
                items = details[task]
                for item in items:
                    if item['status'] != "not_supported":
                        continue
                    if not reference:
                        # Without a reference text the model would be judging
                        # against "None"/empty input; keep the existing
                        # verdict instead of overwriting it with noise.
                        continue
                    res, lbl = get_reasoned_verdict(reference, item[statement_key], task, level)
                    item.update({"reasoning": res, "status": lbl, "refined": True})

                # Recalculate the score after refinement (0 for an empty list).
                supported = sum(1 for x in items if x['status'] == 'supported')
                content['scores'][task] = supported / len(items) if items else 0

    save_path = os.path.join(args.save_path, f"REFINED_{os.path.basename(args.eval_file)}")
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(eval_data, f, indent=2)
    print(f"Refinement complete. Saved to {save_path}")