# readCtrl_lambda / code /reasoning /ressoning_qwen3-30B-a3b_v5.py
# mshahidul
# Initial commit of readCtrl code without large models
# 030876e
import argparse
import json
import logging
import os
import re

import tqdm
from openai import OpenAI
# -----------------------------
# CONFIGURATION
# -----------------------------
# Base URL handed to the OpenAI client; every chat-completion request goes here.
API_URL = "http://172.16.34.29:8004/v1"
# Placeholder key; presumably the endpoint does not validate API keys — TODO confirm.
API_KEY = "EMPTY"
# Model identifier sent with every chat-completion request.
MODEL_NAME = "Qwen/Qwen3-30B-A3B-Instruct-2507"
# Shared client, created at import time and used by get_reasoned_verdict.
client = OpenAI(base_url=API_URL, api_key=API_KEY)
# -----------------------------
# REASONING PROMPTS
# -----------------------------
def get_audit_prompt(task_type, reference_text, subclaim, literacy_level):
    """Build the LLM audit prompt for one (reference, statement) pair.

    Args:
        task_type: One of "attribution", "completeness", "conciseness" or
            "source_coverage". Selects the task instructions and the labels
            under which the reference text and the statement are presented.
        reference_text: Text the statement is judged against; interpolated
            into the prompt as-is.
        subclaim: The statement under audit; interpolated as-is.
        literacy_level: Key into the built-in literacy guidelines, e.g.
            "low_health_literacy". Unknown levels fall back to a generic
            one-line guideline; underscores are replaced by spaces when the
            level is mentioned inside the instructions.

    Returns:
        The complete prompt string.

    Raises:
        ValueError: If ``task_type`` is not one of the supported tasks.
            (Previously the function silently returned ``None``, which was
            then sent to the model as the prompt.)
    """
    level_guidelines = {
        "low_health_literacy": (
            "\n"
            "Level: Low Health Literacy (High Readability)\n"
            "Target: Individuals needing simple terms.\n"
            "Goal: 'Living room' language. "
            "Replace jargon (e.g., 'renal' -> 'kidney').\n"
            "Density: Strictly 'need-to-know' info from Gold Summary.\n"
            "Strategy: High paraphrasing, analogies, one idea per sentence.\n"
            "Faithfulness: Must align with Gold Summary."
        ),
        "intermediate_health_literacy": (
            "\n"
            "Level: Intermediate Health Literacy (Medium Readability)\n"
            "Target: General public.\n"
            "Goal: Standard vocabulary. Common medical terms okay; "
            "technical speak simplified.\n"
            "Density: Balanced. Use Gold Summary as lead, "
            "supplemented by context from Source.\n"
            "Strategy: Moderate paraphrasing. Remove minor technical details.\n"
            "Faithfulness: Maintain main narrative of Gold Summary."
        ),
        "proficient_health_literacy": (
            "\n"
            "Level: Proficient Health Literacy (Low Readability)\n"
            "Target: Researchers/Clinicians.\n"
            "Goal: Technical/Academic. "
            "Prioritize clinical nuance and accuracy.\n"
            "Density: High. Include data, physiological mechanisms, "
            "and statistics from Source.\n"
            "Strategy: Minimal paraphrasing. "
            "Retain original technical terminology.\n"
            "Faithfulness: Adhere to Source Text; "
            "add deeper scientific context."
        ),
    }
    guidelines = level_guidelines.get(
        literacy_level, "Follow standard medical audit practices."
    )
    level_desc = literacy_level.replace("_", " ")

    base_instructions = (
        "\n### Literacy Level Context:\n"
        f"{guidelines}\n"
        "### Task Instructions:"
    )
    # Every task ends with the same answer-format request; the parser in
    # this file relies on the <reasoning> tags and the two label words.
    verdict_request = (
        "Provide reasoning in <reasoning> tags, "
        "then output: 'supported' or 'not_supported'."
    )

    if task_type == "attribution":
        task_lines = [
            "1. Compare the Subclaim against the Source Text.",
            (
                "2. Flag as 'supported' if the Source contains this claim, "
                f"even if highly paraphrased for {level_desc}."
            ),
            f"SOURCE: {reference_text}",
            f"SUBCLAIM: {subclaim}",
        ]
    elif task_type == "completeness":
        task_lines = [
            (
                "1. Is this Fact from the Gold Standard missing from the "
                f"{level_desc} summary?"
            ),
            (
                "2. Mark 'supported' if: The info is present (paraphrased) "
                "OR if the info was omitted because it is too complex for "
                f"{level_desc} guidelines."
            ),
            f"SUMMARY: {reference_text}",
            f"FACT: {subclaim}",
        ]
    elif task_type == "conciseness":
        task_lines = [
            (
                "1. The Subclaim exists in the summary but NOT in the Gold "
                "Reference. Is this okay?"
            ),
            (
                "2. Mark 'supported' if: The info adds necessary definitions "
                f"or scientific depth appropriate for {level_desc}."
            ),
            f"REFERENCE: {reference_text}",
            f"SUBCLAIM: {subclaim}",
        ]
    elif task_type == "source_coverage":
        task_lines = [
            (
                "1. Check if the following Fact from the ORIGINAL Source Text "
                f"is covered in the generated {level_desc} summary."
            ),
            (
                "2. Mark 'supported' if the summary includes this "
                "information, even if it is simplified or combined with "
                "other points."
            ),
            (
                "3. Mark 'not_supported' if the summary completely omits "
                "this specific medical fact."
            ),
            f"GENERATED SUMMARY: {reference_text}",
            f"SOURCE FACT: {subclaim}",
        ]
    else:
        # Fail loudly instead of handing None to the chat API.
        raise ValueError(f"Unknown task_type: {task_type!r}")

    return "\n".join([base_instructions, *task_lines, verdict_request])
# -----------------------------
# LOGIC
# -----------------------------
# Captures the text between the reasoning tags, across newlines.
_REASONING_RE = re.compile(r"<reasoning>(.*?)</reasoning>", re.DOTALL)
# Negative verdicts in the spellings a model may emit: "not_supported",
# "not supported", "not-supported", "unsupported". Letter-only lookarounds
# keep markdown such as "_supported_" or "**supported**" matchable.
_NEGATIVE_VERDICT_RE = re.compile(
    r"(?<![a-z])(?:not[\s_-]*supported|unsupported)(?![a-z])"
)
_POSITIVE_VERDICT_RE = re.compile(r"(?<![a-z])supported(?![a-z])")


def _parse_verdict(content):
    """Split a raw model reply into ``(reasoning, label)``.

    The reasoning is the text inside the ``<reasoning>`` tags, or "N/A" when
    no complete tag pair is present. The label is read from the text after
    the last closing tag (the whole reply if there is none). Negative
    phrasings are checked first so "not supported" is never mistaken for a
    positive verdict; anything without a clear positive is "not_supported".
    """
    content = content or ""  # the API may return no text at all
    match = _REASONING_RE.search(content)
    reasoning = match.group(1).strip() if match else "N/A"

    final_text = content.split("</reasoning>")[-1].lower()
    if _NEGATIVE_VERDICT_RE.search(final_text):
        return reasoning, "not_supported"
    if _POSITIVE_VERDICT_RE.search(final_text):
        return reasoning, "supported"
    return reasoning, "not_supported"


def get_reasoned_verdict(reference, statement, task_type, literacy_level):
    """Ask the model to audit one statement and return its verdict.

    Args:
        reference: Text the statement is judged against.
        statement: The subclaim or fact under audit.
        task_type: Audit task name, forwarded to ``get_audit_prompt``.
        literacy_level: Literacy level key, forwarded to ``get_audit_prompt``.

    Returns:
        A ``(reasoning, label)`` tuple where ``label`` is "supported" or
        "not_supported". If the API call fails, returns
        ``("API Error", "not_supported")`` so a batch run keeps going.
    """
    prompt = get_audit_prompt(task_type, reference, statement, literacy_level)
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
        )
        content = response.choices[0].message.content
    except Exception:
        # Best-effort by design: one failed request must not abort the run.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate,
        # and the traceback is logged instead of being silently dropped.
        logging.getLogger(__name__).warning(
            "Audit request failed (task=%s, level=%s)",
            task_type,
            literacy_level,
            exc_info=True,
        )
        return "API Error", "not_supported"
    return _parse_verdict(content)
# -----------------------------
# MAIN PROCESSING
# -----------------------------
# Score name -> candidate keys in `details` holding the audited items, in
# lookup order. The audit below edits details['attribution'], while the score
# is stored as 'factual_attribution'; the same-named key is tried first to
# preserve prior behaviour.
# NOTE(review): assumes the attribution items live under 'attribution' and
# the score under 'factual_attribution' — confirm against the eval file schema.
_SCORE_DETAIL_KEYS = {
    "factual_attribution": ("factual_attribution", "attribution"),
    "conciseness": ("conciseness",),
}


def _audit_flagged_items(items, reference, task_type, level):
    """Re-audit, in place, every item currently marked 'not_supported'.

    Each re-audited item gets the model's reasoning, the new status and a
    ``refined`` marker. Items are left untouched when ``reference`` is
    missing or empty, since there is nothing to judge the claim against.
    """
    if not reference:
        return
    for item in items:
        if item['status'] != "not_supported":
            continue
        reasoning, label = get_reasoned_verdict(
            reference, item['subclaim'], task_type, level
        )
        item.update({"reasoning": reasoning, "status": label, "refined": True})


def _supported_ratio(items):
    """Return the fraction of items with status 'supported' (0 if empty)."""
    if not items:
        return 0
    return sum(1 for x in items if x['status'] == 'supported') / len(items)


def main():
    """Refine 'not_supported' attribution/conciseness verdicts and save.

    Loads the evaluation results and the source documents, re-audits flagged
    items with the model, recomputes the affected scores, and writes the
    updated results to ``REFINED_attr_concise_<eval file name>`` inside
    ``--save_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_file", type=str, default="/home/mshahidul/readctrl/data/reasoning/reasoned_updated_results_0_20.json")
    parser.add_argument("--source_file", type=str, default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json")
    parser.add_argument("--save_path", type=str, default="/home/mshahidul/readctrl/data/reasoning/")
    args = parser.parse_args()
    os.makedirs(args.save_path, exist_ok=True)

    with open(args.eval_file, "r", encoding="utf-8") as f:
        eval_data = json.load(f)
    with open(args.source_file, "r", encoding="utf-8") as f:
        # Index source documents by their 'index' field for O(1) lookup.
        source_data = {item['index']: item for item in json.load(f)}

    for doc in tqdm.tqdm(eval_data):
        # An eval doc without a matching source entry yields an empty dict,
        # so its audits are skipped rather than run against "None".
        original = source_data.get(doc['index'], {})
        for level, content in doc['literacy_levels'].items():
            details = content['details']

            # 1. Attribution: subclaims are checked against the full text.
            _audit_flagged_items(
                details.get('attribution', []),
                original.get('fulltext'),
                "attribution",
                level,
            )
            # 2. Conciseness: subclaims are checked against the summary.
            _audit_flagged_items(
                details.get('conciseness', []),
                original.get('summary'),
                "conciseness",
                level,
            )
            # NOTE: the completeness (item['source_fact']) and source-coverage
            # (item['source_subclaim']) audits, which compare against the
            # generated text original['diff_label_texts'][level], are
            # currently disabled.

            # Recalculate the scores affected by the audits above.
            for score_name, detail_keys in _SCORE_DETAIL_KEYS.items():
                detail_key = next((k for k in detail_keys if k in details), None)
                if detail_key is not None:
                    content['scores'][score_name] = _supported_ratio(
                        details[detail_key]
                    )

    save_path = os.path.join(args.save_path, f"REFINED_attr_concise_{os.path.basename(args.eval_file)}")
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(eval_data, f, indent=2)
    print(f"Refinement complete. Saved to {save_path}")


if __name__ == "__main__":
    main()