# File: readctrl/code/finetune-inference/old/attribution_reasoning.py
# Uploader: shahidul034
# Commit message: Add files using upload-large-folder tool
# Commit: 9c6961c (verified)
import json
import sys
from openai import OpenAI
import ast,os
# ===========================
# CONFIGURATION
# ===========================
# Value sent as the `model` field of every chat-completion request
# (it looks like a filesystem path to merged fine-tuned weights).
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged"

# Base URL and API key handed to the OpenAI client below.
VLLM_API_URL = "http://localhost:8004/v1"
VLLM_API_KEY = "EMPTY"

# Module-level client shared by every request issued from this file.
client = OpenAI(api_key=VLLM_API_KEY, base_url=VLLM_API_URL)
# ===========================
# INFERENCE FUNCTION
# ===========================
def _parse_model_output(output_text):
    """Convert the raw completion text into a Python object when possible.

    Text up to and including the last ``</think>`` tag is discarded, Markdown
    code fences are removed, and the remainder is parsed as JSON. If that
    fails, a Python-literal parse is attempted so replies written with single
    quotes are still accepted.

    Returns:
        The parsed object on success; otherwise the text that follows the
        last ``</think>`` tag (or the full text when no tag is present).
        ``None`` is returned unchanged.
    """
    if output_text is None:
        # The completion carried no content; there is nothing to parse.
        return None

    if "</think>" in output_text:
        # Keep only what follows the LAST closing tag, so a tag that is
        # repeated inside the reasoning does not cut the answer off.
        output_text = output_text.rsplit("</think>", 1)[1]

    clean_text = output_text.strip().replace("```json", "").replace("```", "").strip()

    # Strict JSON first: it understands true/false/null, which
    # ast.literal_eval rejects.
    try:
        return json.loads(clean_text)
    except (ValueError, RecursionError):
        pass

    # Fallback for Python-literal style replies (e.g. single-quoted keys).
    try:
        return ast.literal_eval(clean_text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best effort: hand the unparsed text back to the caller.
        return output_text


def infer_reasonableness(
    fulltext: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """
    Ask the model served behind ``client`` whether a subclaim is a reasonable
    addition for the given readability level.

    Args:
        fulltext: Reference text inserted under "Reference Full Text".
        generated_summary: Summary inserted under "Generated Summary".
        readability_level: Level name inserted under "Readability Level".
        subclaim_text: The subclaim being judged.
        result: Support flag shown in the prompt (1 = supported, 0 = unsupported).

    Returns:
        The parsed model reply when it can be read as JSON or as a Python
        literal; otherwise the reply text as a string (``None`` if the
        completion had no content).

    Raises:
        Errors raised by the request itself (connection, validation) are not
        caught here and propagate to the caller. Only parse failures are
        absorbed, by falling back to the raw text.
    """
    # NOTE(review): the TASK INSTRUCTIONS section lists the labels
    # "reasonable addition" / "unnecessary but harmless" /
    # "misleading / hallucinated", while the Output Format section lists
    # reasonable / partially_reasonable / unreasonable. The prompt text is left
    # untouched because it is the model input - confirm which label set the
    # served model expects before editing it.
    # ---- Build inference prompt ----
    prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**
You are a **medical factuality and attribution evaluator**.
You will assess whether the **unsupported subclaim** in a generated summary (when `"result": 0"`) is a *reasonable addition* given the readability level (*easy / intermediate / hard*).
The goal is to decide whether this **extra piece of information** is an acceptable simplification or a *hallucination* that reduces factual faithfulness.
---
### **READABILITY & ATTRIBUTION GUIDELINES**
| Level | Audience | Linguistic & Stylistic Profile | Content Goal | Allowable Additions |
| :-- | :-- | :-- | :-- | :-- |
| **Easy (FH 70–100, grade 5–7)** | General public; early secondary readers | Short, direct sentences using common vocabulary and concrete ideas. Avoid subordinate clauses and technical terms. Tone should be explanatory, lively, and highly accessible. | Simplify and clarify events and outcomes without introducing technical or diagnostic details. | General background context or plain-language explanations are acceptable; **no new facts, data, or inferred medical claims.** |
| **Intermediate (FH 50–69, grade 8–12)** | Educated layperson / medical student | Moderate sentence length and complexity. Vocabulary suitable for high-school or introductory science readers. May include limited domain terms with brief clarification. | Present essential medical content with clear logic and limited detail, ensuring readability for non-experts. | Brief clarifications, definitions, or causal links consistent with the source are allowed; **avoid speculative or unconfirmed data.** |
| **Hard (FH 0–49, university / professional)** | Medical professionals / technical audience | Long, multi-clause sentences; formal academic tone. Incorporate precise domain vocabulary, causal and analytical connectors (e.g., *por consiguiente*, *sin embargo*, *en virtud de*, *dado que*), at least one definition, one process description, and one statement of implications or challenges. | Preserve full factual accuracy, diagnostic precision, and interpretive nuance expected in professional discourse. | Additions are **not permitted**; every statement must be directly supported by the reference text. Parenthetical clarifications or relative clauses may be used for cohesion, not new content. |
---
### **Input**
```
Readability Level: {readability_level}
Reference Full Text:
{fulltext}
Generated Summary:
{generated_summary}
Subclaim: "{subclaim_text}"
Result: {result} # 1 = supported (included), 0 = unsupported
```
---
### **TASK INSTRUCTIONS**
If `"result": 0"`, judge whether including this subclaim is **reasonable** for the given readability level.
Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
Provide a **1–2 sentence justification** describing your reasoning.
---
### **Output Format**
Return structured JSON:
```json
{{
"evaluation": {{
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short explanation>"
}}
}}
```
""".strip()
    messages = [{"role": "user", "content": prompt}]

    # ---- Call vLLM Server ----
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )
    output_text = response.choices[0].message.content

    # ---- Clean output (thinking block, Markdown fences) and parse ----
    return _parse_model_output(output_text)
# ===========================
# MAIN EXECUTION
# ===========================
if __name__ == "__main__":
    # Batch driver: for every dataset item, judge each unsupported subclaim of
    # the easy / intermediate / hard summaries and store the results in a JSON
    # file that is also used to resume an interrupted run.
    import argparse

    import tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    args = parser.parse_args()
    data_path = args.data_path
    file_name = os.path.basename(data_path)

    # Explicit UTF-8: results are written with ensure_ascii=False, so relying
    # on the locale's default encoding could fail or garble non-ASCII text.
    # A missing input file raises FileNotFoundError.
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    output_dir = '/home/mshahidul/readctrl/data/attribution_reasoning_result/'
    os.makedirs(output_dir, exist_ok=True)
    # The output file reuses the input file's base name.
    save_path = os.path.join(output_dir, file_name)

    # Resume support: start from whatever an earlier run already saved.
    full_results = []
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            full_results = json.load(f)

    # Ids already processed, kept in a set so the skip check does not rescan
    # the whole result list for every item.
    # Assumes ids are hashable (str / int) - TODO confirm against the data.
    done_ids = {entry['id'] for entry in full_results}

    def save_results():
        """Write all results to ``save_path`` through a temporary file."""
        # Writing to a sibling file and swapping it in keeps the previous
        # checkpoint intact if the process is interrupted mid-write.
        tmp_path = save_path + '.tmp'
        with open(tmp_path, 'w', encoding='utf-8') as out:
            json.dump(full_results, out, indent=2, ensure_ascii=False)
        os.replace(tmp_path, save_path)

    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue
        fulltext = item['fulltext']
        per_level = {}
        for label in ['easy', 'intermediate', 'hard']:
            generated_summary = item[f'{label}_text']
            subclaim_list = item['metrics'][f'{label}']['attribution']['details']
            level_results = []
            for subclaim in subclaim_list:
                # Each entry is a dict; its 'label' field holds the support
                # status and 'subclaim' holds the claim text.
                result = 1 if subclaim['label'] == 'supported' else 0
                if result == 0:
                    # Only unsupported subclaims are sent to the model.
                    output = infer_reasonableness(
                        fulltext=fulltext,
                        generated_summary=generated_summary,
                        readability_level=label,
                        subclaim_text=subclaim['subclaim'],
                        result=result,
                    )
                    level_results.append({
                        'subclaim': subclaim['subclaim'],
                        'output': output
                    })
                else:
                    # Supported subclaims get a fixed verdict without a model
                    # call.
                    # NOTE(review): this dict is flat, whereas the prompt asks
                    # the model to nest its answer under "evaluation" -
                    # confirm that downstream readers handle both shapes.
                    level_results.append({
                        'subclaim': subclaim['subclaim'],
                        'output': {
                            'reasonableness': 'reasonable',
                            'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                        }
                    })
            per_level[label] = {
                'results': level_results
            }
        # NOTE(review): the key is 'completeness' although the inputs are the
        # attribution details; kept unchanged so existing result files and
        # consumers keep working - confirm whether the name is intended.
        full_results.append({
            'id': item['id'],
            'completeness': per_level
        })
        done_ids.add(item['id'])
        # Checkpoint every 10 stored items.
        if len(full_results) % 10 == 0:
            save_results()

    # Final save once every item has been handled.
    save_results()