readctrl / code /key_subclaims_extract.py
shahidul034's picture
Add files using upload-large-folder tool
1db7196 verified
from openai import OpenAI
import json
import os
import tqdm
# --- 1. Load Paths and Data ---
# Hard-coded locations of the input dataset, the prompt template, the API-key
# file and the output file that the processing loop appends results to.
data_path = '/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json'
# NOTE(review): the file name below contains a space before "_v2" -- confirm
# that this matches the actual file on disk.
prompt_path = "/home/mshahidul/readctrl/prompts/minimum_info_extract _v2"
api_file = "/home/mshahidul/api_new.json"
save_path = "/home/mshahidul/readctrl/data/key_subclaims_testing/key_subclaims.json"

# Load the dataset.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Load the prompt template
with open(prompt_path, "r", encoding="utf-8") as f:
    prompt_template = f.read()

# Load API Key (the JSON file must contain an "openai" entry)
with open(api_file, "r", encoding="utf-8") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]
client = OpenAI(api_key=openai_api_key)
# --- 2. Helper Functions ---
def openai_return(prompt, model="gpt-5"):
    """Send a prompt to GPT and parse strictly formatted JSON.

    Args:
        prompt: User message sent to the chat completion endpoint.
        model: Name of the model to query.

    Returns:
        The decoded JSON object on success. On any failure (request error,
        missing message content, invalid JSON) a dict of the form
        ``{"error": <message>, "raw_content": <text or None>}`` is returned
        instead of raising, so the caller can record the failure and continue.
    """
    # Defined before the try block so the error path can always report
    # whatever text was received (None if the request itself failed).
    content = None
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that outputs strictly in JSON format."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"}
        )
        content = response.choices[0].message.content
        if content is None:
            # Report the missing body explicitly rather than through an
            # opaque AttributeError raised by calling .strip() on None.
            raise ValueError("model returned no message content")
        content = content.strip()
        return json.loads(content)
    except Exception as e:
        # Deliberately broad: one failed sample must not abort the whole run.
        print(f"⚠️ Error processing API response: {e}")
        return {"error": str(e), "raw_content": content}
def format_subclaims(subclaim_list, prefix):
    """Format subclaims with IDs (e.g., ST-1, GS-1) for better LLM tracking.

    Args:
        subclaim_list: List of subclaim texts. ``None`` yields an empty
            string; any other non-list value is returned as ``str(value)``.
        prefix: Label placed before each 1-based index.

    Returns:
        One ``"{prefix}-{n}: {text}"`` line per subclaim, joined by newlines.
    """
    if subclaim_list is None:
        # A null field must not leak the literal text "None" into the prompt.
        return ""
    if not isinstance(subclaim_list, list):
        return str(subclaim_list)
    return "\n".join(
        f"{prefix}-{number}: {text}"
        for number, text in enumerate(subclaim_list, start=1)
    )
# --- 3. Main Processing Loop ---
def _save_results(path, results):
    """Write *results* to *path* as UTF-8 JSON without leaving a partial file.

    The data is written to a sibling temporary file first and then moved over
    *path* with ``os.replace``. An interruption during the write therefore
    cannot truncate the previously saved results that the resume logic reads.
    The target directory is created if it does not exist yet.
    """
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


res = []
if os.path.exists(save_path):
    with open(save_path, "r", encoding="utf-8") as f:
        res = json.load(f)

# Start from where we left off: one saved entry per processed dataset item.
start_index = len(res)
num_to_process = 100
end_index = min(start_index + num_to_process, len(dataset))

try:
    for i in tqdm.tqdm(range(start_index, end_index)):
        item = dataset[i]

        # 1. Extract raw data
        source_text = item.get('fulltext', '')
        source_subclaims_list = item.get('fulltext_subclaims', [])
        gold_summary = item.get('summary', '')
        gold_subclaims_list = item.get('summary_subclaims', [])

        # 2. Format specifically for the prompt (Mapping IDs like ST-1, GS-1)
        # This helps the LLM return the IDs requested in the Output Format
        source_subclaims_formatted = format_subclaims(source_subclaims_list, "ST")
        gold_subclaims_formatted = format_subclaims(gold_subclaims_list, "GS")

        # 3. Inject into prompt. Null text fields are substituted as empty
        # strings because str.replace() rejects None.
        prompt = (
            prompt_template
            .replace("<<SOURCE_TEXT>>", source_text or "")
            .replace("<<SOURCE_TEXT_SUBCLAIMS>>", source_subclaims_formatted)
            .replace("<<GOLD_SUMMARY>>", gold_summary or "")
            .replace("<<GOLD_SUMMARY_SUBCLAIMS>>", gold_subclaims_formatted)
        )

        # 4. Call API
        api_response = openai_return(prompt)

        # 5. Build full result object (raw field values are stored unchanged)
        result_entry = {
            "index": i,
            "original_id": item.get('id'),
            "input_data": {
                "source_text": source_text,
                "source_subclaims": source_subclaims_list,
                "gold_summary": gold_summary,
                "gold_subclaims": gold_subclaims_list
            },
            "llm_output": api_response
        }
        res.append(result_entry)

        # Autosave every 5 samples
        if len(res) % 5 == 0:
            _save_results(save_path, res)
finally:
    # Final save. Runs on errors and interrupts too, so responses gathered
    # since the last autosave are not lost.
    _save_results(save_path, res)

print(f"\n✅ Finished! Processed {len(res) - start_index} new samples.")
print(f"Total samples in {save_path}: {len(res)}")