Spaces:

Sina1138
/

ReView

Sleeping

App Files Files Community

ReView / glimpse-ui /data /ExtractDISAPEREData.py

Sina1138

Super-squash branch 'main' using huggingface_hub

6fe7180 6 months ago

raw

history blame contribute delete

3.99 kB

	import os
	import json
	import pandas as pd
	from pathlib import Path

	# Locations are resolved relative to this script so the current working
	# directory does not matter.
	_SCRIPT_DIR = Path(__file__).resolve().parent
	BASE_DIR = _SCRIPT_DIR.parent

	# Input: the DISAPERE "final_dataset" folder (one sub-folder per split is read
	# from it further down in this file).
	base_path = BASE_DIR.joinpath("data", "DISAPERE-main", "DISAPERE", "final_dataset")

	# Output: folder that receives the extracted CSV files.
	output_path = BASE_DIR.joinpath("data", "DISAPERE-main", "SELFExtractedData")

	###################################################################################
	###################################################################################

	# EXTRACTING POLARITY SENTENCES FROM DISAPERE DATASET

	# def extract_polarity_sentences(json_dir):
	# data = []
	# for filename in os.listdir(json_dir):
	# if filename.endswith(".json"):
	# with open(os.path.join(json_dir, filename), "r") as f:
	# thread = json.load(f)
	# for sentence in thread.get("review_sentences", []):
	# text = sentence.get("text", "").strip()
	# polarity = sentence.get("polarity")
	# if text:
	# if polarity == "pol_positive":
	# label = 2
	# elif polarity == "pol_negative":
	# label = 0
	# else:
	# label = 1
	# data.append({"text": text, "label": label})
	# return pd.DataFrame(data)

	# # Extract and save each split
	# for split in ["train", "dev", "test"]:
	# df = extract_polarity_sentences(os.path.join(base_path, split))
	# out_file = os.path.join(output_path, f"disapere_polarity_{split}.csv")
	# df.to_csv(out_file, index=False)
	# print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")


	###################################################################################
	###################################################################################

	# 2. EXTRACTING TOPIC SENTENCES FROM DISAPERE DATASET
	#
	# === Topic Label Mapping (NOTE: this 13-entry listing does not match the 8-class label_map built from topic_classes below) ===
	# 1: "Structuring"
	# 0: "Evaluative"
	# 2: "Request"
	# 3: "Fact"
	# 4: "Social"
	# 5: "Other"
	# 6: "Substance"
	# 7: "Clarity"
	# 8: "Soundness/Correctness"
	# 9: "Originality"
	# 10: "Motivation/Impact"
	# 11: "Meaningful Comparison"
	# 12: "Replicability"

	# Final topic classes
	# Aspect tags kept as distinct classes. The position of a tag in this list is
	# the integer label assigned to it.
	topic_classes = [
	    "asp_substance",
	    "asp_clarity",
	    "asp_soundness-correctness",
	    "asp_originality",
	    "asp_impact",
	    "asp_comparison",
	    "asp_replicability",
	    "None",  # Catch-all for sentences whose aspect is not one of the tags above
	    # "arg-structuring_summary"  (currently disabled)
	]

	# Tag -> integer label, numbered in list order starting at 0.
	label_map = dict(zip(topic_classes, range(len(topic_classes))))

	def extract_topic_sentences(json_dir):
	    """Collect labelled review sentences from every ``.json`` file in a directory.

	    Each file is parsed as JSON and its ``"review_sentences"`` list is scanned.
	    A sentence's ``"aspect"`` value is used as its topic when it is a key of
	    the module-level ``label_map``; otherwise the topic falls back to
	    ``"None"``. Sentences whose text is empty after stripping whitespace are
	    skipped.

	    Args:
	        json_dir: Directory (``str`` or path-like) containing the JSON files.

	    Returns:
	        A ``pandas.DataFrame`` with the columns ``"text"`` and ``"label"``,
	        one row per kept sentence. The columns are present even when no
	        sentence was found.

	    Raises:
	        OSError: If ``json_dir`` or one of its files cannot be read.
	        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
	    """
	    rows = []
	    # os.listdir returns entries in arbitrary order; sorting makes the row
	    # order of the result (and of any CSV written from it) reproducible.
	    for filename in sorted(os.listdir(json_dir)):
	        if not filename.endswith(".json"):
	            continue
	        # Explicit UTF-8 so decoding does not depend on the platform locale.
	        with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
	            thread = json.load(f)
	        for sentence in thread.get("review_sentences", []):
	            # `or ""` also covers an explicit JSON null for "text".
	            text = (sentence.get("text") or "").strip()
	            aspect = sentence.get("aspect", "")

	            # Unknown or missing aspects go to the catch-all class.
	            topic = aspect if aspect in label_map else "None"

	            # The membership test still matters if "None" is ever removed
	            # from label_map: such sentences are then dropped, not mislabelled.
	            if text and topic in label_map:
	                rows.append({"text": text, "label": label_map[topic]})
	    # Fixed columns keep the schema stable for an empty result.
	    return pd.DataFrame(rows, columns=["text", "label"])

	# Extract and save each split.
	# DataFrame.to_csv does not create missing directories, so make sure the
	# output folder exists before the first write.
	os.makedirs(output_path, exist_ok=True)
	for split in ["train", "dev", "test"]:
	    # Each split lives in its own sub-folder of the dataset directory.
	    df = extract_topic_sentences(os.path.join(base_path, split))
	    out_file = os.path.join(output_path, f"disapere_topic_{split}.csv")
	    df.to_csv(out_file, index=False)
	    print(f"{split.capitalize()} saved to {out_file}: {len(df)} samples")

	###################################################################################
	###################################################################################