Spaces:

justhariharan
/

Review-RAG

Sleeping

Review-RAG / src /data_processing.py

HariHaran9597

Initial commit

1d70196 14 days ago

3.87 kB

	import xml.etree.ElementTree as ET
	import pandas as pd
	import os
	import argparse
	import requests

	# Aspect categories kept by parse_semeval_xml: <aspectCategory> elements whose
	# `category` attribute is not in this list are dropped.
	ASPECT_CATEGORIES = ['food', 'service', 'ambiance', 'price', 'anecdotes/miscellaneous']

	def download_file(url: str, dest_path: str, timeout: float = 30.0) -> None:
	    """Download ``url`` and write the response body to ``dest_path``.

	    Args:
	        url: Address to fetch with an HTTP GET.
	        dest_path: File the downloaded bytes are written to. It is opened
	            in binary write mode, so an existing file is overwritten.
	        timeout: Seconds to wait on the server before giving up. Without
	            a timeout ``requests`` may block indefinitely on a host that
	            stops responding.

	    Raises:
	        requests.exceptions.RequestException: On connection problems,
	            timeouts, or an HTTP error status (via ``raise_for_status``).
	    """
	    print(f"Downloading from {url}...")
	    # Bound the wait so a stalled server cannot hang the script forever.
	    response = requests.get(url, timeout=timeout)
	    # Fail before touching the filesystem: an error response must not
	    # create or overwrite dest_path.
	    response.raise_for_status()
	    with open(dest_path, 'wb') as f:
	        f.write(response.content)
	    print(f"Saved to {dest_path}")

	def parse_semeval_xml(xml_path: str) -> pd.DataFrame:
	    """Flatten a SemEval aspect-category XML file into one row per label.

	    Every ``<sentence>`` element is inspected. For each ``<aspectCategory>``
	    under its ``<aspectCategories>`` child, a row is produced when the
	    ``category`` attribute is in ``ASPECT_CATEGORIES`` and the ``polarity``
	    attribute is one of positive / negative / neutral / conflict.
	    Sentences whose ``<text>`` is missing, empty, or whitespace-only are
	    skipped.

	    Args:
	        xml_path: Path of the XML file to parse.

	    Returns:
	        DataFrame with the columns ``text``, ``aspect`` and ``sentiment``.
	        The columns are present even when no pair was extracted.
	    """
	    columns = ['text', 'aspect', 'sentiment']
	    # Built once instead of on every loop iteration.
	    valid_polarities = ('positive', 'negative', 'neutral', 'conflict')

	    print(f"Parsing: {xml_path}")
	    root = ET.parse(xml_path).getroot()

	    rows = []
	    for sentence in root.findall('.//sentence'):
	        text_node = sentence.find('text')
	        if text_node is None or not text_node.text:
	            continue

	        # Strip before testing so whitespace-only text counts as empty.
	        text = text_node.text.strip()
	        if not text:
	            continue

	        aspects_node = sentence.find('aspectCategories')
	        if aspects_node is None:
	            continue

	        for aspect_cat in aspects_node.findall('aspectCategory'):
	            category = aspect_cat.get('category')
	            polarity = aspect_cat.get('polarity')
	            if category in ASPECT_CATEGORIES and polarity in valid_polarities:
	                rows.append({
	                    'text': text,
	                    'aspect': category,
	                    'sentiment': polarity,
	                })

	    # Explicit columns keep the schema stable when `rows` is empty, so the
	    # summary below and any downstream column access work unconditionally.
	    df = pd.DataFrame(rows, columns=columns)
	    print(f"Extracted {len(df)} aspect-sentiment pairs from {df['text'].nunique()} unique sentences.")
	    return df

	def main():
	    """Build train/test CSVs from the SemEval-2014 restaurant training XML.

	    Command-line flags:
	        --out_dir: Directory the CSVs are written to (default
	            ``data/processed``).
	        --raw_dir: Directory the raw XML is downloaded to (default
	            ``data/raw``).

	    A copy at ``dataset/Restaurants_Train_v2.xml`` is used when present;
	    otherwise the file is downloaded into ``raw_dir`` unless it is already
	    there. The extracted pairs are split 80/20 with a fixed seed and saved
	    as ``train.csv`` and ``test.csv`` in ``out_dir``.

	    Raises:
	        SystemExit: If fewer than two aspect-sentiment pairs were
	            extracted, since no train/test split is possible.
	    """
	    parser = argparse.ArgumentParser(description="Convert SemEval XML to CSV for RoBERTa fine-tuning.")
	    parser.add_argument('--out_dir', type=str, default='data/processed', help='Directory to save CSVs')
	    parser.add_argument('--raw_dir', type=str, default='data/raw', help='Directory to save raw XMLs')

	    args = parser.parse_args()

	    os.makedirs(args.out_dir, exist_ok=True)
	    os.makedirs(args.raw_dir, exist_ok=True)

	    # Download location of the training XML.
	    train_url = "https://s3.amazonaws.com/fast-ai-nlp/semeval2014_task4/Restaurants_Train_v2.xml"
	    local_copy = 'dataset/Restaurants_Train_v2.xml'
	    train_xml = os.path.join(args.raw_dir, 'Restaurants_Train_v2.xml')

	    if os.path.exists(local_copy):
	        print("Found local dataset directory!")
	        train_xml = local_copy
	    elif not os.path.exists(train_xml):
	        # Guard only the network call so unrelated failures are not
	        # reported as download problems.
	        try:
	            download_file(train_url, train_xml)
	        except requests.exceptions.RequestException as e:
	            print(f"Failed to download dataset: {e}")
	            # Point at the directory actually in use, not the default.
	            print(f"Please manually place 'Restaurants_Train_v2.xml' in the {args.raw_dir} folder.")
	            return

	    # Only the training file is read; the test set is split off from it.
	    full_df = parse_semeval_xml(train_xml)

	    # train_test_split cannot produce two non-empty parts from fewer than
	    # two rows; stop with a clear message instead of an opaque ValueError.
	    if len(full_df) < 2:
	        raise SystemExit(
	            f"Found only {len(full_df)} aspect-sentiment pairs in {train_xml}; "
	            "need at least 2 to create a train/test split."
	        )

	    from sklearn.model_selection import train_test_split
	    train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=42)

	    out_path_train = os.path.join(args.out_dir, 'train.csv')
	    train_df.to_csv(out_path_train, index=False)
	    print(f"Saved training data ({len(train_df)} rows) to: {out_path_train}")

	    out_path_test = os.path.join(args.out_dir, 'test.csv')
	    test_df.to_csv(out_path_test, index=False)
	    print(f"Saved testing data ({len(test_df)} rows) to: {out_path_test}")

	    print("\nData processing complete! You can now upload train.csv and test.csv to Colab.")

	# Run the conversion only when this file is executed as a script, so
	# importing the module has no side effects.
	if __name__ == "__main__":
	    main()