import xml.etree.ElementTree as ET import pandas as pd import os import argparse import requests ASPECT_CATEGORIES = ['food', 'service', 'ambiance', 'price', 'anecdotes/miscellaneous'] def download_file(url: str, dest_path: str): print(f"Downloading from {url}...") response = requests.get(url) response.raise_for_status() with open(dest_path, 'wb') as f: f.write(response.content) print(f"Saved to {dest_path}") def parse_semeval_xml(xml_path: str) -> pd.DataFrame: print(f"Parsing: {xml_path}") tree = ET.parse(xml_path) root = tree.getroot() rows = [] for sentence in root.findall('.//sentence'): text_node = sentence.find('text') if text_node is None or not text_node.text: continue text = text_node.text.strip() aspects_node = sentence.find('aspectCategories') if aspects_node is not None: for aspect_cat in aspects_node.findall('aspectCategory'): category = aspect_cat.get('category') polarity = aspect_cat.get('polarity') if category in ASPECT_CATEGORIES and polarity in ['positive', 'negative', 'neutral', 'conflict']: rows.append({ 'text': text, 'aspect': category, 'sentiment': polarity }) df = pd.DataFrame(rows) print(f"Extracted {len(df)} aspect-sentiment pairs from {len(df['text'].unique()) if len(df) > 0 else 0} unique sentences.") return df def main(): parser = argparse.ArgumentParser(description="Convert SemEval XML to CSV for RoBERTa fine-tuning.") parser.add_argument('--out_dir', type=str, default='data/processed', help='Directory to save CSVs') parser.add_argument('--raw_dir', type=str, default='data/raw', help='Directory to save raw XMLs') args = parser.parse_args() os.makedirs(args.out_dir, exist_ok=True) os.makedirs(args.raw_dir, exist_ok=True) # A reliable mirror for the ABSA dataset train_url = "https://s3.amazonaws.com/fast-ai-nlp/semeval2014_task4/Restaurants_Train_v2.xml" test_url = "https://s3.amazonaws.com/fast-ai-nlp/semeval2014_task4/Restaurants_Test_Gold.xml" train_xml = os.path.join(args.raw_dir, 'Restaurants_Train_v2.xml') test_xml = os.path.join(args.raw_dir, 'Restaurants_Test_Gold.xml') try: if os.path.exists('dataset/Restaurants_Train_v2.xml'): print("Found local dataset directory!") train_xml = 'dataset/Restaurants_Train_v2.xml' else: if not os.path.exists(train_xml): download_file(train_url, train_xml) # We only strictly require the training file since test phase B is missing 'polarity' full_df = parse_semeval_xml(train_xml) from sklearn.model_selection import train_test_split train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=42) out_path_train = os.path.join(args.out_dir, 'train.csv') train_df.to_csv(out_path_train, index=False) print(f"Saved training data ({len(train_df)} rows) to: {out_path_train}") out_path_test = os.path.join(args.out_dir, 'test.csv') test_df.to_csv(out_path_test, index=False) print(f"Saved testing data ({len(test_df)} rows) to: {out_path_test}") print("\nData processing complete! You can now upload train.csv and test.csv to Colab.") except requests.exceptions.RequestException as e: print(f"Failed to download dataset: {e}") print("Please manually place 'Restaurants_Train_v2.xml' and 'Restaurants_Test_Gold.xml' in the data/raw folder.") if __name__ == "__main__": main()