# Spaces: justhariharan / Review-RAG (Sleeping)
# File size: 3,870 Bytes
# 1d70196

import xml.etree.ElementTree as ET
import pandas as pd
import os
import argparse
import requests

# Aspect categories retained when parsing; annotations whose category is not
# listed here are dropped.
ASPECT_CATEGORIES = [
    'food',
    'service',
    'ambiance',
    'price',
    'anecdotes/miscellaneous',
]

def download_file(url: str, dest_path: str, timeout: float = 30.0):
    """Download *url* and save the response body to *dest_path*.

    Args:
        url: Address to fetch with an HTTP GET.
        dest_path: File path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a non-success HTTP status (via ``raise_for_status``).
        OSError: If the destination cannot be written.
    """
    print(f"Downloading from {url}...")
    # Without an explicit timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file at dest_path that a later
    # "already downloaded?" existence check would mistake for a complete one.
    tmp_path = f"{dest_path}.part"
    try:
        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        os.replace(tmp_path, dest_path)
    finally:
        # Only still present if the write or rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Saved to {dest_path}")

def parse_semeval_xml(xml_path: str) -> pd.DataFrame:
    """Flatten an aspect-category XML file into one row per (sentence, aspect) pair.

    Every ``<sentence>`` element under the root is visited. A sentence
    contributes one row for each ``<aspectCategory>`` child of its
    ``<aspectCategories>`` element whose ``category`` is in
    ``ASPECT_CATEGORIES`` and whose ``polarity`` is one of ``positive``,
    ``negative``, ``neutral`` or ``conflict``. Sentences with missing or blank
    ``<text>`` are skipped.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A DataFrame with the columns ``text``, ``aspect`` and ``sentiment``.
        The columns are present even when no pair was extracted.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    print(f"Parsing: {xml_path}")
    root = ET.parse(xml_path).getroot()

    columns = ['text', 'aspect', 'sentiment']
    # Built once instead of on every aspectCategory iteration.
    valid_polarities = ('positive', 'negative', 'neutral', 'conflict')
    rows = []

    for sentence in root.findall('.//sentence'):
        text_node = sentence.find('text')
        if text_node is None or not text_node.text:
            continue

        text = text_node.text.strip()
        if not text:
            # Whitespace-only text would otherwise yield rows with an empty sentence.
            continue

        aspects_node = sentence.find('aspectCategories')
        if aspects_node is None:
            continue

        for aspect_cat in aspects_node.findall('aspectCategory'):
            category = aspect_cat.get('category')
            polarity = aspect_cat.get('polarity')

            if category in ASPECT_CATEGORIES and polarity in valid_polarities:
                rows.append({
                    'text': text,
                    'aspect': category,
                    'sentiment': polarity,
                })

    # Passing the columns explicitly keeps the schema stable for an empty
    # result, so callers (and the summary below) can always index by name.
    df = pd.DataFrame(rows, columns=columns)
    print(f"Extracted {len(df)} aspect-sentiment pairs from {df['text'].nunique()} unique sentences.")
    return df

def main():
    """Command-line entry point: build train/test CSVs from the restaurant XML.

    Parses ``--out_dir`` and ``--raw_dir`` and creates both directories. The
    training XML is taken from ``dataset/Restaurants_Train_v2.xml`` if that
    file exists, otherwise from ``raw_dir`` (downloading it there first when it
    is not already present). The parsed rows are split 80/20 with a fixed
    random seed and written to ``train.csv`` and ``test.csv`` in ``out_dir``.

    A download failure, or a parse that yields too few rows to split, is
    reported on stdout and the function returns without writing any CSV.
    """
    parser = argparse.ArgumentParser(description="Convert SemEval XML to CSV for RoBERTa fine-tuning.")
    parser.add_argument('--out_dir', type=str, default='data/processed', help='Directory to save CSVs')
    parser.add_argument('--raw_dir', type=str, default='data/raw', help='Directory to save raw XMLs')

    args = parser.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    os.makedirs(args.raw_dir, exist_ok=True)

    # A reliable mirror for the ABSA dataset
    train_url = "https://s3.amazonaws.com/fast-ai-nlp/semeval2014_task4/Restaurants_Train_v2.xml"

    local_xml = 'dataset/Restaurants_Train_v2.xml'
    train_xml = os.path.join(args.raw_dir, 'Restaurants_Train_v2.xml')

    if os.path.exists(local_xml):
        print("Found local dataset directory!")
        train_xml = local_xml
    elif not os.path.exists(train_xml):
        # Only the download can raise RequestException, so only it is guarded.
        try:
            download_file(train_url, train_xml)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download dataset: {e}")
            # Point at the directory actually in use (--raw_dir may differ from
            # the default) and name only the file this script reads.
            print(f"Please manually place 'Restaurants_Train_v2.xml' in the {args.raw_dir} folder.")
            return

    # We only strictly require the training file since test phase B is missing 'polarity'
    full_df = parse_semeval_xml(train_xml)

    # train_test_split raises ValueError when either side of the split would be
    # empty; report that case clearly instead of crashing.
    if len(full_df) < 2:
        print(f"Only {len(full_df)} aspect-sentiment pairs found in {train_xml}; need at least 2 to split.")
        return

    # Imported here, as in the original script, so scikit-learn is only needed
    # once there is data to split.
    from sklearn.model_selection import train_test_split
    train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=42)

    out_path_train = os.path.join(args.out_dir, 'train.csv')
    train_df.to_csv(out_path_train, index=False)
    print(f"Saved training data ({len(train_df)} rows) to: {out_path_train}")

    out_path_test = os.path.join(args.out_dir, 'test.csv')
    test_df.to_csv(out_path_test, index=False)
    print(f"Saved testing data ({len(test_df)} rows) to: {out_path_test}")

    print("\nData processing complete! You can now upload train.csv and test.csv to Colab.")

# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()