#!/usr/bin/env python3
"""
Test script to demonstrate CSV comma handling with and without quoting parameters.
"""

import csv
from io import StringIO

import pandas as pd

# Unquoted sample: the second data row's first field ("$ 100,000") contains a
# bare comma, so that row tokenizes into 11 fields under a 10-column header.
_PROBLEMATIC_CSV = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
murder in,951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
$ 100,000,950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""

# Same rows, but the fields containing spaces/commas are wrapped in double
# quotes so the embedded comma stays inside a single field.
_QUOTED_CSV = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
"murder in",951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
"$ 100,000",950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""


def _first_row_data(frame):
    """Return the report line showing the first row of *frame* as a list."""
    return [f" First row data: {frame.iloc[0].tolist()}"]


def _first_two_words(frame):
    """Return report lines showing the 'word' value of the first two rows."""
    return [
        f" First row word: '{frame.iloc[0]['word']}'",
        f" Second row word: '{frame.iloc[1]['word']}'",
    ]


def _report_parse(csv_text, detail_lines, **read_csv_kwargs):
    """Parse *csv_text* with pandas and print a short report, then a blank line.

    Args:
        csv_text: CSV content to feed to ``pd.read_csv``.
        detail_lines: Callable taking the parsed DataFrame and returning the
            report lines to print between the column names and the shape.
        **read_csv_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    A ``pd.errors.ParserError`` is reported as an ``Error:`` line instead of
    propagating; any other exception is left to surface.
    """
    # Keep the try body to the one call that is expected to fail.
    try:
        frame = pd.read_csv(StringIO(csv_text), **read_csv_kwargs)
    except pd.errors.ParserError as exc:
        print(f" Error: {exc}")
    else:
        print(f" Columns detected: {len(frame.columns)}")
        print(f" Column names: {list(frame.columns)}")
        for line in detail_lines(frame):
            print(line)
        print(f" Shape: {frame.shape}")
    print()


def test_csv_comma_handling():
    """Test how different CSV reading approaches handle commas in data.

    Prints a four-part report to stdout and returns ``None``:
    the unquoted sample read with default options, the same sample read with
    explicit quoting options, the properly quoted sample, and a summary.
    """
    print("=== Testing CSV Comma Handling ===\n")

    # Test 1: Default pandas behavior (problematic)
    print("1. Default pandas behavior with problematic CSV:")
    _report_parse(_PROBLEMATIC_CSV, _first_row_data)

    # Test 2: With quoting parameters.
    # NOTE(review): QUOTE_MINIMAL and '"' are the read_csv defaults, so this
    # call parses exactly like test 1. Reader-side options cannot repair an
    # unquoted comma; only quoting the source data (test 3) does.
    print("2. With quoting parameters (our solution):")
    _report_parse(
        _PROBLEMATIC_CSV,
        _first_row_data,
        quoting=csv.QUOTE_MINIMAL,
        quotechar='"',
    )

    # Test 3: With properly quoted CSV
    print("3. With properly quoted CSV data:")
    _report_parse(_QUOTED_CSV, _first_two_words)

    # Test 4: Show the difference
    print("4. Comparison of approaches:")
    print(" Without quoting: Data with commas gets split incorrectly")
    print(" With quoting: pandas can handle quoted fields properly")
    print(" Best practice: Quote fields that contain commas in the source CSV")


if __name__ == "__main__":
    test_csv_comma_handling()