simple-text-analyzer / test /test_csv_comma_handling.py
egumasa's picture
tag set
025fa56
#!/usr/bin/env python3
"""
Test script to demonstrate CSV comma handling with and without quoting parameters.
"""
import pandas as pd
import csv
from io import StringIO
def _try_read_csv(csv_text, **read_csv_kwargs):
    """Parse *csv_text* with ``pd.read_csv``.

    Returns the resulting DataFrame, or ``None`` after printing the parser
    error when pandas rejects the input. Only ``ParserError`` is handled so
    that unrelated bugs surface instead of being reported as CSV problems.
    """
    try:
        return pd.read_csv(StringIO(csv_text), **read_csv_kwargs)
    except pd.errors.ParserError as exc:
        print(f" Error: {exc}")
        return None


def _print_frame_summary(frame, detail_lines):
    """Print column count, column names, *detail_lines*, then the shape."""
    print(f" Columns detected: {len(frame.columns)}")
    print(f" Column names: {list(frame.columns)}")
    for line in detail_lines:
        print(line)
    print(f" Shape: {frame.shape}")


def test_csv_comma_handling():
    """Demonstrate how ``pd.read_csv`` copes with commas inside field values.

    Three parses are reported on stdout:

    1. an unquoted CSV with default settings,
    2. the same CSV with explicit ``quoting``/``quotechar`` arguments,
    3. a CSV whose comma-containing field is properly quoted.

    The properly quoted case is also asserted, so the function can fail when
    run under pytest instead of only printing.

    Raises:
        AssertionError: if the properly quoted CSV does not parse to the
            expected 3x10 frame with the ``word`` values intact.
    """
    # Sample problematic CSV data: the "$ 100,000" row has an unquoted comma,
    # so it carries 11 fields against a 10-column header.
    problematic_csv = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
murder in,951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
$ 100,000,950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""
    # Properly quoted CSV data: same rows, comma-bearing field wrapped in quotes.
    quoted_csv = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
"murder in",951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
"$ 100,000",950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""

    print("=== Testing CSV Comma Handling ===\n")

    # Test 1: Default pandas behavior (problematic)
    print("1. Default pandas behavior with problematic CSV:")
    df_default = _try_read_csv(problematic_csv)
    if df_default is not None:
        _print_frame_summary(
            df_default,
            [f" First row data: {df_default.iloc[0].tolist()}"],
        )
    print()

    # Test 2: With quoting parameters.
    # NOTE: QUOTE_MINIMAL and '"' are already read_csv's defaults, so this
    # parse is expected to behave exactly like Test 1; quoting arguments
    # cannot repair a source file whose fields were never quoted.
    print("2. With quoting parameters (our solution):")
    df_quoted = _try_read_csv(
        problematic_csv,
        quoting=csv.QUOTE_MINIMAL,
        quotechar='"',
    )
    if df_quoted is not None:
        _print_frame_summary(
            df_quoted,
            [f" First row data: {df_quoted.iloc[0].tolist()}"],
        )
    print()

    # Test 3: With properly quoted CSV
    print("3. With properly quoted CSV data:")
    df_proper = _try_read_csv(quoted_csv)
    if df_proper is not None:
        _print_frame_summary(
            df_proper,
            [
                f" First row word: '{df_proper.iloc[0]['word']}'",
                f" Second row word: '{df_proper.iloc[1]['word']}'",
            ],
        )
    print()

    # Test 4: Show the difference
    print("4. Comparison of approaches:")
    print(" Without quoting: Data with commas gets split incorrectly")
    print(" With quoting: pandas can handle quoted fields properly")
    print(" Best practice: Quote fields that contain commas in the source CSV")

    # Turn the demonstration into a real check: quoted fields must survive
    # parsing with their embedded comma and without shifting any column.
    assert df_proper is not None, "properly quoted CSV failed to parse"
    assert df_proper.shape == (3, 10)
    assert df_proper["word"].tolist() == ["murder in", "$ 100,000", "normal_word"]
# Run the demonstration when the file is executed directly as a script.
if __name__ == "__main__":
    test_csv_comma_handling()