Spaces:
Building
Building
| #!/usr/bin/env python3 | |
| """ | |
| Test script to demonstrate CSV comma handling with and without quoting parameters. | |
| """ | |
| import pandas as pd | |
| import csv | |
| from io import StringIO | |
def _read_or_report(csv_text, **read_kwargs):
    """Parse *csv_text* with ``pd.read_csv``.

    Returns the resulting DataFrame, or ``None`` after printing the parser
    error when pandas cannot tokenize the text.
    """
    # Keep the try body to the single call that can raise a parse error.
    try:
        return pd.read_csv(StringIO(csv_text), **read_kwargs)
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
        print(f" Error: {exc}")
        return None


def _print_summary(df, detail_lines):
    """Print column count, column names, the given detail lines, and shape."""
    print(f" Columns detected: {len(df.columns)}")
    print(f" Column names: {list(df.columns)}")
    for line in detail_lines:
        print(line)
    print(f" Shape: {df.shape}")


def test_csv_comma_handling():
    """Demonstrate how pandas handles commas inside CSV fields.

    Parses the same three records twice: once with the ``word`` field left
    unquoted (so ``$ 100,000`` contributes an extra comma) and once with the
    field properly quoted. Results are printed to stdout; nothing is returned.
    """
    header = "word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6"
    stats = (
        "951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,"
        "8.03264644343,21.3160999278,0.0364941871107,657.479274987",
        "950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,"
        "13.3735638208,30.7765166526,0.0169430040291,949.18097172",
        "800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0",
    )
    raw_words = ("murder in", "$ 100,000", "normal_word")
    quoted_words = ('"murder in"', '"$ 100,000"', "normal_word")

    # Sample problematic CSV data: the second record has an unquoted comma.
    problematic_csv = "\n".join(
        [header, *(f"{word},{nums}" for word, nums in zip(raw_words, stats))]
    )
    # Properly quoted CSV data: identical records, word field quoted.
    quoted_csv = "\n".join(
        [header, *(f"{word},{nums}" for word, nums in zip(quoted_words, stats))]
    )

    print("=== Testing CSV Comma Handling ===\n")

    # Test 1: Default pandas behavior (problematic)
    print("1. Default pandas behavior with problematic CSV:")
    df_default = _read_or_report(problematic_csv)
    if df_default is not None:
        _print_summary(
            df_default, [f" First row data: {df_default.iloc[0].tolist()}"]
        )
    print()

    # Test 2: Explicit quoting parameters. QUOTE_MINIMAL and '"' are already
    # the read_csv defaults, so this behaves exactly like Test 1: reader-side
    # quoting options cannot repair commas that were never quoted.
    print("2. With explicit quoting parameters (same as pandas defaults):")
    df_quoted = _read_or_report(
        problematic_csv, quoting=csv.QUOTE_MINIMAL, quotechar='"'
    )
    if df_quoted is not None:
        _print_summary(
            df_quoted, [f" First row data: {df_quoted.iloc[0].tolist()}"]
        )
    print()

    # Test 3: With properly quoted CSV
    print("3. With properly quoted CSV data:")
    df_proper = _read_or_report(quoted_csv)
    if df_proper is not None:
        _print_summary(
            df_proper,
            [
                f" First row word: '{df_proper.iloc[0]['word']}'",
                f" Second row word: '{df_proper.iloc[1]['word']}'",
            ],
        )
    print()

    # Test 4: Show the difference
    print("4. Comparison of approaches:")
    print(" Without quoting: Data with commas gets split incorrectly")
    print(" With quoting: pandas can handle quoted fields properly")
    print(" Best practice: Quote fields that contain commas in the source CSV")
    print(
        " Note: quoting=csv.QUOTE_MINIMAL and quotechar='\"' are pandas "
        "defaults; they cannot repair unquoted data"
    )
# Script entry point: run the demonstration only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    test_csv_comma_handling()