Spaces:

egumasa
/

simple-text-analyzer

Building

App Files Files Community

simple-text-analyzer / test /test_csv_comma_handling.py

egumasa

tag set

025fa56 7 months ago

raw

history blame contribute delete

3.15 kB

	#!/usr/bin/env python3
	"""
	Test script to demonstrate CSV comma handling with and without quoting parameters.
	"""

	import pandas as pd
	import csv
	from io import StringIO

	def test_csv_comma_handling():
	    """Print how pandas parses CSV rows whose first field contains a comma.

	    Three reads are attempted and reported on stdout:

	    1. ``pd.read_csv`` with default arguments on unquoted data.
	    2. The same unquoted data with ``quoting``/``quotechar`` passed explicitly.
	    3. ``pd.read_csv`` with default arguments on data whose comma-bearing
	       fields are wrapped in double quotes.

	    Any exception raised while reading or reporting a case is caught and
	    printed as an ``Error:`` line so the remaining cases still run. The
	    function returns nothing and asserts nothing; it is a demonstration.
	    """
	    # Unquoted sample: the "$ 100,000" row carries one more comma than the header.
	    unquoted_text = "\n".join((
	        "word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6",
	        "murder in,951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987",
	        "$ 100,000,950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172",
	        "normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0",
	    ))

	    # Same rows, with the multi-word / comma-bearing first fields quoted.
	    quoted_text = "\n".join((
	        "word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6",
	        '"murder in",951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987',
	        '"$ 100,000",950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172',
	        "normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0",
	    ))

	    def first_row_data(frame):
	        return f" First row data: {frame.iloc[0].tolist()}"

	    def first_row_word(frame):
	        return f" First row word: '{frame.iloc[0]['word']}'"

	    def second_row_word(frame):
	        return f" Second row word: '{frame.iloc[1]['word']}'"

	    def report(load_frame, row_renderers):
	        # The read and every print stay inside the try so that a failure
	        # part-way through still ends with the " Error: ..." line.
	        try:
	            frame = load_frame()
	            print(f" Columns detected: {len(frame.columns)}")
	            print(f" Column names: {list(frame.columns)}")
	            for render in row_renderers:
	                print(render(frame))
	            print(f" Shape: {frame.shape}")
	        except Exception as err:
	            print(f" Error: {err}")
	        print()

	    print("=== Testing CSV Comma Handling ===\n")

	    # Case 1: default read of the unquoted data.
	    print("1. Default pandas behavior with problematic CSV:")
	    report(lambda: pd.read_csv(StringIO(unquoted_text)), (first_row_data,))

	    # Case 2: explicit quoting arguments on the same unquoted data.
	    # NOTE(review): pandas documents QUOTE_MINIMAL and '"' as the read_csv
	    # defaults, so this read is expected to behave like case 1 -- confirm
	    # that this is the intended demonstration.
	    print("2. With quoting parameters (our solution):")
	    report(
	        lambda: pd.read_csv(
	            StringIO(unquoted_text), quoting=csv.QUOTE_MINIMAL, quotechar='"'
	        ),
	        (first_row_data,),
	    )

	    # Case 3: default read of the quoted data.
	    print("3. With properly quoted CSV data:")
	    report(
	        lambda: pd.read_csv(StringIO(quoted_text)),
	        (first_row_word, second_row_word),
	    )

	    # Case 4: fixed summary text.
	    for summary_line in (
	        "4. Comparison of approaches:",
	        " Without quoting: Data with commas gets split incorrectly",
	        " With quoting: pandas can handle quoted fields properly",
	        " Best practice: Quote fields that contain commas in the source CSV",
	    ):
	        print(summary_line)

	# Run the demonstration only when this file is executed as a script.
	if __name__ == "__main__":
	    test_csv_comma_handling()