# sdlc-agent / src/data/clean_sample.py
# Author: Veeru-c
# initial commit (06bd253)
import pandas as pd
import re
import os
def clean_value(val):
    """Clean and normalize a single cell value.

    Strips whitespace, removes leading numeric codes (e.g. "13103_Tokyo"
    -> "Tokyo") and numpy repr wrappers (e.g. "np.int64(42)" -> "42").

    Parameters
    ----------
    val : object
        Raw cell value; anything pandas considers NA is treated as missing.

    Returns
    -------
    str or None
        The cleaned string, or None for missing/empty/"nan"/"none" values.
    """
    if pd.isna(val):
        return None
    val_str = str(val).strip()
    # Remove leading codes like "13103_"
    val_str = re.sub(r'^\d+_', '', val_str)
    # Remove numpy type wrappers
    val_str = re.sub(r'^np\.(int|float)\d*\((.+)\)$', r'\2', val_str)
    # Empty strings are already excluded by the truthiness check, so the
    # sentinel set only needs the textual NA spellings.
    return val_str if val_str and val_str.lower() not in {'nan', 'none'} else None
def find_header_row(df):
    """Locate the row that holds the real column headers.

    Scans at most the first 20 rows, skipping rows that are too sparse or
    dominated by "Unnamed" placeholders, and picks the first row where at
    least half of the populated cells look like text labels rather than
    plain numbers.

    Returns
    -------
    tuple
        (header_row_index, data_start_row_index), or (None, None) when no
        plausible header row is found.
    """

    def _looks_like_label(text):
        # A header cell is non-empty and not a bare number once common
        # numeric punctuation (dots, commas, dashes, spaces) is removed.
        digits_only = (
            text.replace('.', '')
                .replace(',', '')
                .replace('-', '')
                .replace(' ', '')
        )
        return bool(text) and not digits_only.isdigit()

    for idx in range(min(20, len(df))):
        candidate = df.iloc[idx]
        populated = candidate.count()

        # Too sparse to be a header row.
        if populated < len(df.columns) * 0.3:
            continue

        cells = [str(v).strip() for v in candidate if pd.notna(v)]

        # Mostly auto-generated "Unnamed" placeholders -> not a real header.
        if sum("Unnamed" in c for c in cells) > populated * 0.3:
            continue

        label_count = sum(1 for c in cells if _looks_like_label(c))
        if label_count >= populated * 0.5:
            return idx, idx + 1

    return None, None
def process_local_file(file_path, output_path="clean_sample.csv"):
    """Clean the raw sample CSV and write a normalized copy.

    The raw file has a two-row header layout (determined by manual
    inspection of the sample):
      - row index 7 holds the destination-area labels (columns 4+)
      - row index 9 holds the origin/metadata labels (columns 0-3)
      - data rows start at row index 10

    Parameters
    ----------
    file_path : str
        Path of the raw CSV to read.
    output_path : str, optional
        Where to write the cleaned CSV (default: "clean_sample.csv",
        matching the previous hard-coded behavior).
    """
    print(f"Processing {file_path}...")
    try:
        # Read just enough rows to reach both header rows.
        df_headers = pd.read_csv(file_path, header=None, nrows=15, low_memory=False)

        headers = []
        # Columns 0-3: metadata labels from row 9.
        row9 = df_headers.iloc[9]
        for i in range(4):
            val = clean_value(row9[i])
            headers.append(val if val else f"Meta_{i}")
        # Columns 4+: destination labels from row 7, prefixed for clarity.
        row7 = df_headers.iloc[7]
        for i in range(4, len(row7)):
            val = clean_value(row7[i])
            headers.append(f"Dest_{val}" if val else f"Col_{i}")

        # Re-read the file for the data portion only (rows 10+).
        df = pd.read_csv(file_path, header=None, skiprows=10, low_memory=False)

        # Reconcile header count with the actual data width.
        if len(df.columns) != len(headers):
            print(f"Column count mismatch: Data has {len(df.columns)}, Headers have {len(headers)}")
            if len(df.columns) < len(headers):
                headers = headers[:len(df.columns)]
            else:
                headers += [f"Extra_{i}" for i in range(len(headers), len(df.columns))]

        # Deduplicate headers. Track every name emitted so far so that a
        # suffixed candidate (e.g. "A_1") can never collide with an input
        # header of the same name -- the previous logic could still emit
        # duplicates when the input contained both "A" and "A_1".
        unique_headers = []
        seen_headers = {}
        for h in headers:
            if h in seen_headers:
                seen_headers[h] += 1
                candidate = f"{h}_{seen_headers[h]}"
                while candidate in seen_headers:
                    seen_headers[h] += 1
                    candidate = f"{h}_{seen_headers[h]}"
                seen_headers[candidate] = 0
                unique_headers.append(candidate)
            else:
                seen_headers[h] = 0
                unique_headers.append(h)
        df.columns = unique_headers

        # Normalize values; only object (string) columns need cleaning,
        # numeric columns are left untouched.
        for col in df.columns:
            if df[col].dtype == 'object':
                df[col] = df[col].apply(clean_value)

        df.to_csv(output_path, index=False)
        print(f"Saved clean CSV to {output_path}")
        print(f"Shape: {df.shape}")
        print("\nFirst 5 rows:")
        # Use to_string instead of to_markdown: to_markdown needs the
        # optional 'tabulate' package and raises ImportError when absent,
        # which aborted the preview even though the CSV was already saved.
        print(df.iloc[:5, :5].to_string(index=False))
    except Exception as e:
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    # Run the cleaner against the default sample file when executed as a script.
    process_local_file("sample.csv")