Spaces:

Agents-MCP-Hackathon
/

supplier-performance-analysis-agent

Sleeping

App Files Files Community

supplier-performance-analysis-agent / agent /utils.py

kamalakn

Create agent/utils.py

6e2c94a verified 9 months ago

raw

history blame contribute delete

14.3 kB

	import pandas as pd
	import numpy as np
	from docx import Document
	import tempfile
	import os
	import re
	from typing import List, Dict, Any, Union, Optional
	import io
	import matplotlib.pyplot as plt
	#import seaborn as sns

	class FileProcessor:
	    """Utility class for handling file operations"""

	    @staticmethod
	    def read_excel_file(file) -> pd.DataFrame:
	        """
	        Read Excel or CSV file and return DataFrame

	        Args:
	            file: File-like object exposing a ``name`` attribute (for example
	                an upload handle), or a plain path. The extension of the name
	                selects the parser and is matched case-insensitively.

	        Returns:
	            The parsed DataFrame, with surrounding whitespace removed from
	            every string column name.

	        Raises:
	            Exception: If the extension is unsupported or parsing fails. The
	                underlying error is chained as ``__cause__``.
	        """
	        # Upload handles carry ``.name``; a bare path is its own name.
	        file_name = str(getattr(file, "name", file))
	        lowered_name = file_name.lower()

	        try:
	            if lowered_name.endswith('.csv'):
	                df = pd.read_csv(file)
	            elif lowered_name.endswith(('.xlsx', '.xls')):
	                df = pd.read_excel(file)
	            else:
	                raise ValueError(f"Unsupported file format: {file_name}")

	            # Clean column names. Only real strings are stripped: the
	            # ``.str`` accessor fails on numeric headers and yields NaN for
	            # non-string entries of a mixed header row.
	            return df.rename(
	                columns=lambda col: col.strip() if isinstance(col, str) else col
	            )

	        except Exception as e:
	            raise Exception(f"Error reading file {file_name}: {str(e)}") from e

	    @staticmethod
	    def save_temp_docx(uploaded_file) -> str:
	        """
	        Save uploaded Word document to temporary file and return path

	        The file is created with ``delete=False``, so the caller owns it and
	        is responsible for removing it (see ``cleanup_temp_file``).
	        """
	        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
	            tmp_file.write(uploaded_file.read())
	            return tmp_file.name

	    @staticmethod
	    def cleanup_temp_file(file_path: str):
	        """
	        Clean up temporary file

	        Best effort: a missing or empty path is ignored, and any failure to
	        delete is reported as a printed warning instead of being raised.
	        """
	        try:
	            if file_path and os.path.exists(file_path):
	                os.unlink(file_path)
	        except Exception as e:
	            print(f"Warning: Could not delete temp file {file_path}: {e}")

	class DataValidator:
	    """Utility class for data validation"""

	    @staticmethod
	    def _check_percentage_column(df: pd.DataFrame, column: str,
	                                 warnings: List[str]) -> None:
	        """
	        Append warnings for values of ``column`` that are not valid percentages

	        Does nothing when the column is absent.
	        """
	        if column not in df.columns:
	            return

	        raw_values = df[column]
	        # Coerce first so text cells become NaN instead of raising TypeError
	        # in the comparisons below.
	        numeric_values = pd.to_numeric(raw_values, errors='coerce')

	        out_of_range = int(((numeric_values < 0) | (numeric_values > 100)).sum())
	        if out_of_range:
	            warnings.append(
	                f"{column} should be between 0-100. Found {out_of_range} invalid values."
	            )

	        # Cells that held something but could not be read as a number.
	        non_numeric = int((numeric_values.isna() & raw_values.notna()).sum())
	        if non_numeric:
	            warnings.append(f"{column} contains {non_numeric} non-numeric values.")

	    @staticmethod
	    def validate_supplier_data(df: pd.DataFrame) -> Dict[str, Any]:
	        """
	        Validate supplier data DataFrame and return validation results

	        Returns:
	            A dict with the keys ``is_valid`` (False only when a required
	            column is missing), ``errors``, ``warnings`` (out-of-range or
	            non-numeric rate values) and ``summary`` (row/column counts,
	            duplicate supplier count, missing values per column).
	        """
	        validation_results = {
	            "is_valid": True,
	            "errors": [],
	            "warnings": [],
	            "summary": {}
	        }

	        required_columns = ['Supplier', 'OnTimeDeliveryRate', 'DefectRate']

	        # Check for required columns
	        missing_columns = [col for col in required_columns if col not in df.columns]
	        if missing_columns:
	            validation_results["is_valid"] = False
	            validation_results["errors"].append(f"Missing required columns: {missing_columns}")

	        # Check data types and ranges
	        for rate_column in ('OnTimeDeliveryRate', 'DefectRate'):
	            DataValidator._check_percentage_column(
	                df, rate_column, validation_results["warnings"]
	            )

	        # Summary statistics
	        if 'Supplier' in df.columns:
	            # int() keeps the summary free of numpy scalars
	            duplicate_suppliers = int(df.duplicated(subset=['Supplier']).sum())
	        else:
	            duplicate_suppliers = 0

	        validation_results["summary"] = {
	            "total_rows": len(df),
	            "total_columns": len(df.columns),
	            "duplicate_suppliers": duplicate_suppliers,
	            "missing_values": df.isnull().sum().to_dict()
	        }

	        return validation_results

	    @staticmethod
	    def clean_supplier_data(df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Clean and preprocess supplier data

	        Works on a copy; the input DataFrame is not modified. Values in
	        object columns are converted to stripped strings, missing values
	        stay missing, and repeated suppliers are reduced to their first row.
	        """
	        df_clean = df.copy()

	        # Remove leading/trailing whitespace from string columns
	        string_columns = df_clean.select_dtypes(include=['object']).columns
	        for col in string_columns:
	            original = df_clean[col]
	            stripped = original.astype(str).str.strip()
	            # astype(str) turns missing cells into the text 'nan'/'None';
	            # restore the original missing markers for those cells.
	            df_clean[col] = stripped.where(original.notna(), original)

	        # Re-assert numeric dtype on columns pandas already reports as
	        # numeric (leaves their values unchanged).
	        numeric_columns = df_clean.select_dtypes(include=[np.number]).columns
	        for col in numeric_columns:
	            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

	        # Remove duplicate suppliers (keep first occurrence)
	        if 'Supplier' in df_clean.columns:
	            df_clean = df_clean.drop_duplicates(subset=['Supplier'], keep='first')

	        return df_clean

	class AuditProcessor:
	    """Utility class for processing audit documents"""

	    @staticmethod
	    def extract_text_from_docx(file_path: str) -> str:
	        """
	        Extract text from Word document

	        Returns the text of every paragraph joined with newlines.

	        Raises:
	            Exception: If the document cannot be opened or read. The
	                underlying error is chained as ``__cause__``.
	        """
	        try:
	            doc = Document(file_path)
	            return "\n".join(para.text for para in doc.paragraphs)
	        except Exception as e:
	            raise Exception(f"Error reading document: {str(e)}") from e

	    @staticmethod
	    def extract_supplier_name(content: str, filename: str) -> str:
	        """
	        Extract supplier name from document content or filename

	        The content is searched first, then the filename. The result is
	        always ``"Supplier <number>"`` when a number is found, otherwise
	        ``"Supplier from <filename>"``.
	        """
	        # One case-insensitive pattern covers "SUPPLIER 1S", "Supplier 2",
	        # "supplier 3", etc.; only the number is captured.
	        match = re.search(r'supplier\s+(\d+)', content or "", re.IGNORECASE)
	        if match:
	            return f"Supplier {match.group(1)}"

	        # Fallback to filename parsing (order matters: first pattern wins)
	        filename_patterns = [
	            r'Audit.Report.?(\d+)',  # Audit_Report_1.docx, etc.
	            r'supplier.*?(\d+)',     # supplier_1_audit.docx, etc.
	            r'audit.*?(\d+)',        # audit_1.docx, etc.
	        ]

	        for pattern in filename_patterns:
	            match = re.search(pattern, filename, re.IGNORECASE)
	            if match:
	                return f"Supplier {match.group(1)}"

	        # Final fallback
	        return f"Supplier from {filename}"

	    @staticmethod
	    def count_audit_findings(content: str) -> int:
	        """
	        Count audit findings in document content using multiple approaches

	        1. An explicit "no findings" / "zero findings" statement yields 0.
	        2. Otherwise explicit counts ("2 major findings", "3 findings") are
	           summed.
	        3. If no explicit count is present, finding keywords are counted,
	           with major and critical findings weighted double.

	        Empty or missing content yields 0.
	        """
	        if not content:
	            return 0

	        content_lower = content.lower()

	        # A negative statement overrides any counts found elsewhere in the
	        # text. It is checked on the text itself; testing the pattern source
	        # for 'no' would also fire on the word "minor".
	        if (re.search(r'no\s+findings?\s+observed', content_lower)
	                or re.search(r'zero\s+findings?', content_lower)
	                or 'no findings' in content_lower):
	            return 0

	        # Method 1: Look for explicit numbers
	        count_patterns = (
	            r'(\d+)\s+major\s+finding',
	            r'(\d+)\s+minor\s+finding',
	            r'(\d+)\s+finding',
	        )
	        total_findings = sum(
	            int(number)
	            for pattern in count_patterns
	            for number in re.findall(pattern, content_lower)
	        )

	        # Method 2: Count individual mentions if no explicit numbers found
	        if total_findings == 0:
	            # Major and critical findings count double
	            keyword_weights = {
	                'major finding': 2,
	                'minor finding': 1,
	                'critical finding': 2,
	                'observation': 1,
	                'non-conformance': 1,
	                'deficiency': 1,
	            }
	            total_findings = sum(
	                content_lower.count(keyword) * weight
	                for keyword, weight in keyword_weights.items()
	            )

	        return total_findings

	    @staticmethod
	    def extract_audit_metadata(content: str) -> Dict[str, Any]:
	        """
	        Extract metadata from audit document

	        Only ``audit_type``, ``location`` and ``iso_standard`` are derived
	        from the text; ``auditor`` and ``date`` are always returned as
	        ``'Unknown'``.
	        """
	        content = content or ""

	        metadata = {
	            'audit_type': 'Unknown',
	            'location': 'Unknown',
	            'auditor': 'Unknown',
	            'date': 'Unknown',
	            'iso_standard': None
	        }

	        # Extract audit type
	        if re.search(r'surveillance\s+audit', content, re.IGNORECASE):
	            metadata['audit_type'] = 'Surveillance'
	        elif re.search(r'certification\s+audit', content, re.IGNORECASE):
	            metadata['audit_type'] = 'Certification'

	        # Extract location (everything after "located in" up to the next period)
	        location_match = re.search(r'located\s+in\s+([^.]+)', content, re.IGNORECASE)
	        if location_match:
	            metadata['location'] = location_match.group(1).strip()

	        # Extract ISO standard, e.g. "ISO 9001:2015" -> "9001:2015"
	        iso_match = re.search(r'ISO\s+(\d+:\d+)', content)
	        if iso_match:
	            metadata['iso_standard'] = iso_match.group(1)

	        return metadata

	class ChartGenerator:
	    """Utility class for generating charts and visualizations"""

	    @staticmethod
	    def _save_figure(fig) -> str:
	        """
	        Write ``fig`` to a new temporary PNG file and return the file path

	        The file is not deleted automatically; the caller owns it.
	        """
	        # NamedTemporaryFile creates the file up front, so the name cannot be
	        # claimed by another process first (tempfile.mktemp, which only
	        # returns a name, is deprecated for that reason).
	        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
	            chart_path = tmp_file.name

	        try:
	            fig.savefig(chart_path, dpi=300, bbox_inches='tight')
	        except Exception:
	            # Do not leave an empty placeholder behind when saving fails
	            os.unlink(chart_path)
	            raise

	        return chart_path

	    @staticmethod
	    def create_performance_chart(scores: Dict[str, float],
	                                 title: str = "Supplier Performance Scores",
	                                 figsize: tuple = (10, 6)) -> str:
	        """
	        Create a performance chart and return the file path

	        Args:
	            scores: Mapping of supplier name to score; bars are drawn in
	                descending score order.
	            title: Chart title.
	            figsize: Figure size passed to matplotlib.

	        Returns:
	            Path of the PNG file the chart was saved to.
	        """
	        fig = plt.figure(figsize=figsize)
	        try:
	            # Sort scores for better visualization
	            sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

	            bars = plt.bar(list(sorted_scores.keys()), list(sorted_scores.values()),
	                           color='skyblue', edgecolor='navy', alpha=0.7)

	            # Add value labels on bars
	            for bar in bars:
	                height = bar.get_height()
	                plt.text(bar.get_x() + bar.get_width()/2., height + 1,
	                         f'{height:.1f}', ha='center', va='bottom', fontweight='bold')

	            plt.title(title, fontsize=16, fontweight='bold')
	            plt.ylabel("Performance Score", fontsize=12)
	            plt.xlabel("Suppliers", fontsize=12)
	            plt.xticks(rotation=45, ha='right')
	            plt.grid(axis='y', alpha=0.3)
	            plt.tight_layout()

	            # Save to temporary file
	            return ChartGenerator._save_figure(fig)
	        finally:
	            # Always release the figure, even when plotting or saving fails
	            plt.close(fig)

	    @staticmethod
	    def create_distribution_chart(data: Dict[str, Any],
	                                  title: str = "Distribution Analysis") -> str:
	        """
	        Create distribution charts for various metrics

	        Template only: a titled 2x2 grid of empty axes is saved and ``data``
	        is not used yet.

	        Returns:
	            Path of the PNG file the chart was saved to.
	        """
	        fig, axes = plt.subplots(2, 2, figsize=(12, 8))
	        try:
	            fig.suptitle(title, fontsize=16, fontweight='bold')

	            # Example: You can expand this based on your data structure
	            # This is a template that can be customized

	            return ChartGenerator._save_figure(fig)
	        finally:
	            plt.close(fig)

	class ScoreCalculator:
	    """Utility class for calculating various performance scores"""

	    @staticmethod
	    def calculate_weighted_score(metrics: Dict[str, float],
	                                 weights: Dict[str, float]) -> float:
	        """
	        Calculate weighted performance score

	        Only metrics that have an entry in ``weights`` contribute. The result
	        is the weighted sum divided by the sum of the weights used.

	        Returns:
	            The weighted average, or 0.0 when the weights used do not sum to
	            a positive number (including when no metric has a weight).
	        """
	        total_score = 0.0
	        total_weight = 0.0

	        for metric, value in metrics.items():
	            if metric in weights:
	                weight = weights[metric]
	                total_score += value * weight
	                total_weight += weight

	        return total_score / total_weight if total_weight > 0 else 0.0

	    @staticmethod
	    def normalize_score(score: float, min_val: float = 0, max_val: float = 100) -> float:
	        """
	        Clamp score to the range ``[min_val, max_val]``

	        Values inside the range are returned unchanged (no rescaling is
	        performed). NaN is returned as NaN.
	        """
	        # NaN is the only value not equal to itself. Without this guard
	        # min(max_val, nan) evaluates to max_val, so a missing score would
	        # come back as the best possible score.
	        if score != score:
	            return score

	        return max(min_val, min(max_val, score))

	    @staticmethod
	    def calculate_performance_tier(score: float) -> str:
	        """
	        Determine performance tier based on score

	        Thresholds: 90+ Excellent, 80+ Good, 70+ Satisfactory,
	        60+ Needs Improvement, anything else Poor.
	        """
	        if score >= 90:
	            return "Excellent"
	        elif score >= 80:
	            return "Good"
	        elif score >= 70:
	            return "Satisfactory"
	        elif score >= 60:
	            return "Needs Improvement"
	        else:
	            return "Poor"

	class ReportFormatter:
	    """Utility class for formatting reports and outputs"""

	    @staticmethod
	    def format_score_table(scores: Dict[str, float]) -> pd.DataFrame:
	        """
	        Build a ranked score table from a supplier -> score mapping

	        Rows are ordered from highest to lowest score. The result has the
	        columns Rank, Supplier, Score (rounded to 2 decimals) and
	        Performance Tier (derived from the unrounded score).
	        """
	        column_order = ['Rank', 'Supplier', 'Score', 'Performance Tier']

	        table = pd.DataFrame(list(scores.items()), columns=['Supplier', 'Score'])
	        table = table.sort_values('Score', ascending=False)

	        row_count = len(table)
	        table['Rank'] = range(1, row_count + 1)

	        # Tier is computed before rounding so it reflects the exact score
	        tiers = table['Score'].apply(ScoreCalculator.calculate_performance_tier)
	        table['Performance Tier'] = tiers
	        table['Score'] = table['Score'].round(2)

	        return table[column_order]

	    @staticmethod
	    def format_findings_summary(findings: Dict[str, int]) -> pd.DataFrame:
	        """
	        Build a findings table from a supplier -> findings count mapping

	        Adds a Risk Level column (more than 3 findings: High, more than 1:
	        Medium, otherwise Low) and orders rows by findings count, highest
	        first.
	        """
	        def risk_level(count):
	            if count > 3:
	                return 'High'
	            if count > 1:
	                return 'Medium'
	            return 'Low'

	        summary = pd.DataFrame(list(findings.items()),
	                               columns=['Supplier', 'Findings Count'])
	        summary['Risk Level'] = summary['Findings Count'].apply(risk_level)

	        ordered = summary.sort_values('Findings Count', ascending=False)
	        return ordered

	# Helper functions for common operations
	def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
	    """
	    Divide numerator by denominator, falling back to default for a zero denominator
	    """
	    if denominator == 0:
	        return default

	    return numerator / denominator

	def extract_numeric_value(text: str, default: float = 0.0) -> float:
	    """
	    Extract numeric value from text string

	    Returns the first number found in ``text`` as a float. A minus sign
	    directly in front of the number is honoured unless it follows a letter,
	    digit, underscore or dot (where it is treated as a separator, as in
	    "item-5").

	    Args:
	        text: Value to parse; non-string values are converted with ``str``.
	        default: Returned for missing values, infinite floats, and text
	            without any digits.
	    """
	    if pd.isna(text):
	        return default

	    # A float needs no parsing, and str() would mangle scientific notation
	    # (str(1e-05) is '1e-05', which would parse as 1.0).
	    if isinstance(text, float):
	        if text in (float('inf'), float('-inf')):
	            return default
	        return float(text)

	    # Try to extract number from string
	    match = re.search(r'(?:(?<![\w.])-)?\d+\.?\d*', str(text))
	    if match:
	        return float(match.group())

	    return default

	def clean_text(text: str) -> str:
	    """
	    Return text trimmed, with every whitespace run collapsed to one space

	    Missing values (as judged by ``pd.isna``) become an empty string.
	    """
	    if pd.isna(text):
	        return ""

	    trimmed = str(text).strip()
	    # Collapse runs of whitespace (spaces, tabs, newlines) to a single space
	    return re.sub(r'\s+', ' ', trimmed)