| | """ |
| | String processing pipeline functions for testing function analysis. |
| | """ |
| |
|
| | import re |
| | from typing import List |
| |
|
| |
|
def normalize_whitespace(text):
    """Collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: Input string; may contain tabs, newlines, and repeated spaces.

    Returns:
        The cleaned string with single spaces and no leading/trailing blanks.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
| |
|
| |
|
def remove_special_characters(text, keep_chars=""):
    """Strip everything except letters, digits, whitespace, and *keep_chars*.

    Args:
        text: Input string.
        keep_chars: Extra characters to preserve (escaped before use, so
            regex metacharacters are safe to pass).

    Returns:
        The filtered string.
    """
    allowed = re.escape(keep_chars)
    cleaner = re.compile(fr"[^a-zA-Z0-9\s{allowed}]")
    return cleaner.sub('', text)
| |
|
| |
|
def convert_to_lowercase(text):
    """Return *text* with every cased character lowered."""
    lowered = text.lower()
    return lowered
| |
|
| |
|
def remove_stopwords(text, stopwords=None):
    """Drop common English stopwords from whitespace-split *text*.

    Matching is case-insensitive (each word is lowered before the lookup),
    but surviving words keep their original casing.

    Args:
        text: Input string.
        stopwords: Optional collection of lowercase words to remove;
            defaults to a built-in set of common English function words.

    Returns:
        The remaining words joined by single spaces.
    """
    if stopwords is None:
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must'
        }

    kept = (token for token in text.split() if token.lower() not in stopwords)
    return ' '.join(kept)
| |
|
| |
|
def extract_keywords(text, min_length=3):
    """Extract keywords: words with at least *min_length* characters.

    Note: the comparison is inclusive (``len(word) >= min_length``), so a
    word exactly ``min_length`` characters long is kept.

    Args:
        text: Input string, split on whitespace.
        min_length: Minimum word length to qualify as a keyword.

    Returns:
        List of qualifying words, in their original order and casing.
    """
    words = text.split()
    keywords = [word for word in words if len(word) >= min_length]
    return keywords
| |
|
| |
|
def count_word_frequency(text):
    """Tally how many times each whitespace-separated word occurs.

    Counting is case-sensitive ("Cat" and "cat" are distinct keys).

    Args:
        text: Input string.

    Returns:
        Dict mapping each word to its occurrence count.
    """
    counts = {}
    for token in text.split():
        counts.setdefault(token, 0)
        counts[token] += 1
    return counts
| |
|
| |
|
def capitalize_words(text, exceptions=None):
    """Title-case *text*, leaving small connector words lowercase.

    The first word is always capitalized; later words found in
    *exceptions* (case-insensitive match) are forced to lowercase.

    Args:
        text: Input string, split on whitespace.
        exceptions: Optional set of lowercase words to leave uncapitalized;
            defaults to common English articles/prepositions/conjunctions.

    Returns:
        The re-joined string.
    """
    if exceptions is None:
        exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

    styled = [
        word.capitalize() if idx == 0 or word.lower() not in exceptions
        else word.lower()
        for idx, word in enumerate(text.split())
    ]
    return ' '.join(styled)
| |
|
| |
|
def truncate_text(text, max_length=100, suffix="..."):
    """Truncate *text* to at most *max_length* characters, appending *suffix*.

    If the text already fits, it is returned unchanged. Otherwise it is cut
    so that the result (including the suffix) is *max_length* characters,
    preferring to break at the last space when that space falls late enough
    in the string (past 80% of *max_length*) to avoid chopping off most of
    the content.

    Fix: the slice bound is clamped at 0. Previously, when
    ``max_length <= len(suffix)`` the expression ``max_length - len(suffix)``
    went negative and sliced from the *end* of the string, producing output
    much longer than ``max_length``.

    Args:
        text: Input string.
        max_length: Maximum length of the returned string (suffix included).
        suffix: Marker appended to truncated output.

    Returns:
        The original or truncated string.
    """
    if len(text) <= max_length:
        return text

    # Clamp at 0 so a tiny max_length never yields a negative slice index.
    truncated = text[:max(0, max_length - len(suffix))]

    # Prefer a word boundary, but only when the last space is late enough
    # that we keep most of the allowed length.
    last_space = truncated.rfind(' ')
    if last_space > max_length * 0.8:
        truncated = truncated[:last_space]

    return truncated + suffix
| |
|
| |
|
| | def text_processing_pipeline(text, operations=None): |
| | """Process text through a pipeline of operations.""" |
| | if operations is None: |
| | operations = [ |
| | 'normalize_whitespace', |
| | 'remove_special_characters', |
| | 'convert_to_lowercase', |
| | 'remove_stopwords' |
| | ] |
| | |
| | |
| | operation_map = { |
| | 'normalize_whitespace': normalize_whitespace, |
| | 'remove_special_characters': remove_special_characters, |
| | 'convert_to_lowercase': convert_to_lowercase, |
| | 'remove_stopwords': remove_stopwords, |
| | 'capitalize_words': capitalize_words, |
| | 'truncate_text': truncate_text |
| | } |
| | |
| | result = text |
| | processing_steps = [] |
| | |
| | for operation in operations: |
| | if operation in operation_map: |
| | before = result |
| | result = operation_map[operation](result) |
| | processing_steps.append({ |
| | 'operation': operation, |
| | 'before': before[:50] + "..." if len(before) > 50 else before, |
| | 'after': result[:50] + "..." if len(result) > 50 else result |
| | }) |
| | |
| | return result, processing_steps |
| |
|
| |
|
def analyze_text_statistics(text):
    """Compute simple descriptive statistics for *text*.

    Sentences are counted as runs of terminal punctuation (. ! ?), so
    "Wow!!!" counts as one sentence. Word metrics fall back to 0 / ""
    when the text contains no words.

    Args:
        text: Input string.

    Returns:
        Dict with character_count, word_count, sentence_count,
        average_word_length, longest_word, and shortest_word.
    """
    words = text.split()
    has_words = bool(words)

    return {
        'character_count': len(text),
        'word_count': len(words),
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': (sum(map(len, words)) / len(words)) if has_words else 0,
        'longest_word': max(words, key=len) if has_words else "",
        'shortest_word': min(words, key=len) if has_words else ""
    }
| |
|
| |
|
if __name__ == "__main__":
    # Demo: run the default pipeline over deliberately messy input and
    # print each intermediate step plus summary statistics.
    sample_text = """
    This is a SAMPLE text with various formatting issues!!!
    It has multiple spaces, special @#$% characters, and
    needs some serious cleaning & processing...
    """

    print("Original text:")
    print(repr(sample_text))

    processed_text, steps = text_processing_pipeline(sample_text)

    print("\nProcessing steps:")
    for step in steps:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")

    print(f"\nFinal result: {processed_text}")

    stats = analyze_text_statistics(processed_text)
    print(f"\nText statistics: {stats}")
| |
|