Spaces:

vn6295337
/

Instant-SWOT-Agent

Sleeping

App Files Files Community

Instant-SWOT-Agent / src /services /swot_parser.py

vn6295337

Initial commit: Instant SWOT Agent

0c591a7 about 1 month ago

raw

history blame contribute delete

4.42 kB

	"""
	SWOT text parsing service.
	Extracts structured SWOT data from markdown/text reports.
	"""

	import re


	def parse_swot_text(text: str) -> dict:
	    """
	    Parse SWOT text into structured sections.

	    The text is scanned line by line. A line is treated as a section header
	    when, after lowercasing and removing the characters ``#*_:[]()``, it is
	    shorter than 50 characters and contains one of the stems ``strength``,
	    ``weakness``, ``opportunit`` or ``threat`` (checked in that order).
	    Every following non-header line is assigned to that section until the
	    next header is seen. Lines before the first header are ignored.

	    Args:
	        text: Raw SWOT analysis text with sections marked by headers

	    Returns:
	        Dictionary with keys: strengths, weaknesses, opportunities, threats
	        Each containing a list of bullet points (list markers removed)
	    """
	    # (keyword stem, section key); order defines precedence when a line
	    # contains more than one stem.
	    section_keywords = (
	        ("strength", "strengths"),
	        ("weakness", "weaknesses"),
	        ("opportunit", "opportunities"),
	        ("threat", "threats"),
	    )
	    sections = {section: [] for _, section in section_keywords}

	    # Bullets ("-", "*", "•") or numbered items ("1.", "2)"). Whitespace is
	    # required after the marker so that "**bold** text" or "1.5 billion"
	    # are not mistaken for list items.
	    bullet_pattern = re.compile(r'^\s*[-*•]\s+(.+)$|^\s*\d+[.)]\s+(.+)$')

	    current_section = None

	    for raw_line in text.split('\n'):
	        line = raw_line.strip()
	        if not line:
	            continue

	        # Remove markdown formatting before looking for a header keyword.
	        clean_lower = re.sub(r'[#*_:\[\]()]', '', line.lower()).strip()

	        header_keyword = None
	        if len(clean_lower) < 50:
	            for keyword, section in section_keywords:
	                if keyword in clean_lower:
	                    header_keyword = keyword
	                    current_section = section
	                    break

	        if header_keyword is not None:
	            # A header may carry its first item on the same line.
	            after_header = _extract_after_header(line, header_keyword)
	            if after_header:
	                sections[current_section].append(after_header)
	            continue

	        if current_section is None:
	            # Content before the first recognised header belongs nowhere.
	            continue

	        match = bullet_pattern.match(line)
	        if match:
	            # Exactly one of the two alternatives captured the item text.
	            item = (match.group(1) or match.group(2) or '').strip()
	            # Ignore punctuation-only "items" such as the "* * *" rule.
	            if any(ch.isalnum() for ch in item):
	                sections[current_section].append(item)
	        elif not _is_header_line(line) and len(line) > 10:
	            # Plain text that is neither a header nor too short to be content.
	            sections[current_section].append(line)

	    return sections


	def _extract_after_header(line: str, keyword: str) -> str:
	"""Extract content that appears after a section header on the same line."""
	# Find where the keyword ends and check for content after
	lower = line.lower()
	idx = lower.find(keyword)
	if idx == -1:
	return ""

	# Find end of the header word
	end_idx = idx + len(keyword)
	# Skip past any trailing 's', 'es', 'ies' for plurals
	while end_idx < len(line) and line[end_idx].isalpha():
	end_idx += 1

	# Get remainder and clean it
	remainder = line[end_idx:].strip()
	# Remove common separators: :, -, etc.
	remainder = re.sub(r'^[:\-–—\s]+', '', remainder).strip()
	# Remove markdown formatting
	remainder = re.sub(r'^[#_]+\s', '', remainder).strip()

	# If there's substantial content, return it
	if len(remainder) > 10 and not remainder.lower().startswith(('strength', 'weakness', 'opportunit', 'threat')):
	return remainder
	return ""


	def _is_header_line(line: str) -> bool:
	"""Check if a line appears to be a header rather than content."""
	# Lines that are mostly formatting or very short are likely headers
	clean = re.sub(r'[#*_:\-–—\[\]()]', '', line).strip()
	if len(clean) < 5:
	return True
	# Lines ending with : are often headers
	if line.rstrip().endswith(':'):
	return True
	return False