# vn6295337's picture
# Initial commit: Instant SWOT Agent
# 0c591a7
"""
SWOT text parsing service.
Extracts structured SWOT data from markdown/text reports.
"""
import re
def parse_swot_text(text: str) -> dict:
    """
    Parse SWOT text into structured sections.

    The text is scanned line by line. A line that mentions one of the section
    keywords and is shorter than 50 characters (once markdown punctuation is
    removed) switches the current section. Following lines are collected
    under that section when they are bullets, numbered items, or plain text
    longer than 10 characters that does not look like a header. Lines seen
    before the first section header are ignored.

    Args:
        text: Raw SWOT analysis text with sections marked by headers

    Returns:
        Dictionary with keys: strengths, weaknesses, opportunities, threats
        Each containing a list of bullet points
    """
    sections = {
        "strengths": [],
        "weaknesses": [],
        "opportunities": [],
        "threats": [],
    }

    # (keyword stem, section key) pairs; order matters because the first stem
    # found in a header line decides the section.
    header_keywords = (
        ("strength", "strengths"),
        ("weakness", "weaknesses"),
        ("opportunit", "opportunities"),
        ("threat", "threats"),
    )

    # Markdown punctuation removed before header detection (##, **, :, etc.)
    header_noise = re.compile(r'[#*_:\[\]()]')

    # Bullet formats: -, *, the bullet glyph U+2022, and numbered lists
    # (1., 2), ...). The glyph is written as an escape so the pattern does
    # not depend on the file's encoding surviving a round trip.
    bullet_pattern = re.compile(r'^\s*[-*\u2022]\s*(.+)$|^\s*\d+[.)]\s*(.+)$')

    current_section = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Clean the line of markdown formatting for header detection
        clean_lower = header_noise.sub('', line.lower()).strip()

        # Only short lines are considered headers; long lines that merely
        # mention a keyword are treated as content.
        section_hit = None
        if len(clean_lower) < 50:
            for keyword, section_key in header_keywords:
                if keyword in clean_lower:
                    section_hit = (keyword, section_key)
                    break

        if section_hit is not None:
            keyword, current_section = section_hit
            # Keep any content that follows the header on the same line
            after_header = _extract_after_header(line, keyword)
            if after_header:
                sections[current_section].append(after_header)
            continue

        # Content before the first header has no section to belong to
        if current_section is None:
            continue

        match = bullet_pattern.match(line)
        if match:
            # Group 1 is the symbol-bullet text, group 2 the numbered-item text
            item = match.group(1) or match.group(2)
            if item and item.strip():
                sections[current_section].append(item.strip())
        elif not _is_header_line(line) and len(line) > 10:
            # Plain text that is not a header and not too short: keep as content
            sections[current_section].append(line)

    return sections
def _extract_after_header(line: str, keyword: str) -> str:
"""Extract content that appears after a section header on the same line."""
# Find where the keyword ends and check for content after
lower = line.lower()
idx = lower.find(keyword)
if idx == -1:
return ""
# Find end of the header word
end_idx = idx + len(keyword)
# Skip past any trailing 's', 'es', 'ies' for plurals
while end_idx < len(line) and line[end_idx].isalpha():
end_idx += 1
# Get remainder and clean it
remainder = line[end_idx:].strip()
# Remove common separators: :, -, etc.
remainder = re.sub(r'^[:\-–—\s]+', '', remainder).strip()
# Remove markdown formatting
remainder = re.sub(r'^[#*_]+\s*', '', remainder).strip()
# If there's substantial content, return it
if len(remainder) > 10 and not remainder.lower().startswith(('strength', 'weakness', 'opportunit', 'threat')):
return remainder
return ""
def _is_header_line(line: str) -> bool:
"""Check if a line appears to be a header rather than content."""
# Lines that are mostly formatting or very short are likely headers
clean = re.sub(r'[#*_:\-–—\[\]()]', '', line).strip()
if len(clean) < 5:
return True
# Lines ending with : are often headers
if line.rstrip().endswith(':'):
return True
return False