Spaces:
Sleeping
Sleeping
File size: 4,419 Bytes
0c591a7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
"""
SWOT text parsing service.
Extracts structured SWOT data from markdown/text reports.
"""
import re
def parse_swot_text(text: str) -> dict:
    """
    Split a SWOT report into its four sections.

    The text is scanned line by line. A line selects the active section when
    its markup-stripped, lower-cased form is shorter than 50 characters and
    contains one of the keyword stems below. Every later line is then filed
    under that section, either as the payload of a bullet / numbered item or,
    for longer plain lines that do not look like headers, verbatim.

    Args:
        text: Raw SWOT analysis text with sections marked by headers

    Returns:
        Dictionary with keys: strengths, weaknesses, opportunities, threats
        Each containing a list of bullet points
    """
    # Keyword stem -> section key. The order is significant: the first stem
    # found in a line wins, so "strength" beats "weakness" on a shared line.
    header_stems = (
        ("strength", "strengths"),
        ("weakness", "weaknesses"),
        ("opportunit", "opportunities"),
        ("threat", "threats"),
    )
    result = {section: [] for _, section in header_stems}

    # Bullet formats: -, *, •, and numbered lists such as "1." or "2)".
    bullet_re = re.compile(r'^[\s]*[-*•]\s*(.+)$|^[\s]*\d+[.)]\s*(.+)$')
    # Markdown / punctuation removed before looking for a header keyword.
    markup_re = re.compile(r'[#*_:\[\]()]')

    active = None
    for raw in text.split('\n'):
        stripped = raw.strip()
        if not stripped:
            continue

        # NOTE: this is a heuristic. Any short line containing a stem is
        # treated as a header, including bullet items that merely mention
        # the word (e.g. "- Threat detection is slow").
        bare = markup_re.sub('', stripped.lower()).strip()
        matched_header = None
        if len(bare) < 50:
            matched_header = next(
                (pair for pair in header_stems if pair[0] in bare),
                None,
            )

        if matched_header is not None:
            stem, active = matched_header
            # A header may carry its first item on the same line.
            trailing = _extract_after_header(stripped, stem)
            if trailing:
                result[active].append(trailing)
            continue

        # Content before the first recognised header is discarded.
        if active is None:
            continue

        found = bullet_re.match(stripped)
        if found is None:
            # Plain text: keep it only if it is not header-like and is long
            # enough to look like real content.
            if not _is_header_line(stripped) and len(stripped) > 10:
                result[active].append(stripped)
            continue

        # Exactly one of the two alternatives captured the item text.
        entry = found.group(1) or found.group(2)
        if entry and entry.strip():
            result[active].append(entry.strip())

    return result
def _extract_after_header(line: str, keyword: str) -> str:
"""Extract content that appears after a section header on the same line."""
# Find where the keyword ends and check for content after
lower = line.lower()
idx = lower.find(keyword)
if idx == -1:
return ""
# Find end of the header word
end_idx = idx + len(keyword)
# Skip past any trailing 's', 'es', 'ies' for plurals
while end_idx < len(line) and line[end_idx].isalpha():
end_idx += 1
# Get remainder and clean it
remainder = line[end_idx:].strip()
# Remove common separators: :, -, etc.
remainder = re.sub(r'^[:\-–—\s]+', '', remainder).strip()
# Remove markdown formatting
remainder = re.sub(r'^[#*_]+\s*', '', remainder).strip()
# If there's substantial content, return it
if len(remainder) > 10 and not remainder.lower().startswith(('strength', 'weakness', 'opportunit', 'threat')):
return remainder
return ""
def _is_header_line(line: str) -> bool:
"""Check if a line appears to be a header rather than content."""
# Lines that are mostly formatting or very short are likely headers
clean = re.sub(r'[#*_:\-–—\[\]()]', '', line).strip()
if len(clean) < 5:
return True
# Lines ending with : are often headers
if line.rstrip().endswith(':'):
return True
return False
|