File size: 4,419 Bytes
0c591a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
SWOT text parsing service.
Extracts structured SWOT data from markdown/text reports.
"""

import re


def parse_swot_text(text: str) -> dict:
    """
    Parse SWOT text into structured sections.

    The text is scanned line by line. A header line selects the current
    section; every following bullet or sufficiently long plain-text line is
    appended to that section until the next header. Lines that appear before
    the first header are ignored.

    Header detection:
        * Non-list lines (markdown headings, bold lines, plain lines) are
          headers when they are shorter than 50 characters (after markdown
          punctuation is removed) and contain one of the stems "strength",
          "weakness", "opportunit" or "threat". When several stems are
          present the first one in that order wins.
        * List items ("- x", "* x", "• x", "1. x", "1) x") are headers only
          when the item text *starts* with the section word and that word is
          followed by a separator, closing markup or the end of the item
          (e.g. "- **Strengths**: ..." or "1. Threats"). This keeps ordinary
          items such as "- Limited pricing strength" or
          "- Threat of new entrants" from being mistaken for headers.

    Args:
        text: Raw SWOT analysis text with sections marked by headers.
            Empty or None input yields empty sections.

    Returns:
        Dictionary with keys: strengths, weaknesses, opportunities, threats
        Each containing a list of bullet points
    """
    sections = {
        "strengths": [],
        "weaknesses": [],
        "opportunities": [],
        "threats": []
    }

    if not text:
        return sections

    # Keyword stem -> section key. Insertion order is the detection priority
    # for lines that mention more than one section word.
    stems = {
        "strength": "strengths",
        "weakness": "weaknesses",
        "opportunit": "opportunities",
        "threat": "threats",
    }

    # Bullet formats: -, *, • and numbered lists (1., 2) ...). The negative
    # lookahead rejects a marker that is immediately followed by another
    # marker character, so horizontal rules ("---", "***") and bold text
    # ("**text**") are not misread as bullets and stored with part of their
    # markup chopped off.
    bullet_pattern = re.compile(
        r'^\s*[-*•](?![-*•])\s*(.+)$|^\s*\d+[.)]\s*(.+)$'
    )

    # A list item is a header only if, after optional leading markup, it
    # begins with a section word that is followed by end-of-text, a
    # separator, or closing markup / an opening parenthesis.
    list_header_pattern = re.compile(
        r'^[\s#*_\[(]*(strength|weakness|opportunit|threat)[a-z]*'
        r'\s*(?:$|[:\-–—*_\])(])',
        re.IGNORECASE,
    )

    current_section = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        bullet = bullet_pattern.match(line)
        item = ''
        if bullet:
            # Exactly one of the two groups is populated.
            item = (bullet.group(1) or bullet.group(2) or '').strip()

        # Strip markdown punctuation so the length limit measures the
        # visible text rather than the formatting.
        clean_lower = re.sub(r'[#*_:\[\]()]', '', line.lower()).strip()

        header_stem = None
        if len(clean_lower) < 50:
            if bullet:
                header = list_header_pattern.match(item)
                if header:
                    header_stem = header.group(1).lower()
            else:
                header_stem = next(
                    (stem for stem in stems if stem in clean_lower), None
                )

        if header_stem is not None:
            current_section = stems[header_stem]
            # A header may carry its first item on the same line,
            # e.g. "Strengths: Strong brand recognition".
            after_header = _extract_after_header(line, header_stem)
            if after_header:
                sections[current_section].append(after_header)
            continue

        if current_section is None:
            # Content before the first header belongs to no section.
            continue

        if bullet:
            if item:
                sections[current_section].append(item)
        elif not _is_header_line(line) and len(line) > 10:
            # Plain text that is neither a header-like line nor too short
            # to be meaningful content.
            sections[current_section].append(line)

    return sections


def _extract_after_header(line: str, keyword: str) -> str:
    """Extract content that appears after a section header on the same line."""
    # Find where the keyword ends and check for content after
    lower = line.lower()
    idx = lower.find(keyword)
    if idx == -1:
        return ""

    # Find end of the header word
    end_idx = idx + len(keyword)
    # Skip past any trailing 's', 'es', 'ies' for plurals
    while end_idx < len(line) and line[end_idx].isalpha():
        end_idx += 1

    # Get remainder and clean it
    remainder = line[end_idx:].strip()
    # Remove common separators: :, -, etc.
    remainder = re.sub(r'^[:\-–—\s]+', '', remainder).strip()
    # Remove markdown formatting
    remainder = re.sub(r'^[#*_]+\s*', '', remainder).strip()

    # If there's substantial content, return it
    if len(remainder) > 10 and not remainder.lower().startswith(('strength', 'weakness', 'opportunit', 'threat')):
        return remainder
    return ""


def _is_header_line(line: str) -> bool:
    """Check if a line appears to be a header rather than content."""
    # Lines that are mostly formatting or very short are likely headers
    clean = re.sub(r'[#*_:\-–—\[\]()]', '', line).strip()
    if len(clean) < 5:
        return True
    # Lines ending with : are often headers
    if line.rstrip().endswith(':'):
        return True
    return False