Spaces:
Sleeping
Sleeping
| """ | |
| SWOT text parsing service. | |
| Extracts structured SWOT data from markdown/text reports. | |
| """ | |
| import re | |
def parse_swot_text(text: str) -> dict:
    """Parse SWOT text into structured sections.

    The text is scanned line by line. A short line (fewer than 50 characters
    once markdown punctuation is removed) that mentions one of the SWOT
    keywords switches the current section; every following bullet, numbered
    item or sufficiently long plain-text line is collected into that section
    until the next header is seen.

    Args:
        text: Raw SWOT analysis text with sections marked by headers.
            Empty or ``None`` input yields four empty sections.

    Returns:
        Dictionary with keys ``strengths``, ``weaknesses``, ``opportunities``
        and ``threats``, each mapping to a list of extracted items.
    """
    sections = {
        "strengths": [],
        "weaknesses": [],
        "opportunities": [],
        "threats": [],
    }
    if not text:
        return sections

    # (keyword stem, section key), checked in this order; the first stem found
    # in a header candidate wins, mirroring strengths > weaknesses >
    # opportunities > threats priority.
    header_keywords = (
        ("strength", "strengths"),
        ("weakness", "weaknesses"),
        ("opportunit", "opportunities"),
        ("threat", "threats"),
    )

    # Bullet formats: "-", "*", the Unicode bullet (U+2022), and numbered
    # lists such as "1." or "2)". The bullet is written as an escape so the
    # pattern survives any re-encoding of this file.
    bullet_pattern = re.compile(
        r'^\s*[-*\u2022]\s*(.+)$|^\s*\d+[.)]\s*(.+)$'
    )

    current_section = None
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Strip markdown punctuation (##, **, :, brackets...) before looking
        # for a header keyword; only short lines can be headers.
        clean_lower = re.sub(r'[#*_:\[\]()]', '', line.lower()).strip()
        header = None
        if len(clean_lower) < 50:
            header = next(
                (pair for pair in header_keywords if pair[0] in clean_lower),
                None,
            )

        if header is not None:
            stem, current_section = header
            # A header may carry its first item on the same line
            # ("Strengths: strong brand ...").
            after_header = _extract_after_header(line, stem)
            if after_header:
                sections[current_section].append(after_header)
            continue

        if current_section is None:
            # Content before the first recognised header is ignored.
            continue

        match = bullet_pattern.match(line)
        if match:
            # Exactly one of the two alternatives captured the item text.
            item = (match.group(1) or match.group(2) or "").strip()
            if item:
                sections[current_section].append(item)
        elif not _is_header_line(line) and len(line) > 10:
            # Plain text that is neither a header nor trivially short is
            # treated as content.
            sections[current_section].append(line)

    return sections
| def _extract_after_header(line: str, keyword: str) -> str: | |
| """Extract content that appears after a section header on the same line.""" | |
| # Find where the keyword ends and check for content after | |
| lower = line.lower() | |
| idx = lower.find(keyword) | |
| if idx == -1: | |
| return "" | |
| # Find end of the header word | |
| end_idx = idx + len(keyword) | |
| # Skip past any trailing 's', 'es', 'ies' for plurals | |
| while end_idx < len(line) and line[end_idx].isalpha(): | |
| end_idx += 1 | |
| # Get remainder and clean it | |
| remainder = line[end_idx:].strip() | |
| # Remove common separators: :, -, etc. | |
| remainder = re.sub(r'^[:\-ββ\s]+', '', remainder).strip() | |
| # Remove markdown formatting | |
| remainder = re.sub(r'^[#*_]+\s*', '', remainder).strip() | |
| # If there's substantial content, return it | |
| if len(remainder) > 10 and not remainder.lower().startswith(('strength', 'weakness', 'opportunit', 'threat')): | |
| return remainder | |
| return "" | |
| def _is_header_line(line: str) -> bool: | |
| """Check if a line appears to be a header rather than content.""" | |
| # Lines that are mostly formatting or very short are likely headers | |
| clean = re.sub(r'[#*_:\-ββ\[\]()]', '', line).strip() | |
| if len(clean) < 5: | |
| return True | |
| # Lines ending with : are often headers | |
| if line.rstrip().endswith(':'): | |
| return True | |
| return False | |