Spaces:
Paused
Paused
Fix strengths parsing: ensure full sentences and limit to exactly three
Browse files- app/models/llm_analyzer.py +28 -3
app/models/llm_analyzer.py
CHANGED
|
@@ -654,9 +654,34 @@ def parse_and_format_analysis(raw_analysis):
|
|
| 654 |
strengths_match = re.search(r'\*\*Strengths\*\*\s*(.*?)(?=\*\*Areas for Improvement\*\*|\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)
|
| 655 |
if strengths_match:
|
| 656 |
strengths_text = strengths_match.group(1)
|
| 657 |
-
# Extract numbered items
|
| 658 |
-
|
| 659 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
|
| 661 |
# Extract areas for improvement using the new structured format
|
| 662 |
weaknesses_match = re.search(r'\*\*Areas for Improvement\*\*\s*(.*?)(?=\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)
|
|
|
|
| 654 |
strengths_match = re.search(r'\*\*Strengths\*\*\s*(.*?)(?=\*\*Areas for Improvement\*\*|\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)
|
| 655 |
if strengths_match:
|
| 656 |
strengths_text = strengths_match.group(1)
|
| 657 |
+
# Extract numbered items allowing digits/percentages in description until next numbered item
|
| 658 |
+
# Pattern: 1. Topic: Sentence.
|
| 659 |
+
strength_items = re.findall(r'\n?\s*\d+\.\s*([^:]+):\s*([\s\S]*?)(?=(?:\n\s*\d+\.|\n\s*\*\*|$))', strengths_text)
|
| 660 |
+
cleaned_strengths = []
|
| 661 |
+
for topic, desc in strength_items:
|
| 662 |
+
topic_clean = topic.strip()
|
| 663 |
+
# Take first full sentence from desc
|
| 664 |
+
desc_text = desc.strip().replace('\n', ' ')
|
| 665 |
+
# End at the first period (em dashes are not handled; a decimal such as "95.5%" is also cut at its dot)
|
| 666 |
+
sentence_match = re.match(r'(.+?\.)', desc_text)
|
| 667 |
+
if sentence_match:
|
| 668 |
+
desc_clean = sentence_match.group(1).strip()
|
| 669 |
+
else:
|
| 670 |
+
# Fallback: up to 180 chars
|
| 671 |
+
desc_clean = desc_text[:180].strip()
|
| 672 |
+
if topic_clean and desc_clean:
|
| 673 |
+
cleaned_strengths.append(f"{topic_clean}: {desc_clean}")
|
| 674 |
+
# Keep at most three strengths; the fallback below tries to fill any shortfall
|
| 675 |
+
formatted_analysis['strengths'] = cleaned_strengths[:3]
|
| 676 |
+
# If fewer than three were parsed, fall back to any colon-style lines present (numbered lines match too, keeping their "N." prefix in the topic)
|
| 677 |
+
if len(formatted_analysis['strengths']) < 3:
|
| 678 |
+
fallback_items = re.findall(r'^[\-•]?\s*([^:\n]+):\s*(.+)$', strengths_text, re.MULTILINE)
|
| 679 |
+
for topic, desc in fallback_items:
|
| 680 |
+
item = f"{topic.strip()}: {desc.strip()}"
|
| 681 |
+
if item not in formatted_analysis['strengths']:
|
| 682 |
+
formatted_analysis['strengths'].append(item)
|
| 683 |
+
if len(formatted_analysis['strengths']) >= 3:
|
| 684 |
+
break
|
| 685 |
|
| 686 |
# Extract areas for improvement using the new structured format
|
| 687 |
weaknesses_match = re.search(r'\*\*Areas for Improvement\*\*\s*(.*?)(?=\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)
|