chenemii committed on
Commit
4773d5a
·
1 Parent(s): 7c0b2f1

Fix strengths parsing: ensure full sentences and limit to exactly three

Browse files
Files changed (1) hide show
  1. app/models/llm_analyzer.py +28 -3
app/models/llm_analyzer.py CHANGED
@@ -654,9 +654,34 @@ def parse_and_format_analysis(raw_analysis):
654
  strengths_match = re.search(r'\*\*Strengths\*\*\s*(.*?)(?=\*\*Areas for Improvement\*\*|\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)
655
  if strengths_match:
656
  strengths_text = strengths_match.group(1)
657
- # Extract numbered items (1. Topic: Description)
658
- strength_items = re.findall(r'\d+\.\s*([^:]+):\s*([^\n\d]+)', strengths_text)
659
- formatted_analysis['strengths'] = [f"{topic.strip()}: {desc.strip()}" for topic, desc in strength_items if topic.strip() and desc.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
660
 
661
  # Extract areas for improvement using the new structured format
662
  weaknesses_match = re.search(r'\*\*Areas for Improvement\*\*\s*(.*?)(?=\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)
 
654
  strengths_match = re.search(r'\*\*Strengths\*\*\s*(.*?)(?=\*\*Areas for Improvement\*\*|\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)
655
  if strengths_match:
656
  strengths_text = strengths_match.group(1)
657
+ # Extract numbered items allowing digits/percentages in description until next numbered item
658
+ # Pattern: 1. Topic: Sentence.
659
+ strength_items = re.findall(r'\n?\s*\d+\.\s*([^:]+):\s*([\s\S]*?)(?=(?:\n\s*\d+\.|\n\s*\*\*|$))', strengths_text)
660
+ cleaned_strengths = []
661
+ for topic, desc in strength_items:
662
+ topic_clean = topic.strip()
663
+ # Take first full sentence from desc
664
+ desc_text = desc.strip().replace('\n', ' ')
665
+ # Ensure we end at the first period or em dash break
666
+ sentence_match = re.match(r'(.+?\.)', desc_text)
667
+ if sentence_match:
668
+ desc_clean = sentence_match.group(1).strip()
669
+ else:
670
+ # Fallback: up to 180 chars
671
+ desc_clean = desc_text[:180].strip()
672
+ if topic_clean and desc_clean:
673
+ cleaned_strengths.append(f"{topic_clean}: {desc_clean}")
674
+ # Ensure exactly three strengths
675
+ formatted_analysis['strengths'] = cleaned_strengths[:3]
676
+ # If fewer than three were parsed, fall back to any colon-style lines present
677
+ if len(formatted_analysis['strengths']) < 3:
678
+ fallback_items = re.findall(r'^[\-•]?\s*([^:\n]+):\s*(.+)$', strengths_text, re.MULTILINE)
679
+ for topic, desc in fallback_items:
680
+ item = f"{topic.strip()}: {desc.strip()}"
681
+ if item not in formatted_analysis['strengths']:
682
+ formatted_analysis['strengths'].append(item)
683
+ if len(formatted_analysis['strengths']) >= 3:
684
+ break
685
 
686
  # Extract areas for improvement using the new structured format
687
  weaknesses_match = re.search(r'\*\*Areas for Improvement\*\*\s*(.*?)(?=\*\*Practice Tips\*\*|$)', raw_analysis, re.IGNORECASE | re.DOTALL)