vn6295337 Claude Opus 4.5 committed on
Commit
e3b7287
·
1 Parent(s): 6fd7fcd

fix: Update citation patterns to match new SWOT output format

Browse files

The SWOT output format changed from `VALUE [M##]` to `[M##] Metric: Value`,
but the citation validator patterns weren't updated, causing:
- 0% citation coverage (pattern didn't match)
- 25+ "uncited" values flagged (all values appeared uncited)
- Score capped at 4.5/10

Changes:
- Added CITATION_PATTERN_NEW for [M##] Metric: Value format
- Kept CITATION_PATTERN_OLD for backwards compatibility
- Added CITATION_REF_PATTERN for simple [M##] counting
- Updated extract_citations() to try both patterns
- Updated find_uncited_numbers() to exclude both patterns
- Updated get_citation_count() to use ref pattern

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/utils/numeric_validator.py +52 -14
src/utils/numeric_validator.py CHANGED
@@ -9,12 +9,22 @@ import re
9
  from typing import Optional
10
 
11
 
12
- # Pattern to match citations like: $394.3B [M01], 25.3% [M02], 32.5 [M04]
13
- CITATION_PATTERN = re.compile(
 
 
 
 
 
 
 
14
  r'([\d,$\.]+[BMK%]?)\s*\[M(\d{2})\]',
15
  re.IGNORECASE
16
  )
17
 
 
 
 
18
 
19
  def normalize_value(text: str) -> Optional[float]:
20
  """
@@ -97,6 +107,10 @@ def extract_citations(text: str) -> list[dict]:
97
  """
98
  Extract all [M##] citations from text.
99
 
 
 
 
 
100
  Returns list of dicts:
101
  [
102
  {"ref_id": "M01", "cited_value": "$394.3B", "normalized": 394300000000.0},
@@ -104,16 +118,36 @@ def extract_citations(text: str) -> list[dict]:
104
  ]
105
  """
106
  citations = []
107
- for match in CITATION_PATTERN.finditer(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  cited_value = match.group(1)
109
  ref_num = match.group(2)
110
  ref_id = f"M{ref_num}"
111
- normalized = normalize_value(cited_value)
112
- citations.append({
113
- "ref_id": ref_id,
114
- "cited_value": cited_value,
115
- "normalized": normalized
116
- })
 
 
 
117
  return citations
118
 
119
 
@@ -262,11 +296,15 @@ def find_uncited_numbers(swot_text: str, metric_reference: dict) -> list[dict]:
262
  """
263
  uncited = []
264
 
265
- # Get all cited positions to exclude
266
- cited_matches = list(CITATION_PATTERN.finditer(swot_text))
267
  cited_positions = set()
268
- for match in cited_matches:
269
- # Mark the entire citation span as "cited"
 
 
 
 
 
270
  cited_positions.update(range(match.start(), match.end()))
271
 
272
  # Find all metric-like numbers
@@ -344,7 +382,7 @@ def validate_uncited_numbers(swot_text: str, metric_reference: dict) -> list[str
344
 
345
  def get_citation_count(swot_text: str) -> int:
346
  """Count the number of [M##] citations in the text."""
347
- return len(CITATION_PATTERN.findall(swot_text))
348
 
349
 
350
  def validate_minimum_citations(swot_text: str, metric_reference: dict, min_ratio: float = 0.5) -> dict:
 
9
  from typing import Optional
10
 
11
 
12
+ # Pattern to match citations in NEW format: [M01] Revenue: $394.3B - insight
13
+ # Matches: [M##] followed by metric name, colon, and value
14
+ CITATION_PATTERN_NEW = re.compile(
15
+ r'\[M(\d{2})\]\s*[^:]+:\s*(\$?[\d,]+\.?\d*[BMKTx%]?)',
16
+ re.IGNORECASE
17
+ )
18
+
19
+ # Pattern to match citations in OLD format: $394.3B [M01] (kept for backwards compatibility)
20
+ CITATION_PATTERN_OLD = re.compile(
21
  r'([\d,$\.]+[BMK%]?)\s*\[M(\d{2})\]',
22
  re.IGNORECASE
23
  )
24
 
25
+ # Combined pattern to find any [M##] reference (for citation counting)
26
+ CITATION_REF_PATTERN = re.compile(r'\[M(\d{2})\]', re.IGNORECASE)
27
+
28
 
29
  def normalize_value(text: str) -> Optional[float]:
30
  """
 
107
  """
108
  Extract all [M##] citations from text.
109
 
110
+ Supports both formats:
111
+ - NEW: [M01] Revenue: $394.3B - insight
112
+ - OLD: $394.3B [M01]
113
+
114
  Returns list of dicts:
115
  [
116
  {"ref_id": "M01", "cited_value": "$394.3B", "normalized": 394300000000.0},
 
118
  ]
119
  """
120
  citations = []
121
+ seen_refs = set()
122
+
123
+ # Try NEW format first: [M##] Metric: Value
124
+ for match in CITATION_PATTERN_NEW.finditer(text):
125
+ ref_num = match.group(1)
126
+ cited_value = match.group(2)
127
+ ref_id = f"M{ref_num}"
128
+ if ref_id not in seen_refs:
129
+ normalized = normalize_value(cited_value)
130
+ citations.append({
131
+ "ref_id": ref_id,
132
+ "cited_value": cited_value,
133
+ "normalized": normalized
134
+ })
135
+ seen_refs.add(ref_id)
136
+
137
+ # Also try OLD format: Value [M##]
138
+ for match in CITATION_PATTERN_OLD.finditer(text):
139
  cited_value = match.group(1)
140
  ref_num = match.group(2)
141
  ref_id = f"M{ref_num}"
142
+ if ref_id not in seen_refs:
143
+ normalized = normalize_value(cited_value)
144
+ citations.append({
145
+ "ref_id": ref_id,
146
+ "cited_value": cited_value,
147
+ "normalized": normalized
148
+ })
149
+ seen_refs.add(ref_id)
150
+
151
  return citations
152
 
153
 
 
296
  """
297
  uncited = []
298
 
299
+ # Get all cited positions to exclude (check both NEW and OLD patterns)
 
300
  cited_positions = set()
301
+
302
+ # NEW format: [M##] Metric: Value
303
+ for match in CITATION_PATTERN_NEW.finditer(swot_text):
304
+ cited_positions.update(range(match.start(), match.end()))
305
+
306
+ # OLD format: Value [M##]
307
+ for match in CITATION_PATTERN_OLD.finditer(swot_text):
308
  cited_positions.update(range(match.start(), match.end()))
309
 
310
  # Find all metric-like numbers
 
382
 
383
  def get_citation_count(swot_text: str) -> int:
384
  """Count the number of [M##] citations in the text."""
385
+ return len(CITATION_REF_PATTERN.findall(swot_text))
386
 
387
 
388
  def validate_minimum_citations(swot_text: str, metric_reference: dict, min_ratio: float = 0.5) -> dict: